diff --git a/cv/3d_detection/centerpoint/pytorch/.gitignore b/cv/3d_detection/centerpoint/pytorch/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6a56d2c5ca122b8905ff174ab0500c3298726d69 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/.gitignore @@ -0,0 +1,14 @@ +**__pycache__** +**build +**egg-info** +**dist +data/ +*.pyc +venv/ +*.idea/ +*.so +*.pth +*.pkl +*.zip +*.bin +.vscode/ diff --git a/cv/3d_detection/centerpoint/pytorch/LICENSE b/cv/3d_detection/centerpoint/pytorch/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f8c21e6f908a82416a6e39286d05de9136a58e89 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020-2021 Tianwei Yin and Xingyi Zhou + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/cv/3d_detection/centerpoint/pytorch/README.md b/cv/3d_detection/centerpoint/pytorch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1c2d6fe06e6d203ee4f5aa77f088fab35731275c --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/README.md @@ -0,0 +1,67 @@ +# CenterPoint + +## Model description +Three-dimensional objects are commonly represented as 3D boxes in a point-cloud. This representation mimics the well-studied image-based 2D bounding-box detection but comes with additional challenges. Objects in a 3D world do not follow any particular orientation, and box-based detectors have difficulties enumerating all orientations or fitting an axis-aligned bounding box to rotated objects. In this paper, we instead propose to represent, detect, and track 3D objects as points. Our framework, CenterPoint, first detects centers of objects using a keypoint detector and regresses to other attributes, including 3D size, 3D orientation, and velocity. In a second stage, it refines these estimates using additional point features on the object. In CenterPoint, 3D object tracking simplifies to greedy closest-point matching. The resulting detection and tracking algorithm is simple, efficient, and effective. CenterPoint achieved state-of-the-art performance on the nuScenes benchmark for both 3D detection and tracking, with 65.5 NDS and 63.8 AMOTA for a single model. On the Waymo Open Dataset, CenterPoint outperforms all previous single model methods by a large margin and ranks first among all Lidar-only submissions. + +## Installing packages +``` +# Install libGL and libboost +yum install mesa-libGL +yum install boost-devel + +# Install numba +cd numba +bash clean_numba.sh +bash build_numba.sh +bash install_numba.sh +cd .. + +# Install spconv which needs cudnn.h +cd spconv +bash clean_spconv.sh +bash build_spconv.sh +bash install_spconv.sh +cd .. 
+ +pip3 install -r requirements.txt + +bash setup.sh + +export PYTHONPATH="${PYTHONPATH}:PATH_TO_CENTERPOINT" +``` + +## Prepare Data +Download nuScenes from https://www.nuscenes.org/download +``` +mkdir -p data/nuscenes +# For nuScenes Dataset +└── NUSCENES_DATASET_ROOT + ├── samples <-- key frames + ├── sweeps <-- frames without annotation + ├── maps <-- unused + ├── v1.0-trainval <-- metadata + +python3 tools/create_data.py nuscenes_data_prep --root-path ./data/nuscenes --version="v1.0-trainval" --nsweeps=10 + +``` + + +## Training +Single GPU training +``` +python3 ./tools/train.py ./configs/nusc/voxelnet/nusc_centerpoint_voxelnet_01voxel.py +``` +Multiple GPU training +``` +python3 -m torch.distributed.launch --nproc_per_node=8 ./tools/train.py ./configs/nusc/voxelnet/nusc_centerpoint_voxelnet_01voxel.py +``` + +## Evaluate +``` +python3 ./tools/dist_test.py ./configs/nusc/voxelnet/nusc_centerpoint_voxelnet_01voxel.py --work_dir work_dirs/nusc_centerpoint_voxelnet_01voxel --checkpoint work_dirs/nusc_centerpoint_voxelnet_01voxel/latest.pth +``` +## Training Results + + +## Reference +- [CenterPoint](https://github.com/tianweiy/CenterPoint) \ No newline at end of file diff --git a/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale.py b/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale.py new file mode 100644 index 0000000000000000000000000000000000000000..c58a60469a97e6ed5449f9b9dfe48127ae588f46 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale.py @@ -0,0 +1,238 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + 
dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type="PointPillars", + pretrained=None, + reader=dict( + type="PillarFeatureNet", + num_filters=[64, 64], + num_input_features=5, + with_distance=False, + voxel_size=(0.2, 0.2, 8), + pc_range=(-51.2, -51.2, -5.0, 51.2, 51.2, 3.0), + ), + backbone=dict(type="PointPillarsScatter", ds_factor=1), + neck=dict( + type="RPN", + layer_nums=[3, 5, 5], + ds_layer_strides=[2, 2, 2], + ds_num_filters=[64, 128, 256], + us_layer_strides=[0.5, 1, 2], + us_num_filters=[128, 128, 128], + num_input_features=64, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + # type='RPNHead', + type="CenterHead", + in_channels=sum([128, 128, 128]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, # (output_channel, num_conv) + ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + voxel_size=[0.2, 0.2, 8] +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + score_threshold=0.1, + pc_range=[-51.2, -51.2], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.2, 0.2] +) + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuScenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + 
db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.9, 1.1], + global_translate_std=0.5, + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + voxel_size=[0.2, 0.2, 8], + max_points_in_voxel=20, + max_voxel_num=[30000, 60000], +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type, lidar_only=True), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), + # dict(type='PointCloudCollect', keys=['points', 'voxels', 'annotations', 'calib']), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type, lidar_only=True), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl" +val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl" +test_anno 
= None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=8, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + virtual=False, + load_interval=1, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + virtual=False, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + virtual=False, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 20 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale_virtual.py b/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale_virtual.py new file mode 100644 index 0000000000000000000000000000000000000000..2071f2ac9737a83d8c74b17b7d28530df2e82e93 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_pp_fix_bn_z_scale_virtual.py @@ -0,0 +1,239 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + 
+tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type="PointPillars", + pretrained=None, + reader=dict( + type="PillarFeatureNet", + num_filters=[64, 64], + num_input_features=16, + with_distance=False, + virtual=True, + voxel_size=(0.2, 0.2, 8), + pc_range=(-51.2, -51.2, -5.0, 51.2, 51.2, 3.0), + ), + backbone=dict(type="PointPillarsScatter", ds_factor=1), + neck=dict( + type="RPN", + layer_nums=[3, 5, 5], + ds_layer_strides=[2, 2, 2], + ds_num_filters=[64, 128, 256], + us_layer_strides=[0.5, 1, 2], + us_num_filters=[128, 128, 128], + num_input_features=64, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + # type='RPNHead', + type="CenterHead", + in_channels=sum([128, 128, 128]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, # (output_channel, num_conv) + ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + voxel_size=[0.2, 0.2, 8] +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + 
score_threshold=0.1, + pc_range=[-51.2, -51.2], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.2, 0.2] +) + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuScenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo_virtual.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.9, 1.1], + global_translate_std=0.5, + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + voxel_size=[0.2, 0.2, 8], + max_points_in_voxel=20, + max_voxel_num=[30000, 60000], +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), + # dict(type='PointCloudCollect', keys=['points', 'voxels', 'annotations', 'calib']), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", 
cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl" +val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=8, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + virtual=True, + load_interval=1, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + virtual=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + virtual=True, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 20 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale.py b/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale.py new file mode 100644 index 
0000000000000000000000000000000000000000..40c3425c1ce0b7f4e396feb50e390e8b0c03dc77 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale.py @@ -0,0 +1,225 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type="VoxelNet", + pretrained=None, + reader=dict( + type="DynamicVoxelEncoder", + pc_range=[-54, -54, -5.0, 54, 54, 3.0], + voxel_size=[0.075, 0.075, 0.2], + ), + backbone=dict( + type="SpMiddleResNetFHD", num_input_features=5, ds_factor=8 + ), + neck=dict( + type="RPN", + layer_nums=[5, 5], + ds_layer_strides=[1, 2], + ds_num_filters=[128, 256], + us_layer_strides=[1, 2], + us_num_filters=[256, 256], + num_input_features=256, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=sum([256, 256]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, + share_conv_channel=64, + dcn_head=False + ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + pc_range=[-54, -54, -5.0, 54, 54, 3.0], + voxel_size=[0.075, 0.075, 0.2] +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + 
post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + score_threshold=0.1, + pc_range=[-54, -54], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.075, 0.075] +) + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuScenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.9, 1.1], + global_translate_std=0.5, + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), + # dict(type='PointCloudCollect', keys=['points', 'voxels', 'annotations', 'calib']), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="AssignLabel", 
cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl" +val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 20 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_debug.py b/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_debug.py new file mode 100644 index 0000000000000000000000000000000000000000..dd7e0fc04150cc247b5f3a0bf9e76074ce30707c --- /dev/null +++ 
b/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_debug.py @@ -0,0 +1,226 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type="VoxelNet", + pretrained=None, + reader=dict( + type="DynamicVoxelEncoder", + pc_range=[-54, -54, -5.0, 54, 54, 3.0], + voxel_size=[0.075, 0.075, 0.2], + ), + backbone=dict( + type="SpMiddleResNetFHD", num_input_features=5, ds_factor=8 + ), + neck=dict( + type="RPN", + layer_nums=[5, 5], + ds_layer_strides=[1, 2], + ds_num_filters=[128, 256], + us_layer_strides=[1, 2], + us_num_filters=[256, 256], + num_input_features=256, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=sum([256, 256]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, + share_conv_channel=64, + dcn_head=False + ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + pc_range=[-54, -54, -5.0, 54, 54, 3.0], + voxel_size=[0.075, 0.075, 0.2] +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + nms=dict( + 
use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + score_threshold=0.1, + pc_range=[-54, -54], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.075, 0.075] +) + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuScenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.9, 1.1], + global_translate_std=0.5, + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), + # dict(type='PointCloudCollect', keys=['points', 'voxels', 'annotations', 'calib']), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = 
"data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl" +val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl" +test_anno = None + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + load_interval=1000, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 20 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_virtual.py b/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_virtual.py new file mode 100644 index 0000000000000000000000000000000000000000..39b8e4cf6a52d450470056218b7109c5a3a5037b --- /dev/null +++ 
b/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_virtual.py @@ -0,0 +1,230 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type="VoxelNet", + pretrained=None, + reader=dict( + type="DynamicVoxelEncoder", + pc_range=[-54, -54, -5.0, 54, 54, 3.0], + voxel_size=[0.075, 0.075, 0.2], + virtual=True + ), + backbone=dict( + type="SpMiddleResNetFHD", num_input_features=21, ds_factor=8 + ), + neck=dict( + type="RPN", + layer_nums=[5, 5], + ds_layer_strides=[1, 2], + ds_num_filters=[128, 256], + us_layer_strides=[1, 2], + us_num_filters=[256, 256], + num_input_features=256, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=sum([256, 256]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, + share_conv_channel=64, + dcn_head=False + ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + pc_range=[-54, -54, -5.0, 54, 54, 3.0], + voxel_size=[0.075, 0.075, 0.2] +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + nms=dict( + 
use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + score_threshold=0.1, + pc_range=[-54, -54], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.075, 0.075] +) + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuScenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo_virtual.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.9, 1.1], + global_translate_std=0.5, + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), + # dict(type='PointCloudCollect', keys=['points', 'voxels', 'annotations', 'calib']), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = 
"data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl" +val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + virtual=True, + load_interval=1, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + virtual=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + virtual=True, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 20 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_two_stage_base_with_virtual.py b/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_two_stage_base_with_virtual.py new file mode 100644 index 0000000000000000000000000000000000000000..6344e21c1683d1e66e9917a247bf61933f4657f2 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/mvp/nusc_two_stage_base_with_virtual.py @@ 
-0,0 +1,285 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type='TwoStageDetector', + first_stage_cfg=dict( + type="VoxelNet", + pretrained='work_dirs/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_scale_virtual/epoch_20.pth', + reader=dict( + type="DynamicVoxelEncoder", + pc_range=[-54, -54, -5.0, 54, 54, 3.0], + voxel_size=[0.075, 0.075, 0.2], + virtual=True + ), + backbone=dict( + type="SpMiddleResNetFHD", num_input_features=21, ds_factor=8 + ), + neck=dict( + type="RPN", + layer_nums=[5, 5], + ds_layer_strides=[1, 2], + ds_num_filters=[128, 256], + us_layer_strides=[1, 2], + us_num_filters=[256, 256], + num_input_features=256, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=sum([256, 256]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, + share_conv_channel=64, + dcn_head=False + ), + ), + second_stage_modules=[ + dict( + type="BEVFeatureExtractor", + pc_start=[-54, -54], + voxel_size=[0.075, 0.075], + out_stride=8 + ) + ], + roi_head=dict( + type="RoIHead", + input_channels=64*5+10, + add_box_param=True, + model_cfg=dict( + CLASS_AGNOSTIC=True, + SHARED_FC=[256, 256], + CLS_FC=[256, 256], + REG_FC=[256, 256], + DP_RATIO=0.3, + + TARGET_CONFIG=dict( + ROI_PER_IMAGE=128, + 
FG_RATIO=0.5, + SAMPLE_ROI_BY_EACH_CLASS=True, + CLS_SCORE_TYPE='roi_iou', + CLS_FG_THRESH=0.75, + CLS_BG_THRESH=0.25, + CLS_BG_THRESH_LO=0.1, + HARD_BG_RATIO=0.8, + REG_FG_THRESH=0.55 + ), + LOSS_CONFIG=dict( + CLS_LOSS='BinaryCrossEntropy', + REG_LOSS='L1', + LOSS_WEIGHTS={ + 'rcnn_cls_weight': 1.0, + 'rcnn_reg_weight': 1.0, + 'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + } + ) + ), + code_size=9 + ), + NMS_POST_MAXSIZE=500, + num_point=5, + freeze=True, + use_final_feature=True + +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + pc_range=[-54, -54, -5.0, 54, 54, 3.0], + voxel_size=[0.075, 0.075, 0.2] +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + score_threshold=0.1, + pc_range=[-54, -54], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.075, 0.075] +) + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuScenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo_virtual.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + 
shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.9, 1.1], + global_translate_std=0.5, + db_sampler=None, # db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-54, -54, -5.0, 54, 54, 3.0], + voxel_size=[0.075, 0.075, 0.2], + max_points_in_voxel=5, + max_voxel_num=[250000, 400000], +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_painted_True.pkl" +val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_painted_True.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=8, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + virtual=True, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + virtual=True, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + virtual=True, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) 
+lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=3.0, pct_start=0.05, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 6 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/nusc/README.md b/cv/3d_detection/centerpoint/pytorch/configs/nusc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9f57ee6ca1e6ad81b16d07c92fd5019acf756cf4 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/nusc/README.md @@ -0,0 +1,43 @@ +# MODEL ZOO + +### Common settings and notes + +- The experiments are run with PyTorch 1.1, CUDA 10.0, and CUDNN 7.5. +- The training is conducted on 4 V100 GPUs in a DGX server. +- Testing times are measured on a TITAN RTX GPU with batch size 1. + +## nuScenes 3D Detection + +**We provide training / validation configurations, logs, pretrained models, and prediction files for all models in the paper** + +### VoxelNet +| Model | Validation MAP | Validation NDS | Link | +|-----------------------|-----------------|-----------------|---------------| +| [centerpoint_voxel_1440](voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z.py) | 59.6 | 66.8 | [URL](https://mitprod-my.sharepoint.com/:f:/g/personal/tianweiy_mit_edu/EhgzjwV2EghOnHFKyRgSadoBr2kUo7yPu52N-I3dG3c5dA?e=a9MdhX) | + +Please refer to [LINK](https://github.com/tianweiy/CenterPoint/issues/249) for centerpoint detection predictions on nuScenes train/val/test sets. + +### VoxelNet (deprecated) + +These results are obtained before the sync bn bug fix + z axis augmentation.
+ +| Model | FPS | Validation MAP | Validation NDS | Link | +|-----------------------|------------------|-----------------|-----------------|---------------| +| [centerpoint_voxel_1024](voxelnet/nusc_centerpoint_voxelnet_01voxel.py) | 16 | 56.4 | 64.8 | [URL](https://mitprod-my.sharepoint.com/:f:/g/personal/tianweiy_mit_edu/EhT7DKpbj6VDin12xN42PYYB8UqkFTha-qb1F5srEE5UXQ?e=mVaJkC) | + + +### PointPillars + +| Model | FPS | Validation MAP | Validation NDS | Link | +|-----------------------|-----------------|-----------------|-----------------|---------------| +| [centerpoint_pillar](pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep.py) | 31 | 50.3 | 60.2 | [URL](https://mitprod-my.sharepoint.com/:f:/g/personal/tianweiy_mit_edu/EkN9vbDmXMJCtSn6dgBLE4wBA1PL96U6MbGhh3lME_G6wA?e=vjhpd2) | + + +## nuScenes 3D Tracking + +| Model | Tracking time | Total time | Validation AMOTA ↑ | Validation AMOTP ↓ | Link | +|-----------------------|-----------|------------------|------------------|-------------------|---------------| +| centerpoint_voxel_1024 | 1ms | 64ms | 63.7* | 0.606 | [URL](https://mitprod-my.sharepoint.com/:f:/g/personal/tianweiy_mit_edu/Epy78yQMnZlCuMBWPlUtQ3oBWqQQ2fArTs637DlBHdaHIw?e=q6a2bA) | + + +*The numbers are from the centerpoint_voxel_1024 config (before the sync bn bug fix + z axis augmentation). Current detection [models](voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z.py) should perform slightly better. 
diff --git a/cv/3d_detection/centerpoint/pytorch/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep.py b/cv/3d_detection/centerpoint/pytorch/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep.py new file mode 100644 index 0000000000000000000000000000000000000000..1a89adf3041f8bb48b102d95f7278a7dfc393acf --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep.py @@ -0,0 +1,227 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + + +# model settings +model = dict( + type="PointPillars", + pretrained=None, + reader=dict( + type="PillarFeatureNet", + num_filters=[64, 64], + num_input_features=5, + with_distance=False, + voxel_size=(0.2, 0.2, 8), + pc_range=(-51.2, -51.2, -5.0, 51.2, 51.2, 3.0), + ), + backbone=dict(type="PointPillarsScatter", ds_factor=1), + neck=dict( + type="RPN", + layer_nums=[3, 5, 5], + ds_layer_strides=[2, 2, 2], + ds_num_filters=[64, 128, 256], + us_layer_strides=[0.5, 1, 2], + us_num_filters=[128, 128, 128], + num_input_features=64, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + # type='RPNHead', + type="CenterHead", + in_channels=sum([128, 128, 128]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, # (output_channel, num_conv) + ), +) + +assigner = dict( 
+ target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + nms=dict( + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + score_threshold=0.1, + pc_range=[-51.2, -51.2], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.2, 0.2] +) + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuScenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.3925, 0.3925], + global_scale_noise=[0.95, 1.05], + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + voxel_size=[0.2, 0.2, 8], + max_points_in_voxel=20, + max_voxel_num=[30000, 60000], +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + 
dict(type="Reformat"), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl" +val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=8, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 20 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git 
a/cv/3d_detection/centerpoint/pytorch/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep_circular_nms.py b/cv/3d_detection/centerpoint/pytorch/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep_circular_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..71c22461f89bc8118b46b5a3269d4368ebc4fc5b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep_circular_nms.py @@ -0,0 +1,229 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + + +# model settings +model = dict( + type="PointPillars", + pretrained=None, + reader=dict( + type="PillarFeatureNet", + num_filters=[64, 64], + num_input_features=5, + with_distance=False, + voxel_size=(0.2, 0.2, 8), + pc_range=(-51.2, -51.2, -5.0, 51.2, 51.2, 3.0), + ), + backbone=dict(type="PointPillarsScatter", ds_factor=1), + neck=dict( + type="RPN", + layer_nums=[3, 5, 5], + ds_layer_strides=[2, 2, 2], + ds_num_filters=[64, 128, 256], + us_layer_strides=[0.5, 1, 2], + us_num_filters=[128, 128, 128], + num_input_features=64, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + # type='RPNHead', + type="CenterHead", + in_channels=sum([128, 128, 128]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, # (output_channel, num_conv) 
+ ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + min_radius=[4, 12, 10, 1, 0.85, 0.175], + circular_nms=True, + nms=dict( + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + score_threshold=0.1, + pc_range=[-51.2, -51.2], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.2, 0.2] +) + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuScenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.3925, 0.3925], + global_scale_noise=[0.95, 1.05], + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + voxel_size=[0.2, 0.2, 8], + max_points_in_voxel=20, + max_voxel_num=[30000, 60000], +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + 
dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl" +val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=8, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 20 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] 
diff --git a/cv/3d_detection/centerpoint/pytorch/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep_demo.py b/cv/3d_detection/centerpoint/pytorch/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..9d03bf3fb910c18ea0823314372f93951aaa3346 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep_demo.py @@ -0,0 +1,227 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + + +# model settings +model = dict( + type="PointPillars", + pretrained=None, + reader=dict( + type="PillarFeatureNet", + num_filters=[64, 64], + num_input_features=5, + with_distance=False, + voxel_size=(0.2, 0.2, 8), + pc_range=(-51.2, -51.2, -5.0, 51.2, 51.2, 3.0), + ), + backbone=dict(type="PointPillarsScatter", ds_factor=1), + neck=dict( + type="RPN", + layer_nums=[3, 5, 5], + ds_layer_strides=[2, 2, 2], + ds_num_filters=[64, 128, 256], + us_layer_strides=[0.5, 1, 2], + us_num_filters=[128, 128, 128], + num_input_features=64, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + # type='RPNHead', + type="CenterHead", + in_channels=sum([128, 128, 128]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, # (output_channel, num_conv) + ), +) + 
+assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + nms=dict( + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + score_threshold=0.1, + pc_range=[-51.2, -51.2], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.2, 0.2] +) + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuScenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.3925, 0.3925], + global_scale_noise=[0.95, 1.05], + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + voxel_size=[0.2, 0.2, 8], + max_points_in_voxel=20, + max_voxel_num=[30000, 60000], +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", 
cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "demo/nuScenes/demo_infos.pkl" +val_anno = "demo/nuScenes/demo_infos.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=8, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 20 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn.py 
b/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn.py new file mode 100644 index 0000000000000000000000000000000000000000..e51f4c28970ce356083a0807221d5f04993f08bc --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn.py @@ -0,0 +1,231 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type="VoxelNet", + pretrained=None, + reader=dict( + type="VoxelFeatureExtractorV3", + # type='SimpleVoxel', + num_input_features=5, + ), + backbone=dict( + type="SpMiddleResNetFHD", num_input_features=5, ds_factor=8 + ), + neck=dict( + type="RPN", + layer_nums=[5, 5], + ds_layer_strides=[1, 2], + ds_num_filters=[128, 256], + us_layer_strides=[1, 2], + us_num_filters=[256, 256], + num_input_features=256, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=sum([256, 256]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, + share_conv_channel=64, + dcn_head=True + ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = 
dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + score_threshold=0.1, + pc_range=[-54, -54], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.075, 0.075] +) + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuScenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.3925, 0.3925], + global_scale_noise=[0.95, 1.05], + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-54, -54, -5.0, 54, 54, 3.0], + voxel_size=[0.075, 0.075, 0.2], + max_points_in_voxel=10, + max_voxel_num=[120000, 160000], +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), + # dict(type='PointCloudCollect', keys=['points', 'voxels', 'annotations', 'calib']), +] +test_pipeline = [ + 
dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl" +val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=8, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 20 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn_flip.py 
b/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn_flip.py new file mode 100644 index 0000000000000000000000000000000000000000..b5e729e5e0d494b37137053768810756c17aee8b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_dcn_flip.py @@ -0,0 +1,237 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor +DOUBLE_FLIP = True + +tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type="VoxelNet", + pretrained=None, + reader=dict( + type="VoxelFeatureExtractorV3", + # type='SimpleVoxel', + num_input_features=5, + ), + backbone=dict( + type="SpMiddleResNetFHD", num_input_features=5, ds_factor=8 + ), + neck=dict( + type="RPN", + layer_nums=[5, 5], + ds_layer_strides=[1, 2], + ds_num_filters=[128, 256], + us_layer_strides=[1, 2], + us_num_filters=[256, 256], + num_input_features=256, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=sum([256, 256]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, + share_conv_channel=64, + dcn_head=True + ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = 
dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + score_threshold=0.1, + pc_range=[-54, -54], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.075, 0.075], + double_flip=DOUBLE_FLIP +) + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuScenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.3925, 0.3925], + global_scale_noise=[0.95, 1.05], + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-54, -54, -5.0, 54, 54, 3.0], + voxel_size=[0.075, 0.075, 0.2], + max_points_in_voxel=10, + max_voxel_num=[120000, 160000], + double_flip=DOUBLE_FLIP +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), + # 
dict(type='PointCloudCollect', keys=['points', 'voxels', 'annotations', 'calib']), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="DoubleFlip") if DOUBLE_FLIP else dict(type="Empty"), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat", double_flip=DOUBLE_FLIP), +] + +train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl" +val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl" +test_anno = "data/nuScenes/infos_test_10sweeps_withvelo_filter_True.pkl" + +data = dict( + samples_per_gpu=4, + workers_per_gpu=8, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + test_mode=True, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + version='v1.0-test' + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 20 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" 
+work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z.py b/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z.py new file mode 100644 index 0000000000000000000000000000000000000000..a4b8db5ae623909d7ffe9258b457e319d8d8e3c6 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z.py @@ -0,0 +1,233 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type="VoxelNet", + pretrained=None, + reader=dict( + type="VoxelFeatureExtractorV3", + # type='SimpleVoxel', + num_input_features=5, + ), + backbone=dict( + type="SpMiddleResNetFHD", num_input_features=5, ds_factor=8 + ), + neck=dict( + type="RPN", + layer_nums=[5, 5], + ds_layer_strides=[1, 2], + ds_num_filters=[128, 256], + us_layer_strides=[1, 2], + us_num_filters=[256, 256], + num_input_features=256, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=sum([256, 256]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, + 
share_conv_channel=64, + dcn_head=False + ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + score_threshold=0.1, + pc_range=[-54, -54], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.075, 0.075] +) + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuScenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.9, 1.1], + global_translate_std=0.5, + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-54, -54, -5.0, 54, 54, 3.0], + voxel_size=[0.075, 0.075, 0.2], + max_points_in_voxel=10, + max_voxel_num=[120000, 160000], +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", 
with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), + # dict(type='PointCloudCollect', keys=['points', 'voxels', 'annotations', 'calib']), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl" +val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=6, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + test_mode=True, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 20 +device_ids = range(8) +dist_params = dict(backend="nccl", 
init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_flip.py b/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_flip.py new file mode 100644 index 0000000000000000000000000000000000000000..978589272f363841d544b9b332ac3fab36ef3c83 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_flip.py @@ -0,0 +1,238 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor +DOUBLE_FLIP = True + +tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type="VoxelNet", + pretrained=None, + reader=dict( + type="VoxelFeatureExtractorV3", + # type='SimpleVoxel', + num_input_features=5, + ), + backbone=dict( + type="SpMiddleResNetFHD", num_input_features=5, ds_factor=8 + ), + neck=dict( + type="RPN", + layer_nums=[5, 5], + ds_layer_strides=[1, 2], + ds_num_filters=[128, 256], + us_layer_strides=[1, 2], + us_num_filters=[256, 256], + num_input_features=256, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=sum([256, 256]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 
2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, + share_conv_channel=64, + dcn_head=False + ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + score_threshold=0.1, + pc_range=[-54, -54], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.075, 0.075], + double_flip=DOUBLE_FLIP +) + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuScenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.9, 1.1], + global_translate_std=0.5, + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-54, -54, -5.0, 54, 54, 3.0], + voxel_size=[0.075, 0.075, 0.2], + max_points_in_voxel=10, + max_voxel_num=[120000, 160000], + double_flip=DOUBLE_FLIP +) + 
+train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), + # dict(type='PointCloudCollect', keys=['points', 'voxels', 'annotations', 'calib']), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="DoubleFlip") if DOUBLE_FLIP else dict(type="Empty"), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat", double_flip=DOUBLE_FLIP), +] + +train_anno = "data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl" +val_anno = "data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl" +test_anno = "data/nuScenes/infos_test_10sweeps_withvelo_filter_True.pkl" + +data = dict( + samples_per_gpu=4, + workers_per_gpu=6, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + test_mode=True, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + version='v1.0-test' + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + 
+checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 20 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_01voxel.py b/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_01voxel.py new file mode 100644 index 0000000000000000000000000000000000000000..a23e4208d76f0524a30a6eb2cd3e0f8998579612 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/nusc/voxelnet/nusc_centerpoint_voxelnet_01voxel.py @@ -0,0 +1,226 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=1, class_names=["car"]), + dict(num_class=2, class_names=["truck", "construction_vehicle"]), + dict(num_class=2, class_names=["bus", "trailer"]), + dict(num_class=1, class_names=["barrier"]), + dict(num_class=2, class_names=["motorcycle", "bicycle"]), + dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type="VoxelNet", + pretrained=None, + reader=dict( + type="VoxelFeatureExtractorV3", + num_input_features=5, + ), + backbone=dict( + type="SpMiddleResNetFHD", num_input_features=5, ds_factor=8 + ), + neck=dict( + type="RPN", + layer_nums=[5, 5], + ds_layer_strides=[1, 2], + ds_num_filters=[128, 256], + us_layer_strides=[1, 2], + us_num_filters=[256, 256], + num_input_features=256, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + 
type="CenterHead", + in_channels=sum([256, 256]), + tasks=tasks, + dataset='nuscenes', + weight=0.25, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, # (output_channel, num_conv) + share_conv_channel=64, + dcn_head=False + ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + nms=dict( + nms_pre_max_size=1000, + nms_post_max_size=83, + nms_iou_threshold=0.2, + ), + score_threshold=0.1, + pc_range=[-51.2, -51.2], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.1, 0.1] +) + + +# dataset settings +dataset_type = "NuScenesDataset" +nsweeps = 10 +data_root = "data/nuscenes" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/nuscenes/dbinfos_train_10sweeps_withvelo.pkl", + sample_groups=[ + dict(car=2), + dict(truck=3), + dict(construction_vehicle=7), + dict(bus=4), + dict(trailer=6), + dict(barrier=2), + dict(motorcycle=6), + dict(bicycle=6), + dict(pedestrian=2), + dict(traffic_cone=2), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.3925, 0.3925], + global_scale_noise=[0.95, 1.05], + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + voxel_size=[0.1, 0.1, 0.2], + 
max_points_in_voxel=10, + max_voxel_num=[90000, 120000], +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/nuscenes/infos_train_10sweeps_withvelo_filter_True.pkl" +val_anno = "data/nuscenes/infos_val_10sweeps_withvelo_filter_True.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=8, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) + +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + ], +) +# yapf:enable +# runtime settings +total_epochs = 20 +device_ids = range(8) 
+dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/waymo/README.md b/cv/3d_detection/centerpoint/pytorch/configs/waymo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4e37d517bfd35f233127798f77fecb78664e21ab --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/waymo/README.md @@ -0,0 +1,68 @@ +# MODEL ZOO + +### Common settings and notes + +- The experiments are run with PyTorch 1.1, CUDA 10.0, and CUDNN 7.5. +- The training is conducted on 4 V100 GPUs in a DGX server. +- Testing times are measured on a TITAN RTX GPU with batch size 1. + +## Waymo 3D Detection + +We provide training / validation configurations, pretrained models, and prediction files for all models in the paper. To access these pretrained models, please send us an [email](mailto:yintianwei@utexas.edu) with your name, institute, a screenshot of the Waymo dataset registration confirmation mail, and your intended usage. Please send a second email if we don't get back to you in two days. Please note that the Waymo Open Dataset is under a strict non-commercial license, so we are not allowed to share the model with you if it will be used for any profit-oriented activities. + +### One-stage VoxelNet +| Model | Veh_L2 | Ped_L2 | Cyc_L2 | MAPH | FPS | +|---------|--------|--------|---------|--------|------------| +| [VoxelNet](voxelnet/waymo_centerpoint_voxelnet_3x.py) | 66.2 | 62.6 | 67.6 | 65.5 | 13 | +In the paper, our models only detect Vehicle and Pedestrian. Here, we provide a three-class config that also enables cyclist detection (and performs similarly). We encourage the community to also report three-class performance in the future.
+ +### Ablations for training schedule + +CenterPoint is fast to train and converges in as little as 3~6 epochs. We tried a few training schedules for CenterPoint-Voxel and list their performance below. + +| Schedule | Veh_L2 | Ped_L2 | Cyc_L2 | MAPH | Training Time | +|------------|--------|--------|---------|--------|----------------| +| [36 epoch](voxelnet/waymo_centerpoint_voxelnet_3x.py) | 66.2 | 62.6 | 67.6 | 65.5 | 84hr | +| [12 epoch](voxelnet/waymo_centerpoint_voxelnet_1x.py) | 65.6 | 61.3 | 67.1 | 64.7 | 28hr | +| [6 epoch](voxelnet/waymo_centerpoint_voxelnet_6epoch.py) | 65.5 | 59.5 | 66.4 | 63.4 | 14hr | +| [3 epoch](voxelnet/waymo_centerpoint_voxelnet_3epoch.py) | 61.5 | 56.2 | 64.5 | 60.7 | 7hr | + +### Two-stage VoxelNet + +By default, we finetune a pretrained [one stage model](voxelnet/waymo_centerpoint_voxelnet_3x.py) for 6 epochs. To save GPU memory, we also freeze the backbone weight. + +| Model | Split | Veh_L2 | Ped_L2 | Cyc_L2 | MAPH | FPS | +|------------|----|----|--------|---------|--------|----------------| +| [VoxelNet](voxelnet/two_stage/waymo_centerpoint_voxelnet_two_stage_bev_5point_ft_6epoch_freeze.py) | Val | 67.9 | 65.6 | 68.6 | 67.4 | 13 | +| [VoxelNet](voxelnet/two_stage/waymo_centerpoint_voxelnet_two_stage_bev_5point_ft_6epoch_freeze.py) | Test| 71.9 | 67.0 | 68.2| 69.0 | 13 | + + +### Two frame model + +To provide richer input information and enable a more reasonable velocity estimation, we transform and merge the Lidar points of the previous frame into the current frame. This two frame model significantly boosts the detection performance.
+ +| Model | Split | Veh_L2 | Ped_L2 | Cyc_L2 | MAPH | FPS | +|------------|----|----|--------|---------|--------|----------------| +| [One-stage](voxelnet/waymo_centerpoint_voxelnet_two_sweeps_3x_with_velo.py) | Val | 67.3 | 67.5 | 69.9 | 68.2 | 11 | +| [Two-stage](voxelnet/two_stage/waymo_centerpoint_voxelnet_two_sweep_two_stage_bev_5point_ft_6epoch_freeze_with_vel.py) | Val | 69.7 | 70.3 | 70.9 | 70.3 | 11 | +| [Two-stage](voxelnet/two_stage/waymo_centerpoint_voxelnet_two_sweep_two_stage_bev_5point_ft_6epoch_freeze_with_vel.py) | Test | 73.0 | 71.5 | 71.3 | 71.9 | 11 | + + +### PointPillars + +| Model | Veh_L2 | Ped_L2 | Cyc_L2 | MAPH | FPS | +|---------|--------|--------|---------|--------|------------| +| [centerpoint_pillar](pp/waymo_centerpoint_pp_two_pfn_stride1_3x.py) | 65.5 | 55.1 | 60.2 | 60.3 | 19 | +| [centerpoint_pillar_two_stage](pp/two_stage/waymo_centerpoint_pp_two_pfn_stride1_two_stage_bev_6epoch.py) | 66.7 | 55.9 | 61.7 | 61.4 | 16 | + +For PointPillars, we notice a 1.5 mAPH drop when converting from a two-class model to a three-class model. You can refer to [ONE_STAGE](pp/waymo_centerpoint_pp_two_cls_two_pfn_stride1_3x.py) and [TWO_STAGE](pp/two_stage/waymo_centerpoint_pp_two_cls_two_pfn_stride1_two_stage_bev_6epoch.py) configs to reproduce the two-class result. + +## Waymo 3D Tracking + +For 3D Tracking, we apply our center-based tracking on top of our two frame model's detection result.
+ +| | Split | Veh_L2 | Ped_L2 | Cyc_L2 | MOTA | FPS | +|---------|---------|--------|--------|---------|--------|-------| +| [centerpoint_voxel_two_sweep](../../tracking_scripts/centerpoint_voxel_two_sweep_val.sh)| Val | 55.0 | 55.0 | 57.4 | 55.8 | 11 | +| [centerpoint_voxel_two_sweep](../../tracking_scripts/centerpoint_voxel_two_sweep_test.sh)| Test | 59.4 | 56.6 | 60.0 | 58.7 | 11 | diff --git a/cv/3d_detection/centerpoint/pytorch/configs/waymo/pp/two_stage/waymo_centerpoint_pp_two_cls_two_pfn_stride1_two_stage_bev_6epoch.py b/cv/3d_detection/centerpoint/pytorch/configs/waymo/pp/two_stage/waymo_centerpoint_pp_two_cls_two_pfn_stride1_two_stage_bev_6epoch.py new file mode 100644 index 0000000000000000000000000000000000000000..1d772a656c7a0319f1734266a9c052be4ece25da --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/waymo/pp/two_stage/waymo_centerpoint_pp_two_cls_two_pfn_stride1_two_stage_bev_6epoch.py @@ -0,0 +1,238 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=2, class_names=['VEHICLE', 'PEDESTRIAN']), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type='TwoStageDetector', + first_stage_cfg=dict( + type="PointPillars", + pretrained='work_dirs/waymo_centerpoint_pp_two_cls_two_pfn_stride1_3x/epoch_36.pth', + reader=dict( + type="PillarFeatureNet", + num_filters=[64, 64], + num_input_features=5, + with_distance=False, + voxel_size=(0.32, 0.32, 6.0), + pc_range=(-74.88, -74.88, -2, 74.88, 74.88, 4.0), + ), + backbone=dict(type="PointPillarsScatter", ds_factor=1), + neck=dict( + type="RPN", + layer_nums=[3, 5, 5], + ds_layer_strides=[1, 2, 2], + ds_num_filters=[64, 128, 256], + us_layer_strides=[1, 2, 4], + us_num_filters=[128, 128, 128], + num_input_features=64, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + 
type="CenterHead", + in_channels=128*3, + tasks=tasks, + dataset='waymo', + weight=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2)}, # (output_channel, num_conv) + ), + ), + second_stage_modules=[ + dict( + type="BEVFeatureExtractor", + pc_start=[-74.88, -74.88], + voxel_size=[0.32, 0.32], + out_stride=1 + ) + ], + roi_head=dict( + type="RoIHead", + input_channels=128*3*5, + model_cfg=dict( + CLASS_AGNOSTIC=True, + SHARED_FC=[256, 256], + CLS_FC=[256, 256], + REG_FC=[256, 256], + DP_RATIO=0.3, + + TARGET_CONFIG=dict( + ROI_PER_IMAGE=128, + FG_RATIO=0.5, + SAMPLE_ROI_BY_EACH_CLASS=True, + CLS_SCORE_TYPE='roi_iou', + CLS_FG_THRESH=0.75, + CLS_BG_THRESH=0.25, + CLS_BG_THRESH_LO=0.1, + HARD_BG_RATIO=0.8, + REG_FG_THRESH=0.55 + ), + LOSS_CONFIG=dict( + CLS_LOSS='BinaryCrossEntropy', + REG_LOSS='L1', + LOSS_WEIGHTS={ + 'rcnn_cls_weight': 1.0, + 'rcnn_reg_weight': 1.0, + 'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + } + ) + ), + code_size=7 + ), + NMS_POST_MAXSIZE=500, + num_point=5, + freeze=True +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-80, -80, -10.0, 80, 80, 10.0], + max_per_img=4096, + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=4096, + nms_post_max_size=500, + nms_iou_threshold=0.7, + ), + score_threshold=0.1, + pc_range=[-74.88, -74.88], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.32, 0.32] +) + + +# dataset settings +dataset_type = "WaymoDataset" +nsweeps = 1 +data_root = "data/Waymo" + + +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.95, 1.05], + db_sampler=None, + class_names=class_names, +) + 
+val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-74.88, -74.88, -2, 74.88, 74.88, 4.0], + voxel_size=[0.32, 0.32, 6.0], + max_points_in_voxel=20, + max_voxel_num=[32000, 60000], # we only use non-empty voxels. this will be much smaller than max_voxel_num +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/Waymo/infos_train_01sweeps_filter_zero_gt.pkl" +val_anno = "data/Waymo/infos_val_01sweeps_filter_zero_gt.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) + +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.003, moms=[0.95, 0.85], div_factor=10.0, 
pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 6 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/waymo/pp/two_stage/waymo_centerpoint_pp_two_pfn_stride1_two_stage_bev_6epoch.py b/cv/3d_detection/centerpoint/pytorch/configs/waymo/pp/two_stage/waymo_centerpoint_pp_two_pfn_stride1_two_stage_bev_6epoch.py new file mode 100644 index 0000000000000000000000000000000000000000..604898d1da79bc7638fe76f855df9e87b4cdfb7b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/waymo/pp/two_stage/waymo_centerpoint_pp_two_pfn_stride1_two_stage_bev_6epoch.py @@ -0,0 +1,260 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=3, class_names=['VEHICLE', 'PEDESTRIAN', 'CYCLIST']), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type='TwoStageDetector', + first_stage_cfg=dict( + type="PointPillars", + pretrained='work_dirs/waymo_centerpoint_pp_two_pfn_stride1_3x/epoch_36.pth', + reader=dict( + type="PillarFeatureNet", + num_filters=[64, 64], + num_input_features=5, + with_distance=False, + voxel_size=(0.32, 0.32, 6.0), + pc_range=(-74.88, -74.88, -2, 74.88, 74.88, 4.0), + ), + backbone=dict(type="PointPillarsScatter", ds_factor=1), + neck=dict( + type="RPN", + layer_nums=[3, 5, 5], + ds_layer_strides=[1, 2, 2], + ds_num_filters=[64, 128, 256], + us_layer_strides=[1, 2, 4], + us_num_filters=[128, 128, 128], + 
num_input_features=64, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=128*3, + tasks=tasks, + dataset='waymo', + weight=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2)}, # (output_channel, num_conv) + ), + ), + second_stage_modules=[ + dict( + type="BEVFeatureExtractor", + pc_start=[-74.88, -74.88], + voxel_size=[0.32, 0.32], + out_stride=1 + ) + ], + roi_head=dict( + type="RoIHead", + input_channels=128*3*5, + model_cfg=dict( + CLASS_AGNOSTIC=True, + SHARED_FC=[256, 256], + CLS_FC=[256, 256], + REG_FC=[256, 256], + DP_RATIO=0.3, + + TARGET_CONFIG=dict( + ROI_PER_IMAGE=128, + FG_RATIO=0.5, + SAMPLE_ROI_BY_EACH_CLASS=True, + CLS_SCORE_TYPE='roi_iou', + CLS_FG_THRESH=0.75, + CLS_BG_THRESH=0.25, + CLS_BG_THRESH_LO=0.1, + HARD_BG_RATIO=0.8, + REG_FG_THRESH=0.55 + ), + LOSS_CONFIG=dict( + CLS_LOSS='BinaryCrossEntropy', + REG_LOSS='L1', + LOSS_WEIGHTS={ + 'rcnn_cls_weight': 1.0, + 'rcnn_reg_weight': 1.0, + 'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + } + ) + ), + code_size=7 + ), + NMS_POST_MAXSIZE=500, + num_point=5, + freeze=True +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-80, -80, -10.0, 80, 80, 10.0], + max_per_img=4096, + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=4096, + nms_post_max_size=500, + nms_iou_threshold=0.7, + ), + score_threshold=0.1, + pc_range=[-74.88, -74.88], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.32, 0.32] +) + + +# dataset settings +dataset_type = "WaymoDataset" +nsweeps = 1 +data_root = "data/Waymo" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/Waymo/dbinfos_train_1sweeps_withvelo.pkl", + 
sample_groups=[ + dict(VEHICLE=15), + dict(PEDESTRIAN=10), + dict(CYCLIST=10), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + VEHICLE=5, + PEDESTRIAN=5, + CYCLIST=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) + +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.95, 1.05], + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-74.88, -74.88, -2, 74.88, 74.88, 4.0], + voxel_size=[0.32, 0.32, 6.0], + max_points_in_voxel=20, + max_voxel_num=[32000, 60000], # we only use non-empty voxels. this will be much smaller than max_voxel_num +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/Waymo/infos_train_01sweeps_filter_zero_gt.pkl" +val_anno = "data/Waymo/infos_val_01sweeps_filter_zero_gt.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + 
class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) + +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.003, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 6 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/waymo/pp/waymo_centerpoint_pp_two_cls_two_pfn_stride1_3x.py b/cv/3d_detection/centerpoint/pytorch/configs/waymo/pp/waymo_centerpoint_pp_two_cls_two_pfn_stride1_3x.py new file mode 100644 index 0000000000000000000000000000000000000000..1fbd0e3af935798e411d75bcebfbf5c7d7e55f71 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/waymo/pp/waymo_centerpoint_pp_two_cls_two_pfn_stride1_3x.py @@ -0,0 +1,187 @@ +import itertools +import logging +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=2, class_names=['VEHICLE', 'PEDESTRIAN']), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type="PointPillars", + pretrained=None, + reader=dict( + type="PillarFeatureNet", + num_filters=[64, 64], + num_input_features=5, + with_distance=False, + 
voxel_size=(0.32, 0.32, 6.0), + pc_range=(-74.88, -74.88, -2, 74.88, 74.88, 4.0), + ), + backbone=dict(type="PointPillarsScatter", ds_factor=1), + neck=dict( + type="RPN", + layer_nums=[3, 5, 5], + ds_layer_strides=[1, 2, 2], + ds_num_filters=[64, 128, 256], + us_layer_strides=[1, 2, 4], + us_num_filters=[128, 128, 128], + num_input_features=64, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=128*3, + tasks=tasks, + dataset='waymo', + weight=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2)}, # (output_channel, num_conv) + ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-80, -80, -10.0, 80, 80, 10.0], + nms=dict( + nms_pre_max_size=4096, + nms_post_max_size=500, + nms_iou_threshold=0.7, + ), + score_threshold=0.1, + pc_range=[-74.88, -74.88], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.32, 0.32] +) + + +# dataset settings +dataset_type = "WaymoDataset" +nsweeps = 1 +data_root = "data/Waymo" + + +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.95, 1.05], + db_sampler=None, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-74.88, -74.88, -2, 74.88, 74.88, 4.0], + voxel_size=[0.32, 0.32, 6.0], + max_points_in_voxel=20, + max_voxel_num=[32000, 60000], # we only use non-empty voxels. 
this will be much smaller than max_voxel_num +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/Waymo/infos_train_01sweeps_filter_zero_gt.pkl" +val_anno = "data/Waymo/infos_val_01sweeps_filter_zero_gt.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=8, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) + +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.003, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 36 +device_ids = 
range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/waymo/pp/waymo_centerpoint_pp_two_pfn_stride1_3x.py b/cv/3d_detection/centerpoint/pytorch/configs/waymo/pp/waymo_centerpoint_pp_two_pfn_stride1_3x.py new file mode 100644 index 0000000000000000000000000000000000000000..539d5133487d6245a32c97ca5a212f727156d18a --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/waymo/pp/waymo_centerpoint_pp_two_pfn_stride1_3x.py @@ -0,0 +1,209 @@ +import itertools +import logging +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=3, class_names=['VEHICLE', 'PEDESTRIAN', 'CYCLIST']), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type="PointPillars", + pretrained=None, + reader=dict( + type="PillarFeatureNet", + num_filters=[64, 64], + num_input_features=5, + with_distance=False, + voxel_size=(0.32, 0.32, 6.0), + pc_range=(-74.88, -74.88, -2, 74.88, 74.88, 4.0), + ), + backbone=dict(type="PointPillarsScatter", ds_factor=1), + neck=dict( + type="RPN", + layer_nums=[3, 5, 5], + ds_layer_strides=[1, 2, 2], + ds_num_filters=[64, 128, 256], + us_layer_strides=[1, 2, 4], + us_num_filters=[128, 128, 128], + num_input_features=64, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=128*3, + tasks=tasks, + dataset='waymo', + weight=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2)}, # (output_channel, num_conv) + ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + 
gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + +test_cfg = dict( + post_center_limit_range=[-80, -80, -10.0, 80, 80, 10.0], + nms=dict( + nms_pre_max_size=4096, + nms_post_max_size=500, + nms_iou_threshold=0.7, + ), + score_threshold=0.1, + pc_range=[-74.88, -74.88], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.32, 0.32] +) + + +# dataset settings +dataset_type = "WaymoDataset" +nsweeps = 1 +data_root = "data/Waymo" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/Waymo/dbinfos_train_1sweeps_withvelo.pkl", + sample_groups=[ + dict(VEHICLE=15), + dict(PEDESTRIAN=10), + dict(CYCLIST=10), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + VEHICLE=5, + PEDESTRIAN=5, + CYCLIST=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) + +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.95, 1.05], + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-74.88, -74.88, -2, 74.88, 74.88, 4.0], + voxel_size=[0.32, 0.32, 6.0], + max_points_in_voxel=20, + max_voxel_num=[32000, 60000], # we only use non-empty voxels. 
this will be much smaller than max_voxel_num +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/Waymo/infos_train_01sweeps_filter_zero_gt.pkl" +val_anno = "data/Waymo/infos_val_01sweeps_filter_zero_gt.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=8, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) + +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.003, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 36 +device_ids = 
range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/waymo/voxelnet/two_stage/waymo_centerpoint_voxelnet_two_stage_bev_5point_ft_6epoch_freeze.py b/cv/3d_detection/centerpoint/pytorch/configs/waymo/voxelnet/two_stage/waymo_centerpoint_voxelnet_two_stage_bev_5point_ft_6epoch_freeze.py new file mode 100644 index 0000000000000000000000000000000000000000..744cc271c5f4ba2b7f7dc59f0c8b9da643a88301 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/waymo/voxelnet/two_stage/waymo_centerpoint_voxelnet_two_stage_bev_5point_ft_6epoch_freeze.py @@ -0,0 +1,260 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=3, class_names=['VEHICLE', 'PEDESTRIAN', 'CYCLIST']), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type='TwoStageDetector', + first_stage_cfg=dict( + type="VoxelNet", + pretrained='work_dirs/waymo_centerpoint_voxelnet_3x/epoch_36.pth', + reader=dict( + type="VoxelFeatureExtractorV3", + num_input_features=5 + ), + backbone=dict( + type="SpMiddleResNetFHD", num_input_features=5, ds_factor=8 + ), + neck=dict( + type="RPN", + layer_nums=[5, 5], + ds_layer_strides=[1, 2], + ds_num_filters=[128, 256], + us_layer_strides=[1, 2], + us_num_filters=[256, 256], + num_input_features=256, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=sum([256, 256]), + tasks=tasks, + dataset='waymo', + weight=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2)}, # (output_channel, num_conv) + ), + ), + 
second_stage_modules=[ + dict( + type="BEVFeatureExtractor", + pc_start=[-75.2, -75.2], + voxel_size=[0.1, 0.1], + out_stride=8 + ) + ], + roi_head=dict( + type="RoIHead", + input_channels=512*5, + model_cfg=dict( + CLASS_AGNOSTIC=True, + SHARED_FC=[256, 256], + CLS_FC=[256, 256], + REG_FC=[256, 256], + DP_RATIO=0.3, + + TARGET_CONFIG=dict( + ROI_PER_IMAGE=128, + FG_RATIO=0.5, + SAMPLE_ROI_BY_EACH_CLASS=True, + CLS_SCORE_TYPE='roi_iou', + CLS_FG_THRESH=0.75, + CLS_BG_THRESH=0.25, + CLS_BG_THRESH_LO=0.1, + HARD_BG_RATIO=0.8, + REG_FG_THRESH=0.55 + ), + LOSS_CONFIG=dict( + CLS_LOSS='BinaryCrossEntropy', + REG_LOSS='L1', + LOSS_WEIGHTS={ + 'rcnn_cls_weight': 1.0, + 'rcnn_reg_weight': 1.0, + 'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + } + ) + ), + code_size=7 + ), + NMS_POST_MAXSIZE=500, + num_point=5, + freeze=True +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + + +test_cfg = dict( + post_center_limit_range=[-80, -80, -10.0, 80, 80, 10.0], + max_per_img=4096, + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=4096, + nms_post_max_size=500, + nms_iou_threshold=0.7, + ), + score_threshold=0.1, + pc_range=[-75.2, -75.2], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.1, 0.1] +) + + +# dataset settings +dataset_type = "WaymoDataset" +nsweeps = 1 +data_root = "data/Waymo" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/Waymo/dbinfos_train_1sweeps_withvelo.pkl", + sample_groups=[ + dict(VEHICLE=15), + dict(PEDESTRIAN=10), + dict(CYCLIST=10), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + VEHICLE=5, + PEDESTRIAN=5, + CYCLIST=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) + +train_preprocessor = dict( + mode="train", + 
shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.95, 1.05], + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-75.2, -75.2, -2, 75.2, 75.2, 4], + voxel_size=[0.1, 0.1, 0.15], + max_points_in_voxel=5, + max_voxel_num=[150000, 200000] +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/Waymo/infos_train_01sweeps_filter_zero_gt.pkl" +val_anno = "data/Waymo/infos_val_01sweeps_filter_zero_gt.pkl" +test_anno = "data/Waymo/infos_test_01sweeps_filter_zero_gt.pkl" + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + test_mode=True, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) + +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, 
wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.003, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 6 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/waymo/voxelnet/two_stage/waymo_centerpoint_voxelnet_two_sweep_two_stage_bev_5point_ft_6epoch_freeze_with_vel.py b/cv/3d_detection/centerpoint/pytorch/configs/waymo/voxelnet/two_stage/waymo_centerpoint_voxelnet_two_sweep_two_stage_bev_5point_ft_6epoch_freeze_with_vel.py new file mode 100644 index 0000000000000000000000000000000000000000..eb164155710c5296b05ea2d48283583ee2fc1330 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/configs/waymo/voxelnet/two_stage/waymo_centerpoint_voxelnet_two_sweep_two_stage_bev_5point_ft_6epoch_freeze_with_vel.py @@ -0,0 +1,260 @@ +import itertools +import logging + +from det3d.utils.config_tool import get_downsample_factor + +tasks = [ + dict(num_class=3, class_names=['VEHICLE', 'PEDESTRIAN', 'CYCLIST']), +] + +class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) + +# training and testing settings +target_assigner = dict( + tasks=tasks, +) + +# model settings +model = dict( + type='TwoStageDetector', + first_stage_cfg=dict( + type="VoxelNet", + pretrained='work_dirs/waymo_centerpoint_voxelnet_two_sweeps_3x_with_velo/epoch_36.pth', + reader=dict( + type="VoxelFeatureExtractorV3", + num_input_features=6 + ), + backbone=dict( + type="SpMiddleResNetFHD", num_input_features=6, ds_factor=8 + ), + neck=dict( + type="RPN", + layer_nums=[5, 5], + 
ds_layer_strides=[1, 2], + ds_num_filters=[128, 256], + us_layer_strides=[1, 2], + us_num_filters=[256, 256], + num_input_features=256, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=sum([256, 256]), + tasks=tasks, + dataset='waymo', + weight=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel':(2,2)}, # (output_channel, num_conv) + ), + ), + second_stage_modules=[ + dict( + type="BEVFeatureExtractor", + pc_start=[-75.2, -75.2], + voxel_size=[0.1, 0.1], + out_stride=8 + ) + ], + roi_head=dict( + type="RoIHead", + input_channels=512*5, + model_cfg=dict( + CLASS_AGNOSTIC=True, + SHARED_FC=[256, 256], + CLS_FC=[256, 256], + REG_FC=[256, 256], + DP_RATIO=0.3, + + TARGET_CONFIG=dict( + ROI_PER_IMAGE=128, + FG_RATIO=0.5, + SAMPLE_ROI_BY_EACH_CLASS=True, + CLS_SCORE_TYPE='roi_iou', + CLS_FG_THRESH=0.75, + CLS_BG_THRESH=0.25, + CLS_BG_THRESH_LO=0.1, + HARD_BG_RATIO=0.8, + REG_FG_THRESH=0.55 + ), + LOSS_CONFIG=dict( + CLS_LOSS='BinaryCrossEntropy', + REG_LOSS='L1', + LOSS_WEIGHTS={ + 'rcnn_cls_weight': 1.0, + 'rcnn_reg_weight': 1.0, + 'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + } + ) + ), + code_size=9 + ), + NMS_POST_MAXSIZE=500, + num_point=5, + freeze=True +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + + +test_cfg = dict( + post_center_limit_range=[-80, -80, -10.0, 80, 80, 10.0], + max_per_img=4096, + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=4096, + nms_post_max_size=500, + nms_iou_threshold=0.7, + ), + score_threshold=0.1, + pc_range=[-75.2, -75.2], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.1, 0.1] +) + + +# dataset settings +dataset_type = "WaymoDataset" +nsweeps = 2 
+data_root = "data/Waymo" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/Waymo/dbinfos_train_2sweeps_withvelo.pkl", + sample_groups=[ + dict(VEHICLE=15), + dict(PEDESTRIAN=10), + dict(CYCLIST=10), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + VEHICLE=5, + PEDESTRIAN=5, + CYCLIST=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) + +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.95, 1.05], + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-75.2, -75.2, -2, 75.2, 75.2, 4], + voxel_size=[0.1, 0.1, 0.15], + max_points_in_voxel=5, + max_voxel_num=[180000, 400000], +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/Waymo/infos_train_02sweeps_filter_zero_gt.pkl" +val_anno = "data/Waymo/infos_val_02sweeps_filter_zero_gt.pkl" +test_anno = "data/Waymo/infos_test_02sweeps_filter_zero_gt.pkl" + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, 
# CenterPoint with the VoxelNet backbone on Waymo: one sweep, 12 epochs.
import itertools
import logging

from det3d.utils.config_tool import get_downsample_factor

# One detection task that predicts all three classes.
tasks = [
    {"num_class": 3, "class_names": ["VEHICLE", "PEDESTRIAN", "CYCLIST"]},
]

# Class names flattened across every task, in task order.
class_names = list(itertools.chain.from_iterable(t["class_names"] for t in tasks))

# training and testing settings
target_assigner = {"tasks": tasks}

# model settings
model = {
    "type": "VoxelNet",
    "pretrained": None,
    "reader": {"type": "VoxelFeatureExtractorV3", "num_input_features": 5},
    "backbone": {"type": "SpMiddleResNetFHD", "num_input_features": 5, "ds_factor": 8},
    "neck": {
        "type": "RPN",
        "layer_nums": [5, 5],
        "ds_layer_strides": [1, 2],
        "ds_num_filters": [128, 256],
        "us_layer_strides": [1, 2],
        "us_num_filters": [256, 256],
        "num_input_features": 256,
        "logger": logging.getLogger("RPN"),
    },
    "bbox_head": {
        "type": "CenterHead",
        "in_channels": sum([256, 256]),
        "tasks": tasks,
        "dataset": "waymo",
        "weight": 2,
        "code_weights": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        # head name -> (output_channel, num_conv)
        "common_heads": {"reg": (2, 2), "height": (1, 2), "dim": (3, 2), "rot": (2, 2)},
    },
}

assigner = {
    "target_assigner": target_assigner,
    "out_size_factor": get_downsample_factor(model),
    "dense_reg": 1,
    "gaussian_overlap": 0.1,
    "max_objs": 500,
    "min_radius": 2,
}

train_cfg = {"assigner": assigner}

test_cfg = {
    "post_center_limit_range": [-80, -80, -10.0, 80, 80, 10.0],
    "nms": {
        "use_rotate_nms": True,
        "use_multi_class_nms": False,
        "nms_pre_max_size": 4096,
        "nms_post_max_size": 500,
        "nms_iou_threshold": 0.7,
    },
    "score_threshold": 0.1,
    "pc_range": [-75.2, -75.2],
    "out_size_factor": get_downsample_factor(model),
    "voxel_size": [0.1, 0.1],
}

# dataset settings
dataset_type = "WaymoDataset"
nsweeps = 1
data_root = "data/Waymo"

# Ground-truth database sampling; switched off here via enable=False.
db_sampler = {
    "type": "GT-AUG",
    "enable": False,
    "db_info_path": "data/Waymo/dbinfos_train_1sweeps_withvelo.pkl",
    "sample_groups": [
        {"VEHICLE": 15},
        {"PEDESTRIAN": 10},
        {"CYCLIST": 10},
    ],
    "db_prep_steps": [
        {"filter_by_min_num_points": {"VEHICLE": 5, "PEDESTRIAN": 5, "CYCLIST": 5}},
        {"filter_by_difficulty": [-1]},
    ],
    "global_random_rotation_range_per_object": [0, 0],
    "rate": 1.0,
}

train_preprocessor = {
    "mode": "train",
    "shuffle_points": True,
    "global_rot_noise": [-0.78539816, 0.78539816],
    "global_scale_noise": [0.95, 1.05],
    "db_sampler": db_sampler,
    "class_names": class_names,
}

val_preprocessor = {"mode": "val", "shuffle_points": False}

voxel_generator = {
    "range": [-75.2, -75.2, -2, 75.2, 75.2, 4],
    "voxel_size": [0.1, 0.1, 0.15],
    "max_points_in_voxel": 5,
    "max_voxel_num": 150000,
}

train_pipeline = [
    {"type": "LoadPointCloudFromFile", "dataset": dataset_type},
    {"type": "LoadPointCloudAnnotations", "with_bbox": True},
    {"type": "Preprocess", "cfg": train_preprocessor},
    {"type": "Voxelization", "cfg": voxel_generator},
    {"type": "AssignLabel", "cfg": train_cfg["assigner"]},
    {"type": "Reformat"},
]
test_pipeline = [
    {"type": "LoadPointCloudFromFile", "dataset": dataset_type},
    {"type": "LoadPointCloudAnnotations", "with_bbox": True},
    {"type": "Preprocess", "cfg": val_preprocessor},
    {"type": "Voxelization", "cfg": voxel_generator},
    {"type": "AssignLabel", "cfg": train_cfg["assigner"]},
    {"type": "Reformat"},
]

train_anno = "data/Waymo/infos_train_01sweeps_filter_zero_gt.pkl"
val_anno = "data/Waymo/infos_val_01sweeps_filter_zero_gt.pkl"
test_anno = None

data = {
    "samples_per_gpu": 4,
    "workers_per_gpu": 4,
    "train": {
        "type": dataset_type,
        "root_path": data_root,
        "info_path": train_anno,
        "ann_file": train_anno,
        "nsweeps": nsweeps,
        "class_names": class_names,
        "pipeline": train_pipeline,
    },
    "val": {
        "type": dataset_type,
        "root_path": data_root,
        "info_path": val_anno,
        "test_mode": True,
        "ann_file": val_anno,
        "nsweeps": nsweeps,
        "class_names": class_names,
        "pipeline": test_pipeline,
    },
    # test_anno is None in this config, so the test split has no info file.
    "test": {
        "type": dataset_type,
        "root_path": data_root,
        "info_path": test_anno,
        "ann_file": test_anno,
        "nsweeps": nsweeps,
        "class_names": class_names,
        "pipeline": test_pipeline,
    },
}

optimizer_config = {"grad_clip": {"max_norm": 35, "norm_type": 2}}

# optimizer
optimizer = {
    "type": "adam",
    "amsgrad": 0.0,
    "wd": 0.01,
    "fixed_wd": True,
    "moving_average": False,
}
lr_config = {
    "type": "one_cycle",
    "lr_max": 0.003,
    "moms": [0.95, 0.85],
    "div_factor": 10.0,
    "pct_start": 0.4,
}

checkpoint_config = {"interval": 1}
# yapf:disable
log_config = {
    "interval": 5,
    "hooks": [
        {"type": "TextLoggerHook"},
    ],
}
# yapf:enable
# runtime settings
total_epochs = 12
device_ids = range(8)
dist_params = {"backend": "nccl", "init_method": "env://"}
log_level = "INFO"
# Work directory is named after this config file (basename without ".py").
work_dir = "./work_dirs/{}/".format(__file__[__file__.rfind("/") + 1:-3])
load_from = None
resume_from = None
workflow = [("train", 1)]
out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + + +test_cfg = dict( + post_center_limit_range=[-80, -80, -10.0, 80, 80, 10.0], + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=4096, + nms_post_max_size=500, + nms_iou_threshold=0.7, + ), + score_threshold=0.1, + pc_range=[-75.2, -75.2], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.1, 0.1], +) + + +# dataset settings +dataset_type = "WaymoDataset" +nsweeps = 1 +data_root = "data/Waymo" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/Waymo/dbinfos_train_1sweeps_withvelo.pkl", + sample_groups=[ + dict(VEHICLE=15), + dict(PEDESTRIAN=10), + dict(CYCLIST=10), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + VEHICLE=5, + PEDESTRIAN=5, + CYCLIST=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) + +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.95, 1.05], + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-75.2, -75.2, -2, 75.2, 75.2, 4], + voxel_size=[0.1, 0.1, 0.15], + max_points_in_voxel=5, + max_voxel_num=150000, +) + +train_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", 
cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/Waymo/infos_train_01sweeps_filter_zero_gt.pkl" +val_anno = "data/Waymo/infos_val_01sweeps_filter_zero_gt.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) + +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.003, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 3 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") +log_level = "INFO" +work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/cv/3d_detection/centerpoint/pytorch/configs/waymo/voxelnet/waymo_centerpoint_voxelnet_3x.py b/cv/3d_detection/centerpoint/pytorch/configs/waymo/voxelnet/waymo_centerpoint_voxelnet_3x.py new file mode 100644 index 0000000000000000000000000000000000000000..989cddee5713d684f9ede69222d42f9b849a7a0b --- /dev/null +++ 
# CenterPoint with the VoxelNet backbone on Waymo: one sweep, 36 epochs.
import itertools
import logging

from det3d.utils.config_tool import get_downsample_factor

# One detection task that predicts all three classes.
tasks = [
    {"num_class": 3, "class_names": ["VEHICLE", "PEDESTRIAN", "CYCLIST"]},
]

# Class names flattened across every task, in task order.
class_names = list(itertools.chain.from_iterable(t["class_names"] for t in tasks))

# training and testing settings
target_assigner = {"tasks": tasks}

# model settings
model = {
    "type": "VoxelNet",
    "pretrained": None,
    "reader": {"type": "VoxelFeatureExtractorV3", "num_input_features": 5},
    "backbone": {"type": "SpMiddleResNetFHD", "num_input_features": 5, "ds_factor": 8},
    "neck": {
        "type": "RPN",
        "layer_nums": [5, 5],
        "ds_layer_strides": [1, 2],
        "ds_num_filters": [128, 256],
        "us_layer_strides": [1, 2],
        "us_num_filters": [256, 256],
        "num_input_features": 256,
        "logger": logging.getLogger("RPN"),
    },
    "bbox_head": {
        "type": "CenterHead",
        "in_channels": sum([256, 256]),
        "tasks": tasks,
        "dataset": "waymo",
        "weight": 2,
        "code_weights": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        # head name -> (output_channel, num_conv)
        "common_heads": {"reg": (2, 2), "height": (1, 2), "dim": (3, 2), "rot": (2, 2)},
    },
}

assigner = {
    "target_assigner": target_assigner,
    "out_size_factor": get_downsample_factor(model),
    "dense_reg": 1,
    "gaussian_overlap": 0.1,
    "max_objs": 500,
    "min_radius": 2,
}

train_cfg = {"assigner": assigner}

test_cfg = {
    "post_center_limit_range": [-80, -80, -10.0, 80, 80, 10.0],
    "nms": {
        "use_rotate_nms": True,
        "use_multi_class_nms": False,
        "nms_pre_max_size": 4096,
        "nms_post_max_size": 500,
        "nms_iou_threshold": 0.7,
    },
    "score_threshold": 0.1,
    "pc_range": [-75.2, -75.2],
    "out_size_factor": get_downsample_factor(model),
    "voxel_size": [0.1, 0.1],
}

# dataset settings
dataset_type = "WaymoDataset"
nsweeps = 1
data_root = "data/Waymo"

# Ground-truth database sampling; switched off here via enable=False.
db_sampler = {
    "type": "GT-AUG",
    "enable": False,
    "db_info_path": "data/Waymo/dbinfos_train_1sweeps_withvelo.pkl",
    "sample_groups": [
        {"VEHICLE": 15},
        {"PEDESTRIAN": 10},
        {"CYCLIST": 10},
    ],
    "db_prep_steps": [
        {"filter_by_min_num_points": {"VEHICLE": 5, "PEDESTRIAN": 5, "CYCLIST": 5}},
        {"filter_by_difficulty": [-1]},
    ],
    "global_random_rotation_range_per_object": [0, 0],
    "rate": 1.0,
}

train_preprocessor = {
    "mode": "train",
    "shuffle_points": True,
    "global_rot_noise": [-0.78539816, 0.78539816],
    "global_scale_noise": [0.95, 1.05],
    "db_sampler": db_sampler,
    "class_names": class_names,
}

val_preprocessor = {"mode": "val", "shuffle_points": False}

voxel_generator = {
    "range": [-75.2, -75.2, -2, 75.2, 75.2, 4],
    "voxel_size": [0.1, 0.1, 0.15],
    "max_points_in_voxel": 5,
    "max_voxel_num": [150000, 200000],
}

train_pipeline = [
    {"type": "LoadPointCloudFromFile", "dataset": dataset_type},
    {"type": "LoadPointCloudAnnotations", "with_bbox": True},
    {"type": "Preprocess", "cfg": train_preprocessor},
    {"type": "Voxelization", "cfg": voxel_generator},
    {"type": "AssignLabel", "cfg": train_cfg["assigner"]},
    {"type": "Reformat"},
]
test_pipeline = [
    {"type": "LoadPointCloudFromFile", "dataset": dataset_type},
    {"type": "LoadPointCloudAnnotations", "with_bbox": True},
    {"type": "Preprocess", "cfg": val_preprocessor},
    {"type": "Voxelization", "cfg": voxel_generator},
    {"type": "AssignLabel", "cfg": train_cfg["assigner"]},
    {"type": "Reformat"},
]

train_anno = "data/Waymo/infos_train_01sweeps_filter_zero_gt.pkl"
val_anno = "data/Waymo/infos_val_01sweeps_filter_zero_gt.pkl"
test_anno = None

data = {
    "samples_per_gpu": 4,
    "workers_per_gpu": 4,
    "train": {
        "type": dataset_type,
        "root_path": data_root,
        "info_path": train_anno,
        "ann_file": train_anno,
        "nsweeps": nsweeps,
        "class_names": class_names,
        "pipeline": train_pipeline,
    },
    "val": {
        "type": dataset_type,
        "root_path": data_root,
        "info_path": val_anno,
        "test_mode": True,
        "ann_file": val_anno,
        "nsweeps": nsweeps,
        "class_names": class_names,
        "pipeline": test_pipeline,
    },
    # test_anno is None in this config, so the test split has no info file.
    "test": {
        "type": dataset_type,
        "root_path": data_root,
        "info_path": test_anno,
        "ann_file": test_anno,
        "nsweeps": nsweeps,
        "class_names": class_names,
        "pipeline": test_pipeline,
    },
}

optimizer_config = {"grad_clip": {"max_norm": 35, "norm_type": 2}}

# optimizer
optimizer = {
    "type": "adam",
    "amsgrad": 0.0,
    "wd": 0.01,
    "fixed_wd": True,
    "moving_average": False,
}
lr_config = {
    "type": "one_cycle",
    "lr_max": 0.003,
    "moms": [0.95, 0.85],
    "div_factor": 10.0,
    "pct_start": 0.4,
}

checkpoint_config = {"interval": 1}
# yapf:disable
log_config = {
    "interval": 5,
    "hooks": [
        {"type": "TextLoggerHook"},
    ],
}
# yapf:enable
# runtime settings
total_epochs = 36
device_ids = range(8)
dist_params = {"backend": "nccl", "init_method": "env://"}
log_level = "INFO"
# Work directory is named after this config file (basename without ".py").
work_dir = "./work_dirs/{}/".format(__file__[__file__.rfind("/") + 1:-3])
load_from = None
resume_from = None
workflow = [("train", 1)]
2], + ds_num_filters=[128, 256], + us_layer_strides=[1, 2], + us_num_filters=[256, 256], + num_input_features=256, + logger=logging.getLogger("RPN"), + ), + bbox_head=dict( + type="CenterHead", + in_channels=sum([256, 256]), + tasks=tasks, + dataset='waymo', + weight=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2)}, # (output_channel, num_conv) + ), +) + +assigner = dict( + target_assigner=target_assigner, + out_size_factor=get_downsample_factor(model), + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, +) + + +train_cfg = dict(assigner=assigner) + + +test_cfg = dict( + post_center_limit_range=[-80, -80, -10.0, 80, 80, 10.0], + nms=dict( + use_rotate_nms=True, + use_multi_class_nms=False, + nms_pre_max_size=4096, + nms_post_max_size=500, + nms_iou_threshold=0.7, + ), + score_threshold=0.1, + pc_range=[-75.2, -75.2], + out_size_factor=get_downsample_factor(model), + voxel_size=[0.1, 0.1], +) + + +# dataset settings +dataset_type = "WaymoDataset" +nsweeps = 1 +data_root = "data/Waymo" + +db_sampler = dict( + type="GT-AUG", + enable=False, + db_info_path="data/Waymo/dbinfos_train_1sweeps_withvelo.pkl", + sample_groups=[ + dict(VEHICLE=15), + dict(PEDESTRIAN=10), + dict(CYCLIST=10), + ], + db_prep_steps=[ + dict( + filter_by_min_num_points=dict( + VEHICLE=5, + PEDESTRIAN=5, + CYCLIST=5, + ) + ), + dict(filter_by_difficulty=[-1],), + ], + global_random_rotation_range_per_object=[0, 0], + rate=1.0, +) + +train_preprocessor = dict( + mode="train", + shuffle_points=True, + global_rot_noise=[-0.78539816, 0.78539816], + global_scale_noise=[0.95, 1.05], + db_sampler=db_sampler, + class_names=class_names, +) + +val_preprocessor = dict( + mode="val", + shuffle_points=False, +) + +voxel_generator = dict( + range=[-75.2, -75.2, -2, 75.2, 75.2, 4], + voxel_size=[0.1, 0.1, 0.15], + max_points_in_voxel=5, + max_voxel_num=150000, +) + +train_pipeline = [ + 
dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=train_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] +test_pipeline = [ + dict(type="LoadPointCloudFromFile", dataset=dataset_type), + dict(type="LoadPointCloudAnnotations", with_bbox=True), + dict(type="Preprocess", cfg=val_preprocessor), + dict(type="Voxelization", cfg=voxel_generator), + dict(type="AssignLabel", cfg=train_cfg["assigner"]), + dict(type="Reformat"), +] + +train_anno = "data/Waymo/infos_train_01sweeps_filter_zero_gt.pkl" +val_anno = "data/Waymo/infos_val_01sweeps_filter_zero_gt.pkl" +test_anno = None + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + info_path=train_anno, + ann_file=train_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=train_pipeline, + ), + val=dict( + type=dataset_type, + root_path=data_root, + info_path=val_anno, + test_mode=True, + ann_file=val_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), + test=dict( + type=dataset_type, + root_path=data_root, + info_path=test_anno, + ann_file=test_anno, + nsweeps=nsweeps, + class_names=class_names, + pipeline=test_pipeline, + ), +) + + + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) + +# optimizer +optimizer = dict( + type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, +) +lr_config = dict( + type="one_cycle", lr_max=0.003, moms=[0.95, 0.85], div_factor=10.0, pct_start=0.4, +) + +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=5, + hooks=[ + dict(type="TextLoggerHook"), + # dict(type='TensorboardLoggerHook') + ], +) +# yapf:enable +# runtime settings +total_epochs = 6 +device_ids = range(8) +dist_params = dict(backend="nccl", init_method="env://") 
# CenterPoint with the VoxelNet backbone on Waymo: two sweeps, velocity head,
# 36 epochs.
import itertools
import logging

from det3d.utils.config_tool import get_downsample_factor

# One detection task that predicts all three classes.
tasks = [
    {"num_class": 3, "class_names": ["VEHICLE", "PEDESTRIAN", "CYCLIST"]},
]

# Class names flattened across every task, in task order.
class_names = list(itertools.chain.from_iterable(t["class_names"] for t in tasks))

# training and testing settings
target_assigner = {"tasks": tasks}

# model settings
model = {
    "type": "VoxelNet",
    "pretrained": None,
    "reader": {"type": "VoxelFeatureExtractorV3", "num_input_features": 6},
    "backbone": {"type": "SpMiddleResNetFHD", "num_input_features": 6, "ds_factor": 8},
    "neck": {
        "type": "RPN",
        "layer_nums": [5, 5],
        "ds_layer_strides": [1, 2],
        "ds_num_filters": [128, 256],
        "us_layer_strides": [1, 2],
        "us_num_filters": [256, 256],
        "num_input_features": 256,
        "logger": logging.getLogger("RPN"),
    },
    "bbox_head": {
        "type": "CenterHead",
        "in_channels": sum([256, 256]),
        "tasks": tasks,
        "dataset": "waymo",
        "weight": 2,
        "code_weights": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0],
        # head name -> (output_channel, num_conv)
        "common_heads": {
            "reg": (2, 2),
            "height": (1, 2),
            "dim": (3, 2),
            "rot": (2, 2),
            "vel": (2, 2),
        },
    },
}

assigner = {
    "target_assigner": target_assigner,
    "out_size_factor": get_downsample_factor(model),
    "dense_reg": 1,
    "gaussian_overlap": 0.1,
    "max_objs": 500,
    "min_radius": 2,
}

train_cfg = {"assigner": assigner}

test_cfg = {
    "post_center_limit_range": [-80, -80, -10.0, 80, 80, 10.0],
    "nms": {
        "use_rotate_nms": True,
        "use_multi_class_nms": False,
        "nms_pre_max_size": 4096,
        "nms_post_max_size": 500,
        "nms_iou_threshold": 0.7,
    },
    "score_threshold": 0.1,
    "pc_range": [-75.2, -75.2],
    "out_size_factor": get_downsample_factor(model),
    "voxel_size": [0.1, 0.1],
}

# dataset settings
dataset_type = "WaymoDataset"
nsweeps = 2
data_root = "data/Waymo"

# Ground-truth database sampling; switched off here via enable=False.
db_sampler = {
    "type": "GT-AUG",
    "enable": False,
    "db_info_path": "data/Waymo/dbinfos_train_2sweeps_withvelo.pkl",
    "sample_groups": [
        {"VEHICLE": 15},
        {"PEDESTRIAN": 10},
        {"CYCLIST": 10},
    ],
    "db_prep_steps": [
        {"filter_by_min_num_points": {"VEHICLE": 5, "PEDESTRIAN": 5, "CYCLIST": 5}},
        {"filter_by_difficulty": [-1]},
    ],
    "global_random_rotation_range_per_object": [0, 0],
    "rate": 1.0,
}

train_preprocessor = {
    "mode": "train",
    "shuffle_points": True,
    "global_rot_noise": [-0.78539816, 0.78539816],
    "global_scale_noise": [0.95, 1.05],
    "db_sampler": db_sampler,
    "class_names": class_names,
}

val_preprocessor = {"mode": "val", "shuffle_points": False}

voxel_generator = {
    "range": [-75.2, -75.2, -2, 75.2, 75.2, 4],
    "voxel_size": [0.1, 0.1, 0.15],
    "max_points_in_voxel": 5,
    "max_voxel_num": [180000, 400000],
}

train_pipeline = [
    {"type": "LoadPointCloudFromFile", "dataset": dataset_type},
    {"type": "LoadPointCloudAnnotations", "with_bbox": True},
    {"type": "Preprocess", "cfg": train_preprocessor},
    {"type": "Voxelization", "cfg": voxel_generator},
    {"type": "AssignLabel", "cfg": train_cfg["assigner"]},
    {"type": "Reformat"},
]
test_pipeline = [
    {"type": "LoadPointCloudFromFile", "dataset": dataset_type},
    {"type": "LoadPointCloudAnnotations", "with_bbox": True},
    {"type": "Preprocess", "cfg": val_preprocessor},
    {"type": "Voxelization", "cfg": voxel_generator},
    {"type": "AssignLabel", "cfg": train_cfg["assigner"]},
    {"type": "Reformat"},
]

train_anno = "data/Waymo/infos_train_02sweeps_filter_zero_gt.pkl"
val_anno = "data/Waymo/infos_val_02sweeps_filter_zero_gt.pkl"
test_anno = None

data = {
    "samples_per_gpu": 4,
    "workers_per_gpu": 4,
    "train": {
        "type": dataset_type,
        "root_path": data_root,
        "info_path": train_anno,
        "ann_file": train_anno,
        "nsweeps": nsweeps,
        "class_names": class_names,
        "pipeline": train_pipeline,
    },
    "val": {
        "type": dataset_type,
        "root_path": data_root,
        "info_path": val_anno,
        "test_mode": True,
        "ann_file": val_anno,
        "nsweeps": nsweeps,
        "class_names": class_names,
        "pipeline": test_pipeline,
    },
    # test_anno is None in this config, so the test split has no info file.
    "test": {
        "type": dataset_type,
        "root_path": data_root,
        "info_path": test_anno,
        "ann_file": test_anno,
        "nsweeps": nsweeps,
        "class_names": class_names,
        "pipeline": test_pipeline,
    },
}

optimizer_config = {"grad_clip": {"max_norm": 35, "norm_type": 2}}

# optimizer
optimizer = {
    "type": "adam",
    "amsgrad": 0.0,
    "wd": 0.01,
    "fixed_wd": True,
    "moving_average": False,
}
lr_config = {
    "type": "one_cycle",
    "lr_max": 0.003,
    "moms": [0.95, 0.85],
    "div_factor": 10.0,
    "pct_start": 0.4,
}

checkpoint_config = {"interval": 1}
# yapf:disable
log_config = {
    "interval": 5,
    "hooks": [
        {"type": "TextLoggerHook"},
    ],
}
# yapf:enable
# runtime settings
total_epochs = 36
device_ids = range(8)
dist_params = {"backend": "nccl", "init_method": "env://"}
log_level = "INFO"
# Work directory is named after this config file (basename without ".py").
work_dir = "./work_dirs/{}/".format(__file__[__file__.rfind("/") + 1:-3])
load_from = None
resume_from = None
workflow = [("train", 1)]
def build_voxel_generator(voxel_config):
    """Build a ``VoxelGenerator`` from a voxel config.

    Args:
        voxel_config: object exposing ``VOXEL_SIZE``, ``RANGE`` and
            ``MAX_POINTS_NUM_PER_VOXEL`` attributes.

    Returns:
        The constructed ``VoxelGenerator``. ``max_voxels`` is fixed at 20000.
    """
    return VoxelGenerator(
        voxel_size=voxel_config.VOXEL_SIZE,
        point_cloud_range=voxel_config.RANGE,
        max_num_points=voxel_config.MAX_POINTS_NUM_PER_VOXEL,
        max_voxels=20000,
    )


def build_db_preprocess(db_prep_config, logger=None):
    """Create the database filter described by one ``db_prep_steps`` entry.

    Args:
        db_prep_config: mapping containing either ``filter_by_difficulty`` or
            ``filter_by_min_num_points``; the value is passed to the filter.
        logger: logger handed to the filter. When omitted, the
            ``"build_db_preprocess"`` logger is used.

    Returns:
        A ``DBFilterByDifficulty`` or ``DBFilterByMinNumPoint`` instance.

    Raises:
        ValueError: if neither supported key is present.
    """
    # Only fall back to the default logger when the caller gave none; the
    # previous code silently replaced any logger that was passed in.
    if logger is None:
        logger = logging.getLogger("build_db_preprocess")

    cfg = db_prep_config
    if "filter_by_difficulty" in cfg:
        return prep.DBFilterByDifficulty(cfg["filter_by_difficulty"], logger=logger)
    if "filter_by_min_num_points" in cfg:
        return prep.DBFilterByMinNumPoint(
            cfg["filter_by_min_num_points"], logger=logger
        )
    raise ValueError("unknown database prep type")


def children(m: nn.Module):
    "Get children of `m`."
    return list(m.children())


def num_children(m: nn.Module) -> int:
    "Get number of children modules in `m`."
    return len(children(m))


def flatten_model(m: nn.Module):
    """Return the leaf modules of `m` (modules without children), depth first."""
    return sum(map(flatten_model, m.children()), []) if num_children(m) else [m]


def get_layer_groups(m: nn.Module):
    """Return a one-element list holding all leaf modules of `m` in one ``nn.Sequential``."""
    return [nn.Sequential(*flatten_model(m))]


def build_optimizer(optimizer_config, net, name=None, mixed=False, loss_scale=512.0):
    """Create an ``OptimWrapper`` optimizer for `net` based on config.

    Args:
        optimizer_config: object exposing ``TYPE`` (one of
            ``"rms_prop_optimizer"``, ``"momentum_optimizer"``, ``"adam"``),
            ``VALUE`` (the per-optimizer settings), ``FIXED_WD`` and
            ``MOVING_AVERAGE``.
        net: the ``nn.Module`` whose leaf modules are optimized.
        name: name stored on the optimizer for the checkpoint system;
            defaults to the optimizer type.
        mixed: accepted for interface compatibility; not used here.
        loss_scale: accepted for interface compatibility; not used here.

    Returns:
        The wrapped optimizer, with its ``name`` attribute set.

    Raises:
        ValueError: if the optimizer type is unsupported or moving average
            is requested.
    """
    optimizer_type = optimizer_config.TYPE
    config = optimizer_config.VALUE

    if optimizer_type == "rms_prop_optimizer":
        optimizer_func = partial(
            torch.optim.RMSprop,
            alpha=config.decay,
            momentum=config.momentum_optimizer_value,
            eps=config.epsilon,
        )
    elif optimizer_type == "momentum_optimizer":
        # torch.optim.SGD has no ``eps`` argument; forwarding one made this
        # branch fail with TypeError as soon as the optimizer was built.
        optimizer_func = partial(
            torch.optim.SGD,
            momentum=config.momentum_optimizer_value,
        )
    elif optimizer_type == "adam":
        if optimizer_config.FIXED_WD:
            optimizer_func = partial(
                torch.optim.Adam, betas=(0.9, 0.99), amsgrad=config.amsgrad
            )
        else:
            # regular adam
            optimizer_func = partial(torch.optim.Adam, amsgrad=config.amsgrad)
    else:
        # Reject unknown types here; previously execution continued with
        # ``optimizer_func`` unbound and died with UnboundLocalError.
        raise ValueError("Optimizer %s not supported." % optimizer_type)

    optimizer = OptimWrapper.create(
        optimizer_func,
        3e-3,
        get_layer_groups(net),
        wd=config.WD,
        true_wd=optimizer_config.FIXED_WD,
        bn_wd=True,
    )

    if optimizer_config.MOVING_AVERAGE:
        raise ValueError("torch don't support moving average")

    # assign a name to optimizer for checkpoint system
    optimizer.name = optimizer_type if name is None else name

    return optimizer


def build_lr_scheduler(optimizer, optimizer_config, total_step):
    """Create lr scheduler based on config. note that
    lr_scheduler must accept a optimizer that has been restored.

    Args:
        optimizer: the (restored) optimizer the scheduler will drive.
        optimizer_config: config object whose ``type`` names the optimizer
            (``"rms_prop_optimizer"``, ``"momentum_optimizer"`` or ``"adam"``).
        total_step: total number of training steps.

    Returns:
        The scheduler built by ``_create_learning_rate_scheduler``.

    Raises:
        ValueError: if the optimizer type is unsupported, or if the config
            is rejected by ``_create_learning_rate_scheduler``.
    """
    optimizer_type = optimizer_config.type
    if optimizer_type not in ("rms_prop_optimizer", "momentum_optimizer", "adam"):
        # Previously fell through and hit UnboundLocalError on return.
        raise ValueError("Optimizer %s not supported." % optimizer_type)

    # The callee's signature is (optimizer, learning_rate_config, total_step);
    # the old call passed the first two arguments in the opposite order.
    # NOTE(review): the same object supplies ``type`` for the optimizer check
    # above and for the learning-rate dispatch in the callee, which accepts
    # only schedule names -- confirm the intended config schema.
    return _create_learning_rate_scheduler(
        optimizer, optimizer_config, total_step=total_step
    )


def _create_learning_rate_scheduler(optimizer, learning_rate_config, total_step):
    """Create optimizer learning rate scheduler based on config.

    Args:
        optimizer: optimizer handed to the scheduler constructor.
        learning_rate_config: config object whose ``type`` selects the
            schedule (``"multi_phase"``, ``"one_cycle"``,
            ``"exponential_decay"`` or ``"manual_stepping"``) and which
            carries that schedule's settings as attributes.
        total_step: total number of training steps.

    Returns:
        The scheduler instance from ``learning_schedules_fastai``.

    Raises:
        ValueError: if ``learning_rate_config.type`` is not supported.
    """
    config = learning_rate_config
    learning_rate_type = config.type

    if learning_rate_type == "multi_phase":
        lr_phases = []
        mom_phases = []
        for phase_cfg in config.phases:
            lr_phases.append((phase_cfg.start, phase_cfg.lambda_func))
            mom_phases.append((phase_cfg.start, phase_cfg.momentum_lambda_func))
        lr_scheduler = lsf.LRSchedulerStep(optimizer, total_step, lr_phases, mom_phases)
    elif learning_rate_type == "one_cycle":
        lr_scheduler = lsf.OneCycle(
            optimizer,
            total_step,
            config.lr_max,
            config.moms,
            config.div_factor,
            config.pct_start,
        )
    elif learning_rate_type == "exponential_decay":
        lr_scheduler = lsf.ExponentialDecay(
            optimizer,
            total_step,
            config.initial_learning_rate,
            config.decay_length,
            config.decay_factor,
            config.staircase,
        )
    elif learning_rate_type == "manual_stepping":
        lr_scheduler = lsf.ManualStepping(
            optimizer, total_step, config.boundaries, config.rates
        )
    else:
        raise ValueError("Learning_rate %s not supported." % learning_rate_type)

    return lr_scheduler


def build_dbsampler(cfg, logger=None):
    """Build the ground-truth database sampler described by `cfg`.

    Args:
        cfg: object exposing ``db_prep_steps``, ``rate``,
            ``global_random_rotation_range_per_object``, ``sample_groups``
            and ``db_info_path``.
        logger: logger for the sampler and its filters. When omitted, the
            sampler uses the ``"build_dbsampler"`` logger and each filter
            uses the ``build_db_preprocess`` default.

    Returns:
        A ``DataBaseSamplerV2`` over the pickled database infos.
    """
    # Forward the caller's logger (possibly None) so each filter applies its
    # own default; the previous code discarded a caller-supplied logger.
    prepors = [build_db_preprocess(c, logger=logger) for c in cfg.db_prep_steps]
    if logger is None:
        logger = logging.getLogger("build_dbsampler")
    db_prepor = DataBasePreprocessor(prepors)

    # NOTE: pickle.load can execute arbitrary code contained in the file;
    # ``db_info_path`` must point to a trusted database file.
    with open(cfg.db_info_path, "rb") as f:
        db_infos = pickle.load(f)

    # An empty rotation range means "no per-object rotation".
    grot_range = list(cfg.global_random_rotation_range_per_object)
    if len(grot_range) == 0:
        grot_range = None

    return DataBaseSamplerV2(
        db_infos, cfg.sample_groups, db_prepor, cfg.rate, grot_range, logger=logger
    )
def points_count_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0.5)):
    """Count, for every rotated 3D box, the points passed to the polygon counter.

    Args:
        points: array whose first three columns are used as xyz.
        rbbox: array whose columns 0:3 are centers, 3:6 are sizes and whose
            last column is the rotation angle.
        z_axis: rotation axis forwarded to ``center_to_corner_box3d``.
        origin: box origin forwarded to ``center_to_corner_box3d``.

    Returns:
        The result of ``points_count_convex_polygon_3d_jit``.
    """
    centers = rbbox[:, :3]
    sizes = rbbox[:, 3:6]
    yaws = rbbox[:, -1]
    corners = center_to_corner_box3d(centers, sizes, yaws, origin=origin, axis=z_axis)
    box_surfaces = corner_to_surfaces_3d(corners)
    xyz = points[:, :3]
    return points_count_convex_polygon_3d_jit(xyz, box_surfaces)
10x slower than gpu + boxes_corners = center_to_corner_box2d( + rbboxes[:, :2], rbboxes[:, 2:4], rbboxes[:, 4] + ) + boxes_standup = corner_to_standup_nd(boxes_corners) + qboxes_corners = center_to_corner_box2d( + qrbboxes[:, :2], qrbboxes[:, 2:4], qrbboxes[:, 4] + ) + qboxes_standup = corner_to_standup_nd(qboxes_corners) + # if standup box not overlapped, rbbox not overlapped too. + standup_iou = iou_jit(boxes_standup, qboxes_standup, eps=0.0) + return rbbox_intersection( + boxes_corners, qboxes_corners, standup_iou, standup_thresh + ) + + +def corners_nd(dims, origin=0.5): + """generate relative box corners based on length per dim and + origin point. + + Args: + dims (float array, shape=[N, ndim]): array of length per dim + origin (list or array or float): origin point relate to smallest point. + + Returns: + float array, shape=[N, 2 ** ndim, ndim]: returned corners. + point layout example: (2d) x0y0, x0y1, x1y0, x1y1; + (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + where x0 < x1, y0 < y1, z0 < z1 + """ + ndim = int(dims.shape[1]) + corners_norm = np.stack( + np.unravel_index(np.arange(2 ** ndim), [2] * ndim), axis=1 + ).astype(dims.dtype) + # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 + # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + # so need to convert to a format which is convenient to do other computing. + # for 2d boxes, format is clockwise start with minimum point + # for 3d boxes, please draw lines by your hand. 
def corner_to_standup_nd(boxes_corner):
    """Reduce box corners to axis-aligned (min..., max...) boxes.

    Args:
        boxes_corner: 3-D array; the minimum and maximum are taken over
            axis 1 (the corner axis).

    Returns:
        Array with the per-box minima followed by the per-box maxima,
        concatenated along the last axis.
    """
    assert len(boxes_corner.shape) == 3
    lower = np.min(boxes_corner, axis=1)
    upper = np.max(boxes_corner, axis=1)
    return np.concatenate((lower, upper), -1)
def rotation_3d_in_axis(points, angles, axis=0):
    """Rotate each group of 3D points around one coordinate axis.

    Args:
        points: array of shape [N, point_size, 3].
        angles: array of shape [N]; one angle per point group.
        axis: rotation axis, one of 0, 1, 2 (or -1 as an alias of 2).

    Returns:
        Array with the same shape as ``points``.

    Raises:
        ValueError: if ``axis`` is not one of the supported values.
    """
    rot_sin = np.sin(angles)
    rot_cos = np.cos(angles)
    ones = np.ones_like(rot_cos)
    zeros = np.zeros_like(rot_cos)
    if axis == 1:
        rot_mat_T = np.stack(
            [
                [rot_cos, zeros, -rot_sin],
                [zeros, ones, zeros],
                [rot_sin, zeros, rot_cos],
            ]
        )
    elif axis in (2, -1):
        rot_mat_T = np.stack(
            [
                [rot_cos, -rot_sin, zeros],
                [rot_sin, rot_cos, zeros],
                [zeros, zeros, ones],
            ]
        )
    elif axis == 0:
        # Same matrix as rotation_points_single_angle uses for axis 0. The
        # previous rows were a z-rotation with permuted axes, which moved the
        # coordinate along the rotation axis instead of leaving it unchanged.
        rot_mat_T = np.stack(
            [
                [ones, zeros, zeros],
                [zeros, rot_cos, -rot_sin],
                [zeros, rot_sin, rot_cos],
            ]
        )
    else:
        raise ValueError("axis should in range")

    # rot_mat_T has shape [3, 3, N]; contract the coordinate axis per group.
    return np.einsum("aij,jka->aik", points, rot_mat_T)
def rotation_box(box_corners, angle):
    """Rotate 2D corners about the origin by a single angle.

    A positive angle rotates clockwise.

    Args:
        box_corners (float array, last dim 2): corners to rotate.
        angle (float): rotation angle.

    Returns:
        float array: same shape as ``box_corners``.
    """
    sin_a, cos_a = np.sin(angle), np.cos(angle)
    transposed_rotation = np.array(
        [[cos_a, -sin_a], [sin_a, cos_a]], dtype=box_corners.dtype
    )
    rotated = box_corners @ transposed_rotation
    return rotated
def rbbox3d_to_corners(rbboxes, origin=(0.5, 0.5, 0.5), axis=2):
    """Convert rotated 3D boxes to their corner points.

    Args:
        rbboxes: array whose last dimension holds center (0:3), size (3:6)
            and rotation angle (index 6).
        origin: box origin forwarded to ``center_to_corner_box3d``. The
            default is an immutable tuple, matching that function's default.
        axis: rotation axis forwarded to ``center_to_corner_box3d``.

    Returns:
        The corners returned by ``center_to_corner_box3d``.
    """
    return center_to_corner_box3d(
        rbboxes[..., :3], rbboxes[..., 3:6], rbboxes[..., 6], origin, axis=axis
    )
def minmax_to_center_2d(minmax_box):
    """Convert (min..., max...) boxes to (center..., size...) boxes.

    Args:
        minmax_box: array whose last dimension is split in half into the
            minimum and maximum coordinates.

    Returns:
        Array with centers followed by sizes along the last axis.
    """
    half = minmax_box.shape[-1] // 2
    lower = minmax_box[..., :half]
    extent = minmax_box[..., half:] - lower
    midpoint = lower + 0.5 * extent
    return np.concatenate([midpoint, extent], axis=-1)
def add_rgb_to_points(points, image, rect, Trv2c, P2, mean_size=(5, 5)):
    """Look up an image colour for every lidar point.

    Points are moved to the camera frame, projected with ``P2`` and the pixel
    under each projection is copied by ``_add_rgb_to_points_kernel``.

    Args:
        points: array whose first three columns are lidar xyz.
        image: image indexed as ``image[row, col, channel]``.
        rect: rectification matrix passed to ``lidar_to_camera``.
        Trv2c: lidar-to-camera matrix passed to ``lidar_to_camera``.
        P2: projection matrix passed to ``project_to_image``.
        mean_size: kept for backward compatibility only. It sized a smoothing
            kernel whose application was already disabled, so it has no
            effect.

    Returns:
        Array of shape [num_points, 3] with the dtype of ``points``; rows stay
        zero for points the kernel does not fill.
    """
    # The mean-filter kernel built from ``mean_size`` was never applied (the
    # cv2.filter2D call was commented out), so that dead computation is gone.
    points_cam = lidar_to_camera(points[:, :3], rect, Trv2c)
    points_2d = project_to_image(points_cam, P2)
    points_rgb = np.zeros([points_cam.shape[0], 3], dtype=points.dtype)
    _add_rgb_to_points_kernel(points_2d, image, points_rgb)
    return points_rgb
def lidar_to_camera(points, r_rect, velo2cam):
    """Transform points with the combined ``r_rect @ velo2cam`` matrix.

    Args:
        points: array whose last dimension is 3 (a homogeneous 1 is appended)
            or already homogeneous.
        r_rect: rectification matrix.
        velo2cam: lidar-to-camera matrix.

    Returns:
        The first three coordinates of the transformed points.
    """
    if points.shape[-1] == 3:
        # Append the homogeneous coordinate.
        ones_column = np.ones(list(points.shape[:-1]) + [1])
        points = np.concatenate([points, ones_column], axis=-1)
    transform_T = (r_rect @ velo2cam).T
    return (points @ transform_T)[..., :3]
+ Parameters + ---------- + boxes: (N, 4) ndarray of float + query_boxes: (K, 4) ndarray of float + Returns + ------- + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + box_area = (query_boxes[k, 2] - query_boxes[k, 0] + eps) * ( + query_boxes[k, 3] - query_boxes[k, 1] + eps + ) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) + - max(boxes[n, 0], query_boxes[k, 0]) + + eps + ) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) + - max(boxes[n, 1], query_boxes[k, 1]) + + eps + ) + if ih > 0: + ua = ( + (boxes[n, 2] - boxes[n, 0] + eps) + * (boxes[n, 3] - boxes[n, 1] + eps) + + box_area + - iw * ih + ) + overlaps[n, k] = iw * ih / ua + return overlaps + + +@numba.jit(nopython=True) +def iou_3d_jit(boxes, query_boxes, add1=True): + """calculate box iou3d, + ---------- + boxes: (N, 6) ndarray of float + query_boxes: (K, 6) ndarray of float + Returns + ------- + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + if add1: + add1 = 1.0 + else: + add1 = 0.0 + for k in range(K): + box_area = ( + (query_boxes[k, 3] - query_boxes[k, 0] + add1) + * (query_boxes[k, 4] - query_boxes[k, 1] + add1) + * (query_boxes[k, 5] - query_boxes[k, 2] + add1) + ) + for n in range(N): + iw = ( + min(boxes[n, 3], query_boxes[k, 3]) + - max(boxes[n, 0], query_boxes[k, 0]) + + add1 + ) + if iw > 0: + ih = ( + min(boxes[n, 4], query_boxes[k, 4]) + - max(boxes[n, 1], query_boxes[k, 1]) + + add1 + ) + if ih > 0: + il = ( + min(boxes[n, 5], query_boxes[k, 5]) + - max(boxes[n, 2], query_boxes[k, 2]) + + add1 + ) + if il > 0: + ua = float( + (boxes[n, 3] - boxes[n, 0] + add1) + * (boxes[n, 4] - boxes[n, 1] + add1) + * (boxes[n, 5] - boxes[n, 2] + add1) + + box_area + - iw * ih * il + ) + overlaps[n, k] = iw * ih * 
def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0.5)):
    """Test which points lie inside which rotated 3D boxes.

    Args:
        points: array whose first three columns are used as xyz.
        rbbox: array whose columns 0:3 are centers, 3:6 are sizes and whose
            last column is the rotation angle.
        z_axis: rotation axis forwarded to ``center_to_corner_box3d``.
        origin: box origin forwarded to ``center_to_corner_box3d``.

    Returns:
        The result of ``points_in_convex_polygon_3d_jit``.
    """
    centers, sizes, yaws = rbbox[:, :3], rbbox[:, 3:6], rbbox[:, -1]
    box_surfaces = corner_to_surfaces_3d(
        center_to_corner_box3d(centers, sizes, yaws, origin=origin, axis=z_axis)
    )
    return points_in_convex_polygon_3d_jit(points[:, :3], box_surfaces)
@numba.jit(nopython=True)
def corner_to_surfaces_3d_jit(corners):
    """Gather the six four-corner faces of every 3D box.

    The corner order within each face is fixed by ``corner_idxes`` below; per
    the original author it is chosen so that all face normals point into the
    box, which requires corners produced by the corner functions of this
    module.

    Args:
        corners (float array, [N, 8, 3]): 3d box corners.
    Returns:
        surfaces (float array, [N, 6, 4, 3]): for each box, 6 faces of
            4 corner points each.
    """
    # corners: [N, 8, 3], must come from the corner functions in this module
    num_boxes = corners.shape[0]
    surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype)
    # Row j lists the four corner indices that make up face j.
    corner_idxes = np.array(
        [0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7]
    ).reshape(6, 4)
    # Explicit loops: copy each selected corner into its face slot.
    for i in range(num_boxes):
        for j in range(6):
            for k in range(4):
                surfaces[i, j, k] = corners[i, corner_idxes[j, k]]
    return surfaces
+ """ + voxel_size = np.array(voxel_size, dtype=gt_boxes.dtype) + coors_range = np.array(coors_range, dtype=gt_boxes.dtype) + shift = coors_range[:3] + voxel_origins = coors[:, ::-1] * voxel_size + shift + voxel_centers = voxel_origins + voxel_size * 0.5 + gt_box_corners = center_to_corner_box3d( + gt_boxes[:, :3] - voxel_size * 0.5, + gt_boxes[:, 3:6] + voxel_size, + gt_boxes[:, 6], + origin=[0.5, 0.5, 0.5], + axis=2, + ) + gt_surfaces = corner_to_surfaces_3d(gt_box_corners) + ret = points_in_convex_polygon_3d_jit(voxel_centers, gt_surfaces) + return np.any(ret, axis=1).astype(np.int64) + + +def assign_label_to_voxel_v3(gt_boxes, coors, voxel_size, coors_range): + """assign a 0/1 label to each voxel based on whether + the center of voxel is in gt_box. LIDAR. + """ + voxel_size = np.array(voxel_size, dtype=gt_boxes.dtype) + coors_range = np.array(coors_range, dtype=gt_boxes.dtype) + shift = coors_range[:3] + voxel_origins = coors[:, ::-1] * voxel_size + shift + voxel_maxes = voxel_origins + voxel_size + voxel_minmax = np.concatenate([voxel_origins, voxel_maxes], axis=-1) + voxel_corners = minmax_to_corner_3d(voxel_minmax) + gt_box_corners = center_to_corner_box3d( + gt_boxes[:, :3], + gt_boxes[:, 3:6], + gt_boxes[:, 6], + origin=[0.5, 0.5, 0.5], + axis=2, + ) + gt_surfaces = corner_to_surfaces_3d(gt_box_corners) + voxel_corners_flat = voxel_corners.reshape([-1, 3]) + ret = points_in_convex_polygon_3d_jit(voxel_corners_flat, gt_surfaces) + ret = ret.reshape([-1, 8, ret.shape[-1]]) + return ret.any(-1).any(-1).astype(np.int64) + + +def image_box_region_area(img_cumsum, bbox): + """check a 2d voxel is contained by a box. used to filter empty + anchors. + Summed-area table algorithm: + ==> W + ------------------ + | | | + |------A---------B + | | | + | | | + |----- C---------D + Iabcd = ID-IB-IC+IA + Args: + img_cumsum: [M, H, W](yx) cumsumed image. 
def box3d_to_bbox(box3d, rect, Trv2c, P2):
    """Project lidar 3D boxes to axis-aligned 2D boxes in the image.

    Args:
        box3d: lidar boxes; columns 0:3 center, 3:6 size, 6 rotation.
        rect: rectification matrix passed to ``box_lidar_to_camera``.
        Trv2c: lidar-to-camera matrix passed to ``box_lidar_to_camera``.
        P2: projection matrix passed to ``project_to_image``.

    Returns:
        Array [N, 4] of (min x, min y, max x, max y) over the projected
        corners of each box.
    """
    box3d_cam = box_lidar_to_camera(box3d, rect, Trv2c)
    # Corners must be built from the camera-frame box: the origin
    # [0.5, 1.0, 0.5] and axis=1 are the camera conventions, and P2 projects
    # camera coordinates. The converted box was previously computed but then
    # ignored in favour of the raw lidar box.
    box_corners = center_to_corner_box3d(
        box3d_cam[:, :3],
        box3d_cam[:, 3:6],
        box3d_cam[:, 6],
        [0.5, 1.0, 0.5],
        axis=1,
    )
    box_corners_in_image = project_to_image(box_corners, P2)
    # box_corners_in_image: [N, 8, 2]
    minxy = np.min(box_corners_in_image, axis=1)
    maxxy = np.max(box_corners_in_image, axis=1)
    return np.concatenate([minxy, maxxy], axis=1)
def torch_to_np_dtype(ttype):
    """Return the numpy dtype corresponding to a torch dtype.

    Args:
        ttype: a ``torch.dtype``.

    Returns:
        The matching ``numpy.dtype``.

    Raises:
        KeyError: if ``ttype`` has no entry in the mapping.
    """
    type_map = {
        torch.float16: np.dtype(np.float16),
        torch.float32: np.dtype(np.float32),
        # float64 and the remaining common integer/bool types were missing,
        # so e.g. double-precision box tensors raised KeyError.
        torch.float64: np.dtype(np.float64),
        torch.int8: np.dtype(np.int8),
        torch.int16: np.dtype(np.int16),
        torch.int32: np.dtype(np.int32),
        torch.int64: np.dtype(np.int64),
        torch.uint8: np.dtype(np.uint8),
        torch.bool: np.dtype(np.bool_),
    }
    return type_map[ttype]
def corner_to_standup_nd(boxes_corner):
    """Reduce box corners to axis-aligned (min..., max...) boxes.

    Args:
        boxes_corner: tensor indexed as [box, corner, coordinate].

    Returns:
        Tensor whose columns are the per-coordinate minima over the corners
        followed by the per-coordinate maxima.
    """
    coord_axes = range(boxes_corner.shape[2])
    minima = [torch.min(boxes_corner[:, :, a], dim=1)[0] for a in coord_axes]
    maxima = [torch.max(boxes_corner[:, :, a], dim=1)[0] for a in coord_axes]
    return torch.stack(minima + maxima, dim=1)
def rotation_2d(points, angles):
    """Rotate groups of 2D points about the origin, one angle per group.

    A positive angle rotates clockwise.

    Args:
        points (float tensor, shape=[N, point_size, 2]): points to rotate.
        angles (float tensor, shape=[N]): rotation angle of each group.

    Returns:
        float tensor: same shape as points
    """
    sin_a = torch.sin(angles)
    cos_a = torch.cos(angles)
    top_row = torch.stack([cos_a, -sin_a])
    bottom_row = torch.stack([sin_a, cos_a])
    # Shape [2, 2, N]: one transposed rotation matrix per point group.
    rot_mat_T = torch.stack([top_row, bottom_row])
    return torch.einsum("aij,jka->aik", points, rot_mat_T)
def project_to_image(points_3d, proj_mat):
    """Project 3D points with a projection matrix and dehomogenise.

    Args:
        points_3d: tensor whose last dimension holds xyz; any number of
            leading dimensions (including none, i.e. a single point).
        proj_mat: 2-D projection matrix applied as ``points_4 @ proj_mat.t()``.

    Returns:
        Tensor with the leading dimensions of ``points_3d`` and a last
        dimension of 2: the first two projected coordinates divided by the
        third.
    """
    # Build the homogeneous column directly on the input's device/dtype. The
    # previous numpy shape round-trip produced a float size for 1-D input
    # (making torch.ones raise) and always allocated on the CPU first.
    ones = points_3d.new_ones((*points_3d.shape[:-1], 1))
    points_4 = torch.cat([points_3d, ones], dim=-1)
    point_2d = torch.matmul(points_4, proj_mat.t())
    return point_2d[..., :2] / point_2d[..., 2:3]
def rotate_nms_pcdet(boxes, scores, thresh, pre_maxsize=None, post_max_size=None):
    """Run rotated NMS through the ``iou3d_nms_cuda`` extension.

    :param boxes: (N, 7) [x, y, z, l, w, h, theta]
    :param scores: (N)
    :param thresh: overlap threshold forwarded to ``iou3d_nms_cuda.nms_gpu``
    :param pre_maxsize: if given, only the top-scoring ``pre_maxsize`` boxes
        are passed to NMS
    :param post_max_size: if given, at most this many kept indices are returned
    :return: indices into the original ``boxes``/``scores`` of the kept boxes,
        ordered by descending score
    """
    # transform back to pcdet's coordinate
    # (fancy indexing copies, so the caller's tensor is not modified below)
    boxes = boxes[:, [0, 1, 2, 4, 3, 5, -1]]
    boxes[:, -1] = -boxes[:, -1] - np.pi / 2

    order = scores.sort(0, descending=True)[1]
    if pre_maxsize is not None:
        order = order[:pre_maxsize]

    boxes = boxes[order].contiguous()

    # Output buffer for the extension; only the first num_out entries are read.
    # NOTE(review): nms_gpu presumably fills `keep` and returns the number of
    # kept boxes - confirm against the extension source.
    keep = torch.empty(boxes.size(0), dtype=torch.long)

    if len(boxes) == 0:
        num_out = 0
    else:
        num_out = iou3d_nms_cuda.nms_gpu(boxes, keep, thresh)

    # Move the kept indices to wherever `order` lives instead of the current
    # default CUDA device, so empty inputs on CPU and tensors on a non-default
    # GPU are indexed correctly.
    selected = order[keep[:num_out].to(order.device)].contiguous()

    if post_max_size is not None:
        selected = selected[:post_max_size]

    return selected
def points_count_convex_polygon_3d_jit(points, polygon_surfaces, num_surfaces=None):
    """Count, for every 3d convex polygon, the points that lie inside it.

    Args:
        points: [num_points, 3] array.
        polygon_surfaces: [num_polygon, max_num_surfaces,
            max_num_points_of_surface, 3]
            array. all surfaces' normal vector must direct to internal.
            max_num_points_of_surface must at least 3.
        num_surfaces: [num_polygon] array. indicate how many surfaces
            a polygon contain. When omitted, every surface slot is used.
    Returns:
        [num_polygon] int64 array with the number of points inside each
        polygon.
    """
    if num_surfaces is None:
        # Sentinel larger than any realistic surface count, so the jitted
        # helper never stops early.
        num_polygons = polygon_surfaces.shape[0]
        num_surfaces = np.full((num_polygons,), 9999999, dtype=np.int64)
    # Only the first three vertices of each surface define its plane.
    normal_vec, d = surface_equ_3d_jitv2(polygon_surfaces[:, :, :3, :])
    # normal_vec: [num_polygon, max_num_surfaces, 3]
    # d: [num_polygon, max_num_surfaces]
    return _points_count_convex_polygon_3d_jit(
        points, polygon_surfaces, normal_vec, d, num_surfaces
    )
@numba.jit(nopython=False)
def surface_equ_3d_jit(polygon_surfaces):
    """Compute plane equations ``ax + by + cz + d = 0`` for polygon surfaces.

    Args:
        polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3]
            array; only the first three points of each surface are used.
    Returns:
        tuple ``(normal_vec, d)`` where ``normal_vec`` is
        [num_polygon, num_surfaces, 3] (the ``[a, b, c]`` coefficients) and
        ``d`` is [num_polygon, num_surfaces].
    """
    # Two consecutive edge vectors per surface: p0 - p1 and p1 - p2.
    surface_v = polygon_surfaces[:, :, :2, :] - polygon_surfaces[:, :, 1:3, :]
    # normal_v: [..., 3]
    normal_v = np.cross(surface_v[:, :, 0, :], surface_v[:, :, 1, :])
    # Dot product of each normal with the first point of its surface.
    d = np.einsum("aij, aij->ai", normal_v, polygon_surfaces[:, :, 0, :])
    # The original returned the undefined name `normal_vec` (NameError).
    return normal_v, -d
def surface_equ_3d(polygon_surfaces):
    """Return plane coefficients ``([a, b, c], d)`` of ``ax + by + cz + d = 0``.

    Args:
        polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3]
            array; the first three points of every surface define its plane.
    Returns:
        tuple of the normals, [num_polygon, num_surfaces, 3], and the offsets
        ``d``, [num_polygon, num_surfaces].
    """
    first_pt = polygon_surfaces[:, :, 0, :]
    second_pt = polygon_surfaces[:, :, 1, :]
    third_pt = polygon_surfaces[:, :, 2, :]
    # The normal is the cross product of two consecutive edge vectors.
    normals = np.cross(first_pt - second_pt, second_pt - third_pt)
    # d follows from inserting the first point into the plane equation.
    offsets = -np.einsum("aij, aij->ai", normals, first_pt)
    return normals, offsets
@numba.jit
def points_in_convex_polygon_jit(points, polygon, clockwise=True):
    """check points is in 2d convex polygons. True when point in polygon
    Args:
        points: [num_points, 2] array.
        polygon: [num_polygon, num_points_of_polygon, 2] array.
        clockwise: bool. indicate polygon is clockwise.
    Returns:
        [num_points, num_polygon] bool array.
    """
    num_points_of_polygon = polygon.shape[1]
    num_points = points.shape[0]
    num_polygons = polygon.shape[0]
    ret = np.zeros((num_points, num_polygons), dtype=np.bool_)
    success = True
    cross = 0.0
    for i in range(num_points):
        for j in range(num_polygons):
            success = True
            for k in range(num_points_of_polygon):
                # Each vertex is paired with the previous one (wrapping
                # around) to form a directed edge. Computing the edge with
                # scalar arithmetic avoids indexing the array with a Python
                # list, which numba's nopython mode cannot compile.
                prev = k - 1 if k > 0 else num_points_of_polygon - 1
                if clockwise:
                    edge_x = polygon[j, k, 0] - polygon[j, prev, 0]
                    edge_y = polygon[j, k, 1] - polygon[j, prev, 1]
                else:
                    edge_x = polygon[j, prev, 0] - polygon[j, k, 0]
                    edge_y = polygon[j, prev, 1] - polygon[j, k, 1]
                cross = edge_y * (polygon[j, k, 0] - points[i, 0])
                cross -= edge_x * (polygon[j, k, 1] - points[i, 1])
                # A non-negative value means the point is on or outside this
                # edge, so it cannot be strictly inside the polygon.
                if cross >= 0:
                    success = False
                    break
            ret[i, j] = success
    return ret
@numba.njit
def _points_in_convex_polygon_3d_jit_v2(points, surfaces):
    """check points is in 3d convex polygons.

    The original body referenced ``polygon_surfaces``, ``num_surfaces``,
    ``normal_vec`` and ``d``, none of which were defined in this function.
    The plane equations are now derived from ``surfaces`` and every surface
    slot is tested.

    Args:
        points: [num_points, 3] array.
        surfaces: [num_polygon, max_num_surfaces,
            max_num_points_of_surface, 3]
            array. all surfaces' normal vector must direct to internal.
            max_num_points_of_surface must at least 3.
    Returns:
        [num_points, num_polygon] bool array.
    """
    num_points = points.shape[0]
    num_polygons = surfaces.shape[0]
    max_num_surfaces = surfaces.shape[1]
    # normal_vec: [num_polygon, max_num_surfaces, 3]
    # d: [num_polygon, max_num_surfaces]
    normal_vec, d = surface_equ_3d_jitv2(surfaces)
    ret = np.ones((num_points, num_polygons), dtype=np.bool_)
    sign = 0.0
    for i in range(num_points):
        for j in range(num_polygons):
            for k in range(max_num_surfaces):
                sign = (
                    points[i, 0] * normal_vec[j, k, 0]
                    + points[i, 1] * normal_vec[j, k, 1]
                    + points[i, 2] * normal_vec[j, k, 2]
                    + d[j, k]
                )
                # On or beyond any single surface means outside the polygon.
                if sign >= 0:
                    ret[i, j] = False
                    break
    return ret
@numba.njit
def points_in_convex_polygon_3d_jit_v2(points, surfaces, num_surfaces=None):
    """Test which points fall inside which 3d convex polygons.

    Args:
        points: [num_points, 3] array.
        surfaces: [num_polygon, max_num_surfaces,
            max_num_points_of_surface, 3]
            array. all surfaces' normal vector must direct to internal.
            max_num_points_of_surface must at least 3.
        num_surfaces: accepted but not read by this implementation.
    Returns:
        [num_points, num_polygon] bool array.
    """
    poly_count = surfaces.shape[0]
    face_count = surfaces.shape[1]
    point_count = points.shape[0]
    inside = np.ones((point_count, poly_count), dtype=np.bool_)
    normals = np.zeros((poly_count, face_count, 3), dtype=surfaces.dtype)
    offsets = np.zeros((poly_count, face_count), dtype=surfaces.dtype)
    # Scratch edge vectors, reused for every surface.
    edge_a = surfaces[0, 0, 0] - surfaces[0, 0, 1]
    edge_b = surfaces[0, 0, 0] - surfaces[0, 0, 1]

    # Pass 1: plane equation (normal, offset) of every surface.
    for p in range(poly_count):
        for f in range(face_count):
            edge_a[0] = surfaces[p, f, 0, 0] - surfaces[p, f, 1, 0]
            edge_a[1] = surfaces[p, f, 0, 1] - surfaces[p, f, 1, 1]
            edge_a[2] = surfaces[p, f, 0, 2] - surfaces[p, f, 1, 2]
            edge_b[0] = surfaces[p, f, 1, 0] - surfaces[p, f, 2, 0]
            edge_b[1] = surfaces[p, f, 1, 1] - surfaces[p, f, 2, 1]
            edge_b[2] = surfaces[p, f, 1, 2] - surfaces[p, f, 2, 2]
            # Cross product of the two edges.
            normals[p, f, 0] = edge_a[1] * edge_b[2] - edge_a[2] * edge_b[1]
            normals[p, f, 1] = edge_a[2] * edge_b[0] - edge_a[0] * edge_b[2]
            normals[p, f, 2] = edge_a[0] * edge_b[1] - edge_a[1] * edge_b[0]

            offsets[p, f] = (
                -surfaces[p, f, 0, 0] * normals[p, f, 0]
                - surfaces[p, f, 0, 1] * normals[p, f, 1]
                - surfaces[p, f, 0, 2] * normals[p, f, 2]
            )

    # Pass 2: a point is outside as soon as one plane value is non-negative.
    side = 0.0
    for n in range(point_count):
        for p in range(poly_count):
            for f in range(face_count):
                side = (
                    points[n, 0] * normals[p, f, 0]
                    + points[n, 1] * normals[p, f, 1]
                    + points[n, 2] * normals[p, f, 2]
                    + offsets[p, f]
                )
                if side >= 0:
                    inside[n, p] = False
                    break
    return inside
class VoxelGenerator:
    """Hold voxelization settings and forward point clouds to ``points_to_voxel``."""

    def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels=20000):
        """Store the settings as float32 arrays and derive the grid size.

        Args:
            voxel_size: per-axis voxel edge lengths.
            point_cloud_range: lower bounds followed by upper bounds,
                e.g. [0, -40, -3, 70.4, 40, 1].
            max_num_points: value exposed as ``max_num_points_per_voxel`` and
                forwarded to ``points_to_voxel``.
            max_voxels: default voxel limit used by ``generate``.
        """
        pc_range = np.array(point_cloud_range, dtype=np.float32)
        size = np.array(voxel_size, dtype=np.float32)
        # Number of voxels per axis: range extent divided by voxel size,
        # rounded to the nearest integer.
        extent = pc_range[3:] - pc_range[:3]
        cells = np.round(extent / size).astype(np.int64)

        self._voxel_size = size
        self._point_cloud_range = pc_range
        self._max_num_points = max_num_points
        self._max_voxels = max_voxels
        self._grid_size = cells

    def generate(self, points, max_voxels=-1):
        """Voxelize ``points``; ``max_voxels=-1`` selects the constructor default."""
        limit = self._max_voxels if max_voxels == -1 else max_voxels
        return points_to_voxel(
            points,
            self._voxel_size,
            self._point_cloud_range,
            self._max_num_points,
            True,
            limit,
        )

    @property
    def voxel_size(self):
        return self._voxel_size

    @property
    def max_num_points_per_voxel(self):
        return self._max_num_points

    @property
    def point_cloud_range(self):
        return self._point_cloud_range

    @property
    def grid_size(self):
        return self._grid_size
a/cv/3d_detection/centerpoint/pytorch/det3d/core/sampler/__init__.py b/cv/3d_detection/centerpoint/pytorch/det3d/core/sampler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7c3af1e6d976755c1772218a59ebb9f6063c4617 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/core/sampler/__init__.py @@ -0,0 +1,2 @@ +from . import preprocess +from . import sample_ops diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/core/sampler/preprocess.py b/cv/3d_detection/centerpoint/pytorch/det3d/core/sampler/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..071c3cdb2645831f67c63443d8d10c08f1c263c8 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/core/sampler/preprocess.py @@ -0,0 +1,976 @@ +import abc +import sys +import time +from collections import OrderedDict +from functools import reduce + +import numba +import numpy as np + +from det3d.core.bbox import box_np_ops +from det3d.core.bbox.geometry import ( + is_line_segment_intersection_jit, + points_in_convex_polygon_3d_jit, + points_in_convex_polygon_jit, +) +import copy + + +class BatchSampler: + def __init__( + self, sampled_list, name=None, epoch=None, shuffle=True, drop_reminder=False + ): + self._sampled_list = sampled_list + self._indices = np.arange(len(sampled_list)) + if shuffle: + np.random.shuffle(self._indices) + self._idx = 0 + self._example_num = len(sampled_list) + self._name = name + self._shuffle = shuffle + self._epoch = epoch + self._epoch_counter = 0 + self._drop_reminder = drop_reminder + + def _sample(self, num): + if self._idx + num >= self._example_num: + ret = self._indices[self._idx :].copy() + self._reset() + else: + ret = self._indices[self._idx : self._idx + num] + self._idx += num + return ret + + def _reset(self): + # if self._name is not None: + # print("reset", self._name) + if self._shuffle: + np.random.shuffle(self._indices) + self._idx = 0 + + def sample(self, num): + indices = self._sample(num) + return 
class DataBasePreprocessing(abc.ABC):
    """Base class for callables that transform a ``db_infos`` mapping.

    Subclasses implement ``_preprocess``; calling the instance forwards to it.
    """

    def __call__(self, db_infos):
        return self._preprocess(db_infos)

    # Was ``abc.abstractclassmethod``, which is deprecated and turned this
    # instance hook into a classmethod on a class without ABCMeta.
    @abc.abstractmethod
    def _preprocess(self, db_infos):
        """Return the processed ``db_infos``."""


class DBFilterByDifficulty(DataBasePreprocessing):
    """Drop database entries whose ``"difficulty"`` is in a removal list."""

    def __init__(self, removed_difficulties, logger=None):
        self._removed_difficulties = removed_difficulties
        # logger defaults to None, so only log when one was supplied.
        if logger is not None:
            logger.info(f"{removed_difficulties}")

    def _preprocess(self, db_infos):
        """Return a new dict with the filtered list for every key."""
        new_db_infos = {}
        for key, dinfos in db_infos.items():
            new_db_infos[key] = [
                info
                for info in dinfos
                if info["difficulty"] not in self._removed_difficulties
            ]
        return new_db_infos


class DBFilterByMinNumPoint(DataBasePreprocessing):
    """Drop database entries with fewer than a per-class number of points."""

    def __init__(self, min_gt_point_dict, logger=None):
        self._min_gt_point_dict = min_gt_point_dict
        # logger defaults to None, so only log when one was supplied.
        if logger is not None:
            logger.info(f"{min_gt_point_dict}")

    def _preprocess(self, db_infos):
        """Filter ``db_infos`` in place and return the same dict.

        Classes whose minimum is not positive are left untouched.
        """
        for name, min_num in self._min_gt_point_dict.items():
            if min_num > 0:
                db_infos[name] = [
                    info
                    for info in db_infos[name]
                    if info["num_points_in_gt"] >= min_num
                ]
        return db_infos
def filter_gt_low_points(gt_boxes, points, num_gt_points, point_num_threshold=2):
    """Remove ground-truth boxes with too few points, and the points inside them.

    Args:
        gt_boxes: array of boxes, one row per box.
        points: array of points, one row per point.
        num_gt_points: iterable with the point count of each box, in the same
            order as ``gt_boxes``.
        point_num_threshold: boxes with a count less than or equal to this
            value are removed.

    Returns:
        tuple ``(gt_boxes, points)`` with the removed rows dropped.
    """
    # np.bool_ instead of the np.bool alias, which NumPy 1.24 removed.
    points_mask = np.ones([points.shape[0]], dtype=np.bool_)
    gt_boxes_mask = np.ones([gt_boxes.shape[0]], dtype=np.bool_)
    for i, num in enumerate(num_gt_points):
        if num <= point_num_threshold:
            # NOTE(review): points_in_rbbox presumably returns a per-point
            # membership mask for the single box passed in - confirm.
            masks = box_np_ops.points_in_rbbox(points, gt_boxes[i : i + 1])
            masks = masks.reshape([-1])
            points_mask &= np.logical_not(masks)
            gt_boxes_mask[i] = False
    return gt_boxes[gt_boxes_mask], points[points_mask]
@numba.jit(nopython=True)
def _box_single_to_corner_jit(boxes):
    """Convert 2d boxes to their four corner points.

    Columns 0-1 of ``boxes`` are used as the translation, columns 2-3 as the
    size and the last column as the rotation angle; rows are reshaped as
    5-element boxes.

    Returns:
        [num_box, 4, 2] array of corners.
    """
    box_count = boxes.shape[0]
    out = np.zeros((box_count, 4, 2), dtype=boxes.dtype)
    rot = np.zeros((2, 2), dtype=boxes.dtype)

    # Unit square corners (0,0), (0,1), (1,1), (1,0), shifted to be centred
    # on the origin.
    unit = np.zeros((4, 2), dtype=boxes.dtype)
    unit[1, 1] = 1.0
    unit[2] = 1.0
    unit[3, 0] = 1.0
    unit -= np.array([0.5, 0.5], dtype=boxes.dtype)

    # Scale the unit corners by each box's size.
    scaled = boxes.reshape(box_count, 1, 5)[:, :, 2:4] * unit.reshape(1, 4, 2)
    for b in range(box_count):
        sin_a = np.sin(boxes[b, -1])
        cos_a = np.cos(boxes[b, -1])
        rot[0, 0] = cos_a
        rot[0, 1] = -sin_a
        rot[1, 0] = sin_a
        rot[1, 1] = cos_a
        # Rotate, then translate to the box position.
        out[b] = scaled[b] @ rot + boxes[b, :2]
    return out
@numba.njit
def noise_per_box_group_v2_(
    boxes, valid_mask, loc_noises, rot_noises, group_nums, global_rot_noises
):
    """Pick, per group of boxes, the first noise trial that causes no collision.

    For every group the trials ``j = 0 .. M-1`` are tried in order. In each
    trial every box of the group is first rotated about the origin by its
    global rotation noise, then rotated about its own centre and shifted by
    its local noise. The first trial whose corners pass the collision test
    against all boxes outside the group is accepted.

    WARNING: this function need boxes to be sorted by group id.

    Args:
        boxes: [N, 5]
        valid_mask: [N]; only the entry of the first box of each group is read.
        loc_noises: [N, M, 3]; modified in place for accepted trials.
        rot_noises: [N, M]; modified in place for accepted trials.
        group_nums: 1-D array with the number of boxes in each group.
        global_rot_noises: indexed as ``[box, trial]``.

    Returns:
        [N] int64 array with the accepted trial index per box, or -1 when no
        trial was accepted (or the group was not valid).
    """
    num_boxes = boxes.shape[0]
    num_tests = loc_noises.shape[1]
    box_corners = box_np_ops.box2d_to_corner_jit(boxes)
    max_group_num = group_nums.max()
    # Scratch buffers, sized for the largest group and reused for every trial.
    current_box = np.zeros((1, 5), dtype=boxes.dtype)
    current_corners = np.zeros((max_group_num, 4, 2), dtype=boxes.dtype)
    dst_pos = np.zeros((max_group_num, 2), dtype=boxes.dtype)

    current_grot = np.zeros((max_group_num,), dtype=boxes.dtype)
    dst_grot = np.zeros((max_group_num,), dtype=boxes.dtype)

    rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype)
    success_mask = -np.ones((num_boxes,), dtype=np.int64)
    # Unit-square corners centred on the origin.
    corners_norm = np.zeros((4, 2), dtype=boxes.dtype)
    corners_norm[1, 1] = 1.0
    corners_norm[2] = 1.0
    corners_norm[3, 0] = 1.0
    corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype)
    corners_norm = corners_norm.reshape(4, 2)

    # idx is the row of the first box of the current group.
    idx = 0
    for num in group_nums:
        if valid_mask[idx]:
            for j in range(num_tests):
                for i in range(num):
                    current_box[0, :] = boxes[i + idx]
                    # Polar coordinates of the box centre about the origin;
                    # the angle is taken as arctan2(x, y).
                    current_radius = np.sqrt(
                        current_box[0, 0] ** 2 + current_box[0, 1] ** 2
                    )
                    current_grot[i] = np.arctan2(current_box[0, 0], current_box[0, 1])
                    dst_grot[i] = current_grot[i] + global_rot_noises[idx + i, j]
                    dst_pos[i, 0] = current_radius * np.sin(dst_grot[i])
                    dst_pos[i, 1] = current_radius * np.cos(dst_grot[i])
                    # Move the box to its globally rotated position and turn
                    # its heading by the same angle.
                    current_box[0, :2] = dst_pos[i]
                    current_box[0, -1] += dst_grot[i] - current_grot[i]

                    rot_sin = np.sin(current_box[0, -1])
                    rot_cos = np.cos(current_box[0, -1])
                    rot_mat_T[0, 0] = rot_cos
                    rot_mat_T[0, 1] = -rot_sin
                    rot_mat_T[1, 0] = rot_sin
                    rot_mat_T[1, 1] = rot_cos
                    # Corners of the moved box, then shifted back to be
                    # relative to the box centre for the local rotation.
                    current_corners[i] = (
                        current_box[0, 2:4] * corners_norm @ rot_mat_T
                        + current_box[0, :2]
                    )
                    current_corners[i] -= current_box[0, :2]

                    # Local rotation (overwrites rot_mat_T), then translate
                    # back plus the local location noise.
                    _rotation_box2d_jit_(
                        current_corners[i], rot_noises[idx + i, j], rot_mat_T
                    )
                    current_corners[i] += (
                        current_box[0, :2] + loc_noises[i + idx, j, :2]
                    )
                # NOTE(review): box_collision_test is not defined in the
                # visible part of this file; presumably it returns a
                # [num, N] bool collision matrix - confirm.
                coll_mat = box_collision_test(
                    current_corners[:num].reshape(num, 4, 2), box_corners
                )
                for i in range(num):  # remove self-coll
                    coll_mat[i, idx : idx + num] = False
                if not coll_mat.any():
                    for i in range(num):
                        success_mask[i + idx] = j
                        box_corners[i + idx] = current_corners[i]
                        # Fold the global rotation into the stored noises of
                        # the accepted trial.
                        loc_noises[i + idx, j, :2] += dst_pos[i] - boxes[i + idx, :2]
                        rot_noises[i + idx, j] += dst_grot[i] - current_grot[i]
                    break
        idx += num
    return success_mask
current_corners -= current_box[0, :2] + _rotation_box2d_jit_(current_corners, rot_noises[i, j], rot_mat_T) + current_corners += current_box[0, :2] + loc_noises[i, j, :2] + coll_mat = box_collision_test( + current_corners.reshape(1, 4, 2), box_corners + ) + coll_mat[0, i] = False + if not coll_mat.any(): + success_mask[i] = j + box_corners[i] = current_corners + loc_noises[i, j, :2] += dst_pos - boxes[i, :2] + rot_noises[i, j] += dst_grot - current_grot + break + return success_mask + + +@numba.njit +def points_transform_( + points, centers, point_masks, loc_transform, rot_transform, valid_mask +): + num_box = centers.shape[0] + num_points = points.shape[0] + rot_mat_T = np.zeros((num_box, 3, 3), dtype=points.dtype) + for i in range(num_box): + _rotation_matrix_3d_(rot_mat_T[i], rot_transform[i], 2) + for i in range(num_points): + for j in range(num_box): + if valid_mask[j]: + if point_masks[i, j] == 1: + points[i, :3] -= centers[j, :3] + points[i : i + 1, :3] = points[i : i + 1, :3] @ rot_mat_T[j] + points[i, :3] += centers[j, :3] + points[i, :3] += loc_transform[j] + break # only apply first box's transform + + +@numba.njit +def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask): + num_box = boxes.shape[0] + for i in range(num_box): + if valid_mask[i]: + boxes[i, :3] += loc_transform[i] + boxes[i, 6] += rot_transform[i] + + +def _select_transform(transform, indices): + result = np.zeros((transform.shape[0], *transform.shape[2:]), dtype=transform.dtype) + for i in range(transform.shape[0]): + if indices[i] != -1: + result[i] = transform[i, indices[i]] + return result + + +@numba.njit +def group_transform_(loc_noise, rot_noise, locs, rots, group_center, valid_mask): + # loc_noise: [N, M, 3], locs: [N, 3] + # rot_noise: [N, M] + # group_center: [N, 3] + num_try = loc_noise.shape[1] + r = 0.0 + x = 0.0 + y = 0.0 + rot_center = 0.0 + for i in range(loc_noise.shape[0]): + if valid_mask[i]: + x = locs[i, 0] - group_center[i, 0] + y = locs[i, 1] - 
def get_group_center(locs, group_ids):
    """Compute, for every box, the mean location of the group it belongs to.

    Args:
        locs: [N, D] array of box locations.
        group_ids: length-N sequence of integer group ids. Ids ``>= 0`` name a
            group; negative ids mark boxes that belong to no group.

    Returns:
        A tuple ``(group_centers_ret, group_id_num_dict)`` where
        ``group_centers_ret`` is an [N, D] array holding each box's group
        mean, and ``group_id_num_dict`` is an ``OrderedDict`` mapping each
        non-negative group id to its member count, in first-seen order.
    """
    group_sums = np.zeros_like(locs)
    group_centers_ret = np.zeros_like(locs)
    group_slot = {}  # group id -> row of group_sums accumulating that group
    group_id_num_dict = OrderedDict()
    for i, gid in enumerate(group_ids):
        if gid < 0:
            continue
        if gid in group_slot:
            group_sums[group_slot[gid]] += locs[i]
            group_id_num_dict[gid] += 1
        else:
            group_slot[gid] = len(group_slot)
            group_id_num_dict[gid] = 1
            group_sums[group_slot[gid]] = locs[i]
    for i, gid in enumerate(group_ids):
        if gid < 0:
            # Negative ids were never registered above, so looking them up
            # raised KeyError. An ungrouped box is treated as its own centre,
            # which gives it a zero offset from its "group".
            group_centers_ret[i] = locs[i]
            continue
        group_centers_ret[i] = group_sums[group_slot[gid]] / group_id_num_dict[gid]
    return group_centers_ret, group_id_num_dict
+ """ + num_boxes = gt_boxes.shape[0] + if not isinstance(rotation_perturb, (list, tuple, np.ndarray)): + rotation_perturb = [-rotation_perturb, rotation_perturb] + if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)): + global_random_rot_range = [-global_random_rot_range, global_random_rot_range] + enable_grot = ( + np.abs(global_random_rot_range[0] - global_random_rot_range[1]) >= 1e-3 + ) + if not isinstance(center_noise_std, (list, tuple, np.ndarray)): + center_noise_std = [center_noise_std, center_noise_std, center_noise_std] + if valid_mask is None: + valid_mask = np.ones((num_boxes,), dtype=np.bool_) + center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype) + loc_noises = np.random.normal(scale=center_noise_std, size=[num_boxes, num_try, 3]) + # loc_noises = np.random.uniform( + # -center_noise_std, center_noise_std, size=[num_boxes, num_try, 3]) + rot_noises = np.random.uniform( + rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try] + ) + gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1]) + grot_lowers = global_random_rot_range[0] - gt_grots + grot_uppers = global_random_rot_range[1] - gt_grots + global_rot_noises = np.random.uniform( + grot_lowers[..., np.newaxis], + grot_uppers[..., np.newaxis], + size=[num_boxes, num_try], + ) + if group_ids is not None: + if enable_grot: + set_group_noise_same_v2_( + loc_noises, rot_noises, global_rot_noises, group_ids + ) + else: + set_group_noise_same_(loc_noises, rot_noises, group_ids) + group_centers, group_id_num_dict = get_group_center(gt_boxes[:, :3], group_ids) + if enable_grot: + group_transform_v2_( + loc_noises, + rot_noises, + gt_boxes[:, :3], + gt_boxes[:, 6], + group_centers, + global_rot_noises, + valid_mask, + ) + else: + group_transform_( + loc_noises, + rot_noises, + gt_boxes[:, :3], + gt_boxes[:, 6], + group_centers, + valid_mask, + ) + group_nums = np.array(list(group_id_num_dict.values()), dtype=np.int64) + + origin = [0.5, 0.5, 0.5] + gt_box_corners = 
box_np_ops.center_to_corner_box3d( + gt_boxes[:, :3], gt_boxes[:, 3:6], gt_boxes[:, 6], origin=origin, axis=2 + ) + if group_ids is not None: + if not enable_grot: + selected_noise = noise_per_box_group( + gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, + loc_noises, + rot_noises, + group_nums, + ) + else: + selected_noise = noise_per_box_group_v2_( + gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, + loc_noises, + rot_noises, + group_nums, + global_rot_noises, + ) + else: + if not enable_grot: + selected_noise = noise_per_box( + gt_boxes[:, [0, 1, 3, 4, 6]], valid_mask, loc_noises, rot_noises + ) + else: + selected_noise = noise_per_box_v2_( + gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, + loc_noises, + rot_noises, + global_rot_noises, + ) + loc_transforms = _select_transform(loc_noises, selected_noise) + rot_transforms = _select_transform(rot_noises, selected_noise) + surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners) + if points is not None: + point_masks = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) + points_transform_( + points, + gt_boxes[:, :3], + point_masks, + loc_transforms, + rot_transforms, + valid_mask, + ) + + box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask) + + +def noise_per_object_v2_( + gt_boxes, + points=None, + valid_mask=None, + rotation_perturb=np.pi / 4, + center_noise_std=1.0, + global_random_rot_range=np.pi / 4, + num_try=100, +): + """random rotate or remove each groundtrutn independently. + use kitti viewer to test this function points_transform_ + + Args: + gt_boxes: [N, 7], gt box in lidar.points_transform_ + points: [M, 4], point cloud in lidar. 
+ """ + num_boxes = gt_boxes.shape[0] + if not isinstance(rotation_perturb, (list, tuple, np.ndarray)): + rotation_perturb = [-rotation_perturb, rotation_perturb] + if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)): + global_random_rot_range = [-global_random_rot_range, global_random_rot_range] + + if not isinstance(center_noise_std, (list, tuple, np.ndarray)): + center_noise_std = [center_noise_std, center_noise_std, center_noise_std] + if valid_mask is None: + valid_mask = np.ones((num_boxes,), dtype=np.bool_) + center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype) + loc_noises = np.random.normal(scale=center_noise_std, size=[num_boxes, num_try, 3]) + # loc_noises = np.random.uniform( + # -center_noise_std, center_noise_std, size=[num_boxes, num_try, 3]) + rot_noises = np.random.uniform( + rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try] + ) + gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1]) + grot_lowers = global_random_rot_range[0] - gt_grots + grot_uppers = global_random_rot_range[1] - gt_grots + global_rot_noises = np.random.uniform( + grot_lowers[..., np.newaxis], + grot_uppers[..., np.newaxis], + size=[num_boxes, num_try], + ) + + origin = [0.5, 0.5, 0] + gt_box_corners = box_np_ops.center_to_corner_box3d( + gt_boxes[:, :3], gt_boxes[:, 3:6], gt_boxes[:, 6], origin=origin, axis=2 + ) + if np.abs(global_random_rot_range[0] - global_random_rot_range[1]) < 1e-3: + selected_noise = noise_per_box( + gt_boxes[:, [0, 1, 3, 4, 6]], valid_mask, loc_noises, rot_noises + ) + else: + selected_noise = noise_per_box_v2_( + gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, + loc_noises, + rot_noises, + global_rot_noises, + ) + loc_transforms = _select_transform(loc_noises, selected_noise) + rot_transforms = _select_transform(rot_noises, selected_noise) + if points is not None: + surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners) + point_masks = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) + 
def global_scaling(gt_boxes, points, scale=0.05):
    """Scale boxes and points in place by one random factor around 1.

    Args:
        gt_boxes: [N, >=6] array; columns 0..5 are multiplied by the factor.
        points: [M, >=3] array; columns 0..2 are multiplied by the factor.
        scale: either a scalar ``s`` (factor drawn from ``[1 - s, 1 + s]``) or
            a two-element list/tuple/ndarray ``(lo, hi)`` (factor drawn from
            ``[1 + lo, 1 + hi]``).

    Returns:
        The same ``gt_boxes`` and ``points`` objects, modified in place.
    """
    # Previously only ``list`` was recognised as a range: a tuple raised
    # TypeError on negation and an ndarray was negated element-wise. Accept
    # the same sequence types as the other augmentation helpers in this file.
    if isinstance(scale, (list, tuple, np.ndarray)):
        low, high = scale[0], scale[1]
    else:
        low, high = -scale, scale
    noise_scale = np.random.uniform(low + 1, high + 1)
    points[:, :3] *= noise_scale
    gt_boxes[:, :6] *= noise_scale
    return gt_boxes, points
def global_scaling_v2(gt_boxes, points, min_scale=0.95, max_scale=1.05):
    """Scale boxes and points in place by a factor drawn from a uniform range.

    The factor is sampled once from ``[min_scale, max_scale)`` and applied to
    the first three columns of ``points`` and to every column of ``gt_boxes``
    except the last one.

    Returns:
        The same ``gt_boxes`` and ``points`` objects, modified in place.
    """
    factor = np.random.uniform(min_scale, max_scale)
    point_xyz = points[:, :3]
    point_xyz *= factor
    box_fields = gt_boxes[:, :-1]  # everything but the trailing column
    box_fields *= factor
    return gt_boxes, points
def global_translate_(gt_boxes, points, noise_translate_std):
    """Apply one random global translation to gt_boxes and points, in place.

    Args:
        gt_boxes: [N, >=3] array; columns 0..2 are shifted.
        points: [M, >=3] array; columns 0..2 are shifted.
        noise_translate_std: scalar standard deviation used for all three
            axes, or a list/tuple/ndarray with one standard deviation per
            axis (x, y, z).

    Returns:
        The same ``gt_boxes`` and ``points`` objects. They are returned
        untouched when every standard deviation is zero.
    """
    if not isinstance(noise_translate_std, (list, tuple, np.ndarray)):
        noise_translate_std = np.array(
            [noise_translate_std, noise_translate_std, noise_translate_std]
        )
    if all(e == 0 for e in noise_translate_std):
        return gt_boxes, points
    # One independent draw per axis. The z offset previously reused the x
    # standard deviation (index 0) instead of its own (index 2).
    noise_translate = np.array(
        [np.random.normal(0, noise_translate_std[axis], 1) for axis in range(3)]
    ).T  # shape (1, 3): broadcasts over every row

    points[:, :3] += noise_translate
    gt_boxes[:, :3] += noise_translate

    return gt_boxes, points
{} + self._group_name_to_names = [] + self._sample_classes = [] + self._sample_max_nums = [] + self._use_group_sampling = False # slower + if any([len(g) > 1 for g in groups]): + self._use_group_sampling = True + if not self._use_group_sampling: + self._group_db_infos = self.db_infos # just use db_infos + for group_info in groups: + group_names = list(group_info.keys()) + self._sample_classes += group_names + self._sample_max_nums += list(group_info.values()) + else: + for group_info in groups: + group_dict = {} + group_names = list(group_info.keys()) + group_name = ", ".join(group_names) + self._sample_classes += group_names + self._sample_max_nums += list(group_info.values()) + self._group_name_to_names.append((group_name, group_names)) + # self._group_name_to_names[group_name] = group_names + for name in group_names: + for item in db_infos[name]: + gid = item["group_id"] + if gid not in group_dict: + group_dict[gid] = [item] + else: + group_dict[gid] += [item] + if group_name in self._group_db_infos: + raise ValueError("group must be unique") + group_data = list(group_dict.values()) + self._group_db_infos[group_name] = group_data + info_dict = {} + if len(group_info) > 1: + for group in group_data: + names = [item["name"] for item in group] + names = sorted(names) + group_name = ", ".join(names) + if group_name in info_dict: + info_dict[group_name] += 1 + else: + info_dict[group_name] = 1 + print(info_dict) + + self._sampler_dict = {} + for k, v in self._group_db_infos.items(): + self._sampler_dict[k] = prep.BatchSampler(v, k) + self._enable_global_rot = False + if global_rot_range is not None: + if not isinstance(global_rot_range, (list, tuple, np.ndarray)): + global_rot_range = [-global_rot_range, global_rot_range] + else: + assert shape_mergeable(global_rot_range, [2]) + if np.abs(global_rot_range[0] - global_rot_range[1]) >= 1e-3: + self._enable_global_rot = True + self._global_rot_range = global_rot_range + + @property + def use_group_sampling(self): + 
def sample_all(
    self,
    root_path,
    gt_boxes,
    gt_names,
    num_point_features,
    random_crop=False,
    gt_group_ids=None,
    calib=None,
    road_planes=None,
):
    """Sample database objects to add to a frame and load their points.

    For every configured class the number of objects to draw is
    ``round(rate * (max_num - count of that class in gt_names))``. Objects are
    drawn through ``sample_group`` or ``sample_class_v2`` and accumulated so
    later classes avoid both the original boxes and the boxes already drawn.

    Args:
        root_path: directory that each sampled info's ``"path"`` is relative to.
        gt_boxes: [N, 7+] boxes already present in the frame.
        gt_names: length-N class names of ``gt_boxes``.
        num_point_features: number of float32 values per stored point.
        random_crop: when true, remove points falling in a random frustum
            crop of each sampled object (requires ``calib``).
        gt_group_ids: group ids of ``gt_boxes``; required for group sampling.
        calib: dict with ``"rect"``, ``"Trv2c"`` and ``"P2"``; only read when
            ``random_crop`` is set.
        road_planes: accepted for interface compatibility; not used here.

    Returns:
        ``None`` when nothing was sampled, otherwise a dict with the keys
        ``gt_names``, ``difficulty``, ``gt_boxes``, ``points``, ``gt_masks``
        and ``group_ids``.
    """
    sampled_num_dict = {}
    sample_num_per_class = []
    for class_name, max_sample_num in zip(
        self._sample_classes, self._sample_max_nums
    ):
        sampled_num = int(
            max_sample_num - np.sum([n == class_name for n in gt_names])
        )

        sampled_num = np.round(self._rate * sampled_num).astype(np.int64)
        sampled_num_dict[class_name] = sampled_num
        sample_num_per_class.append(sampled_num)

    sampled_groups = self._sample_classes
    if self._use_group_sampling:
        assert gt_group_ids is not None
        sampled_groups = []
        sample_num_per_class = []
        # A group is sampled as often as its most under-represented class.
        for group_name, class_names in self._group_name_to_names:
            sampled_nums_group = [sampled_num_dict[n] for n in class_names]
            sampled_num = np.max(sampled_nums_group)
            sample_num_per_class.append(sampled_num)
            sampled_groups.append(group_name)
        total_group_ids = gt_group_ids
    sampled = []
    sampled_gt_boxes = []
    avoid_coll_boxes = gt_boxes

    for class_name, sampled_num in zip(sampled_groups, sample_num_per_class):
        if sampled_num > 0:
            if self._use_group_sampling:
                sampled_cls = self.sample_group(
                    class_name, sampled_num, avoid_coll_boxes, total_group_ids
                )
            else:
                sampled_cls = self.sample_class_v2(
                    class_name, sampled_num, avoid_coll_boxes
                )

            sampled += sampled_cls
            if len(sampled_cls) > 0:
                if len(sampled_cls) == 1:
                    sampled_gt_box = sampled_cls[0]["box3d_lidar"][np.newaxis, ...]
                else:
                    sampled_gt_box = np.stack(
                        [s["box3d_lidar"] for s in sampled_cls], axis=0
                    )

                sampled_gt_boxes += [sampled_gt_box]
                # Boxes accepted so far become obstacles for later classes.
                avoid_coll_boxes = np.concatenate(
                    [avoid_coll_boxes, sampled_gt_box], axis=0
                )
                if self._use_group_sampling:
                    if len(sampled_cls) == 1:
                        sampled_group_ids = np.array(sampled_cls[0]["group_id"])[
                            np.newaxis, ...
                        ]
                    else:
                        sampled_group_ids = np.stack(
                            [s["group_id"] for s in sampled_cls], axis=0
                        )
                    total_group_ids = np.concatenate(
                        [total_group_ids, sampled_group_ids], axis=0
                    )

    if len(sampled) > 0:
        sampled_gt_boxes = np.concatenate(sampled_gt_boxes, axis=0)

        num_sampled = len(sampled)
        s_points_list = []
        for info in sampled:
            try:
                s_points = np.fromfile(
                    str(pathlib.Path(root_path) / info["path"]), dtype=np.float32
                ).reshape(-1, num_point_features)

                if "rot_transform" in info:
                    rot = info["rot_transform"]
                    # Rotate xyz only. The call used to receive the first four
                    # columns while its result is stored into three; other
                    # callers in this code base pass ``[:, :3]``.
                    s_points[:, :3] = box_np_ops.rotation_points_single_angle(
                        s_points[:, :3], rot, axis=2
                    )
                s_points[:, :3] += info["box3d_lidar"][:3]
                s_points_list.append(s_points)
            except Exception:
                # Best effort: report the offending file and go on.
                # NOTE(review): a skipped object leaves s_points_list shorter
                # than ``sampled`` - confirm downstream tolerates that.
                print(str(pathlib.Path(root_path) / info["path"]))
                continue
        if random_crop:
            s_points_list_new = []
            assert calib is not None
            rect = calib["rect"]
            Trv2c = calib["Trv2c"]
            P2 = calib["P2"]
            gt_bboxes = box_np_ops.box3d_to_bbox(sampled_gt_boxes, rect, Trv2c, P2)
            crop_frustums = prep.random_crop_frustum(gt_bboxes, rect, Trv2c, P2)
            for i in range(crop_frustums.shape[0]):
                s_points = s_points_list[i]
                mask = prep.mask_points_in_corners(
                    s_points, crop_frustums[i : i + 1]
                ).reshape(-1)
                num_remove = np.sum(mask)
                # Only crop when more than 15 points would remain.
                if num_remove > 0 and (s_points.shape[0] - num_remove) > 15:
                    s_points = s_points[np.logical_not(mask)]
                s_points_list_new.append(s_points)
            s_points_list = s_points_list_new
        ret = {
            "gt_names": np.array([s["name"] for s in sampled]),
            "difficulty": np.array([s["difficulty"] for s in sampled]),
            "gt_boxes": sampled_gt_boxes,
            "points": np.concatenate(s_points_list, axis=0),
            "gt_masks": np.ones((num_sampled,), dtype=np.bool_),
        }
        if self._use_group_sampling:
            ret["group_ids"] = np.array([s["group_id"] for s in sampled])
        else:
            # Without groups each sampled object gets a fresh id that
            # continues after the existing boxes.
            ret["group_ids"] = np.arange(
                gt_boxes.shape[0], gt_boxes.shape[0] + len(sampled)
            )
    else:
        ret = None
    return ret
def sample_v1(self, name, num):
    """Draw ``num`` entries from the sampler registered for ``name``.

    Args:
        name: a single sampler key, or a list/tuple of class names that is
            joined with ``", "`` to form a group sampler key.
        num: number of entries requested from the sampler.

    Returns:
        ``(items, counts)``. For a single key, ``items`` is whatever the
        sampler returned and ``counts`` is an int64 array of ones, one per
        item. For a list/tuple, the sampler's groups are concatenated into
        ``items`` and ``counts`` lists the length of each group.
    """
    if not isinstance(name, (list, tuple)):
        drawn = self._sampler_dict[name].sample(num)
        return drawn, np.ones((len(drawn),), dtype=np.int64)

    key = ", ".join(name)
    groups = self._sampler_dict[key].sample(num)
    sizes = [len(group) for group in groups]
    flattened = reduce(lambda acc, group: acc + group, groups)
    return flattened, sizes
+ prep.noise_per_object_v3_( + boxes, None, valid_mask, 0, 0, self._global_rot_range, num_try=100 + ) + + sp_boxes_new = boxes[gt_boxes.shape[0] :] + sp_boxes_bv = box_np_ops.center_to_corner_box2d( + sp_boxes_new[:, 0:2], sp_boxes_new[:, 3:5], sp_boxes_new[:, -1] + ) + + total_bv = np.concatenate([gt_boxes_bv, sp_boxes_bv], axis=0) + # coll_mat = collision_test_allbox(total_bv) + coll_mat = prep.box_collision_test(total_bv, total_bv) + diag = np.arange(total_bv.shape[0]) + coll_mat[diag, diag] = False + + valid_samples = [] + for i in range(num_gt, num_gt + num_sampled): + if coll_mat[i].any(): + coll_mat[i] = False + coll_mat[:, i] = False + else: + if self._enable_global_rot: + sampled[i - num_gt]["box3d_lidar"][:2] = boxes[i, :2] + sampled[i - num_gt]["box3d_lidar"][-1] = boxes[i, -1] + sampled[i - num_gt]["rot_transform"] = ( + boxes[i, -1] - sp_boxes[i - num_gt, -1] + ) + valid_samples.append(sampled[i - num_gt]) + return valid_samples + + def sample_group(self, name, num, gt_boxes, gt_group_ids): + sampled, group_num = self.sample(name, num) + sampled = copy.deepcopy(sampled) + # rewrite sampled group id to avoid duplicated with gt group ids + gid_map = {} + max_gt_gid = np.max(gt_group_ids) + sampled_gid = max_gt_gid + 1 + for s in sampled: + gid = s["group_id"] + if gid in gid_map: + s["group_id"] = gid_map[gid] + else: + gid_map[gid] = sampled_gid + s["group_id"] = sampled_gid + sampled_gid += 1 + + num_gt = gt_boxes.shape[0] + gt_boxes_bv = box_np_ops.center_to_corner_box2d( + gt_boxes[:, 0:2], gt_boxes[:, 3:5], gt_boxes[:, -1] + ) + + sp_boxes = np.stack([i["box3d_lidar"] for i in sampled], axis=0) + sp_group_ids = np.stack([i["group_id"] for i in sampled], axis=0) + valid_mask = np.zeros([gt_boxes.shape[0]], dtype=np.bool_) + valid_mask = np.concatenate( + [valid_mask, np.ones([sp_boxes.shape[0]], dtype=np.bool_)], axis=0 + ) + boxes = np.concatenate([gt_boxes, sp_boxes], axis=0).copy() + group_ids = np.concatenate([gt_group_ids, sp_group_ids], axis=0) 
+ if self._enable_global_rot: + # place samples to any place in a circle. + prep.noise_per_object_v3_( + boxes, + None, + valid_mask, + 0, + 0, + self._global_rot_range, + group_ids=group_ids, + num_try=100, + ) + sp_boxes_new = boxes[gt_boxes.shape[0] :] + sp_boxes_bv = box_np_ops.center_to_corner_box2d( + sp_boxes_new[:, 0:2], sp_boxes_new[:, 3:5], sp_boxes_new[:, -1] + ) + total_bv = np.concatenate([gt_boxes_bv, sp_boxes_bv], axis=0) + # coll_mat = collision_test_allbox(total_bv) + coll_mat = prep.box_collision_test(total_bv, total_bv) + diag = np.arange(total_bv.shape[0]) + coll_mat[diag, diag] = False + valid_samples = [] + idx = num_gt + for num in group_num: + if coll_mat[idx : idx + num].any(): + coll_mat[idx : idx + num] = False + coll_mat[:, idx : idx + num] = False + else: + for i in range(num): + if self._enable_global_rot: + sampled[idx - num_gt + i]["box3d_lidar"][:2] = boxes[ + idx + i, :2 + ] + sampled[idx - num_gt + i]["box3d_lidar"][-1] = boxes[ + idx + i, -1 + ] + sampled[idx - num_gt + i]["rot_transform"] = ( + boxes[idx + i, -1] - sp_boxes[idx + i - num_gt, -1] + ) + + valid_samples.append(sampled[idx - num_gt + i]) + idx += num + return valid_samples diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/core/utils/__init__.py b/cv/3d_detection/centerpoint/pytorch/det3d/core/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..154357c907dd0d758925c8d99c208b6a6777470f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/core/utils/__init__.py @@ -0,0 +1,4 @@ +from .dist_utils import * +from .misc import * +from .center_utils import * +from .circle_nms_jit import * \ No newline at end of file diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/core/utils/center_utils.py b/cv/3d_detection/centerpoint/pytorch/det3d/core/utils/center_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8edc42111857ba4dd3114fe9af3c1e689a54802a --- /dev/null +++ 
def gaussian_radius(det_size, min_overlap=0.5):
    """Return the smallest of three candidate Gaussian radii for a box.

    Args:
        det_size: ``(height, width)`` of the box.
        min_overlap: overlap threshold used to build the three quadratics.

    Returns:
        ``min(r1, r2, r3)`` where each ``r`` is ``(b + sqrt(b^2 - 4ac)) / 2``
        for one set of coefficients. The expression is evaluated exactly as
        written here: the sum is halved, not divided by ``2a``.
    """

    def half_sum_root(a, b, c):
        return (b + np.sqrt(b ** 2 - 4 * a * c)) / 2

    height, width = det_size
    coefficient_sets = (
        (
            1,
            (height + width),
            width * height * (1 - min_overlap) / (1 + min_overlap),
        ),
        (
            4,
            2 * (height + width),
            (1 - min_overlap) * width * height,
        ),
        (
            4 * min_overlap,
            -2 * min_overlap * (height + width),
            (min_overlap - 1) * width * height,
        ),
    )
    return min(half_sum_root(a, b, c) for a, b, c in coefficient_sets)
def bilinear_interpolate_torch(im, x, y):
    """Bilinearly sample ``im`` at fractional coordinates.

    Args:
        im: (H, W, C) tensor indexed as ``im[y, x]``.
        x: (N) x coordinates.
        y: (N) y coordinates.

    Returns:
        (N, C) tensor of interpolated values. The four neighbour indices are
        clamped to the image bounds *before* the weights are computed, so the
        weights use the clamped indices.
    """
    last_col = im.shape[1] - 1
    last_row = im.shape[0] - 1

    col_floor = torch.floor(x).long()
    row_floor = torch.floor(y).long()
    left = torch.clamp(col_floor, 0, last_col)
    right = torch.clamp(col_floor + 1, 0, last_col)
    top = torch.clamp(row_floor, 0, last_row)
    bottom = torch.clamp(row_floor + 1, 0, last_row)

    # Distance of each sample to the (clamped) neighbouring grid lines.
    to_right = right.type_as(x) - x
    from_left = x - left.type_as(x)
    to_bottom = bottom.type_as(y) - y
    from_top = y - top.type_as(y)

    corners_and_weights = (
        (im[top, left], to_right * to_bottom),
        (im[bottom, left], to_right * from_top),
        (im[top, right], from_left * to_bottom),
        (im[bottom, right], from_left * from_top),
    )
    ans = None
    for values, weight in corners_and_weights:
        term = torch.t(torch.t(values) * weight)
        ans = term if ans is None else ans + term
    return ans
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    """Average ``tensors`` across processes, reducing them in flattened groups.

    Args:
        tensors: iterable of tensors to synchronise; each one is overwritten
            in place with the averaged value.
        world_size: divisor applied after the all-reduce.
        bucket_size_mb: when positive, tensors are grouped into buckets of at
            most this many MiB via ``_take_tensors``; otherwise they are
            grouped by ``tensor.type()`` in first-seen order.
    """
    if bucket_size_mb > 0:
        groups = _take_tensors(tensors, bucket_size_mb * 1024 * 1024)
    else:
        by_type = OrderedDict()
        for item in tensors:
            by_type.setdefault(item.type(), []).append(item)
        groups = by_type.values()

    for group in groups:
        # One collective per group instead of one per tensor.
        flat = _flatten_dense_tensors(group)
        dist.all_reduce(flat)
        flat.div_(world_size)
        for target, averaged in zip(group, _unflatten_dense_tensors(flat, group)):
            target.copy_(averaged)
def unmap(data, count, inds, fill=0):
    """Scatter a subset of items back into a tensor of the original size.

    Args:
        data: values for the selected items; the first axis enumerates them.
        count: number of items in the original (full) set.
        inds: index or boolean mask selecting where ``data`` goes on axis 0.
        fill: value used for every position not selected by ``inds``.

    Returns:
        A new tensor with ``count`` rows, same dtype/device as ``data``.
    """
    if data.dim() == 1:
        full_shape = (count,)
        where = inds
    else:
        full_shape = (count,) + data.size()[1:]
        where = (inds, slice(None))

    restored = data.new_full(full_shape, fill)
    restored[where] = data
    return restored
@torch.jit.script
def scatter_mean(src: torch.Tensor, index: torch.Tensor, dim: int = -1,
                 out: Optional[torch.Tensor] = None,
                 dim_size: Optional[int] = None) -> torch.Tensor:
    """Average the entries of ``src`` that share an index along ``dim``.

    The sum per index is computed with ``scatter_sum`` and then divided, in
    place, by the number of contributions each output slot received.

    Args:
        src: values to reduce.
        index: target slot of every ``src`` entry along ``dim``.
        dim: axis along which to scatter (negative values count from the end).
        out: optional tensor to accumulate into; forwarded to ``scatter_sum``.
        dim_size: optional size of the output along ``dim``; forwarded to
            ``scatter_sum``.

    Returns:
        The tensor holding the per-index means.

    Raises:
        AssertionError: if the summed output is not a floating-point tensor
            (the integer floor-division path is disabled below).
    """

    out = scatter_sum(src, index, dim, out, dim_size)
    # From here on use the size the summed output actually has along ``dim``.
    dim_size = out.size(dim)

    # Axis of ``index`` along which contributions are counted: normalise a
    # negative ``dim`` and fall back to the last axis of ``index`` when it has
    # fewer dimensions than ``dim`` requires.
    index_dim = dim
    if index_dim < 0:
        index_dim = index_dim + src.dim()
    if index.dim() <= index_dim:
        index_dim = index.dim() - 1

    # Count contributions per slot by scatter-summing ones.
    ones = torch.ones(index.size(), dtype=src.dtype, device=src.device)
    count = scatter_sum(ones, index, index_dim, None, dim_size)
    # Slots that received nothing keep a divisor of 1 (avoids division by zero).
    count.clamp_(1)
    count = broadcast(count, out, dim)
    if torch.is_floating_point(out):
        out.div_(count)
    else:
        # Integer inputs are rejected; the floor-division variant is disabled.
        assert 0
        # out.floor_divide_(count)
    return out
# Public API of the det3d.datasets package.
#
# Every name listed here must actually be bound in this module: an entry that
# is never imported (the former "CustomDataset", whose import is commented
# out) makes ``from det3d.datasets import *`` fail with AttributeError.
# The dataset classes that *are* imported above are exported instead.
__all__ = [
    "NuScenesDataset",
    "WaymoDataset",
    "GroupSampler",
    "DistributedGroupSampler",
    "build_dataloader",
    "ConcatDataset",
    "RepeatDataset",
    "DATASETS",
    "build_dataset",
]
@DATASETS.register_module
class PointCloudDataset(Dataset):
    """Abstract base class for point-cloud datasets.

    Subclasses must override ``__len__`` (size of the dataset) and
    ``__getitem__`` (integer indexing in ``range(len(self))``).

    Note that ``__init__`` calls ``_set_group_flag()``, which evaluates
    ``len(self)``; a subclass therefore has to make ``__len__`` usable before
    it invokes this constructor.
    """

    # Class-level defaults; this base class never overwrites them.
    NumPointFeatures = -1
    CLASSES = None

    def __init__(
        self,
        root_path,
        info_path,
        pipeline=None,
        test_mode=False,
        class_names=None,
        **kwrags
    ):
        """Store dataset locations and build the processing pipeline.

        Args:
            root_path: dataset root directory; stored as a ``Path``.
            info_path: location of the info file; stored unchanged.
            pipeline: optional pipeline configuration handed to ``Compose``;
                ``None`` leaves ``self.pipeline`` as ``None``.
            test_mode: stored as ``self.test_mode``.
            class_names: stored as ``self._class_names``.
            **kwrags: accepted for call compatibility and ignored here.
        """
        self._info_path = info_path
        self._root_path = Path(root_path)
        self._class_names = class_names

        self.test_mode = test_mode

        # Needs len(self), i.e. a working subclass __len__.
        self._set_group_flag()

        if pipeline is None:
            self.pipeline = None
        else:
            self.pipeline = Compose(pipeline)

    def __getitem__(self, index):
        """Build the network input dict for one sample (abstract).

        Format documented by the original authors::

            {
                anchors
                voxels
                num_points
                coordinates
                if training:
                    labels
                    reg_targets
                [optional]anchors_mask, slow in SECOND v1.5, don't use this.
                [optional]metadata, in kitti, image index is saved in metadata
            }

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def __len__(self):
        """Return the number of samples (abstract; always raises here)."""
        raise NotImplementedError

    def get_sensor_data(self, query):
        """Return the sensor data for ``query`` (abstract).

        Args:
            query: int or dict. This param must support int for training.
                If dict, it should have this format (no example yet)::

                    {
                        sensor_name: {
                            sensor_meta
                        }
                    }

                If int, all sensor data is returned.
                (TODO: how to deal with unsynchronized data?)

        Returns:
            sensor_data: dict. If query is int (return all), a dict with all
            sensors::

                {
                    sensor_name: sensor_data
                    ...
                    metadata: ... (for kitti, contains image_idx)
                }

            If the sensor is lidar (all lidar point clouds must be
            concatenated into one array), e.g. for a dataset with two lidar
            sensors a single dict is returned::

                {
                    "lidar": {
                        "points": ...
                        ...
                    }
                }
                sensor_data: {
                    points: [N, 3+]
                    [optional]annotations: {
                        "boxes": [N, 7] locs, dims, yaw, in lidar coord system.
                            must be tested in provided visualization tools
                            such as second.utils.simplevis or web tool.
                        "names": array of string.
                    }
                }

            If the sensor is camera (not used yet)::

                sensor_data: {
                    data: image string (array is too large)
                    [optional]annotations: {
                        "boxes": [N, 4] 2d bbox
                        "names": array of string.
                    }
                }
                metadata: {
                    # dataset-specific information.
                    # for kitti, must have image_idx for label file generation.
                    image_idx: ...
                }
                [optional]calib # only used for kitti

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def evaluation(self, dt_annos, output_dir):
        """Evaluate detections ``dt_annos`` (abstract; always raises here)."""
        raise NotImplementedError

    @property
    def ground_truth_annotations(self):
        """Ground-truth annotations in KITTI evaluation format (abstract).

        Format documented by the original authors::

            {
                bbox: [N, 4], if you fill fake data, MUST HAVE >25 HEIGHT!!!!!!
                alpha: [N], you can use -10 to ignore it.
                occluded: [N], you can use zero.
                truncated: [N], you can use zero.
                name: [N]
                location: [N, 3] center of 3d box.
                dimensions: [N, 3] dim of 3d box.
                rotation_y: [N] angle.
            }

        All fields must be filled, but some fields can be filled with zero.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def pre_pipeline(self, results):
        """Seed ``results`` with prefix/proposal info and empty field lists.

        NOTE(review): ``img_prefix``, ``seg_prefix`` and ``proposal_file`` are
        never assigned anywhere in this class, so calling this raises
        AttributeError unless a subclass sets them first.
        """
        results["img_prefix"] = self.img_prefix
        results["seg_prefix"] = self.seg_prefix
        results["proposal_file"] = self.proposal_file
        results["bbox_fields"] = []
        results["mask_fields"] = []

    def _filter_imgs(self, min_size=32):
        """Return indices of images whose shorter side is at least ``min_size``.

        NOTE(review): reads ``self.img_infos``, which this class never
        assigns; a subclass must provide it.
        """
        valid_inds = []
        for i, img_info in enumerate(self.img_infos):
            if min(img_info["width"], img_info["height"]) >= min_size:
                valid_inds.append(i)
        return valid_inds

    def _set_group_flag(self):
        """Assign every sample to group 1.

        ``self.flag`` becomes a uint8 array of ones with one entry per sample.
        The aspect-ratio based grouping that the original docstring described
        is disabled (kept below as commented-out code).
        """
        self.flag = np.ones(len(self), dtype=np.uint8)
        # self.flag = np.zeros(len(self), dtype=np.uint8)
        # for i in range(len(self)):
        #     img_info = self.img_infos[i]
        #     if img_info['width'] / img_info['height'] > 1:
        #         self.flag[i] = 1

    def prepare_train_input(self, idx):
        """Build the training input for sample ``idx`` (abstract)."""
        raise NotImplementedError

        # Unreachable reference implementation kept from the image pipeline:
        # img_info = self.img_infos[idx]
        # ann_info = self.get_ann_info(idx)
        # results = dict(img_info=img_info, ann_info=ann_info)
        # if self.proposals is not None:
        #     results['proposals'] = self.proposals[idx]
        # self.pre_pipeline(results)
        # return self.pipeline(results)

    def prepare_test_input(self, idx):
        """Build the test input for sample ``idx`` (abstract)."""
        raise NotImplementedError

        # Unreachable reference implementation kept from the image pipeline:
        # img_info = self.img_infos[idx]
        # results = dict(img_info=img_info)
        # if self.proposals is not None:
        #     results['proposals'] = self.proposals[idx]
        # self.pre_pipeline(results)
        # return self.pipeline(results)
@DATASETS.register_module
class ConcatDataset(_ConcatDataset):
    """Concatenation of several datasets that also merges their group flags.

    Behaves like :obj:`torch.utils.data.dataset.ConcatDataset` and, when the
    first dataset carries a ``flag`` array, exposes the concatenation of all
    members' ``flag`` arrays as ``self.flag``.

    Args:
        datasets (list[:obj:`Dataset`]): A list of datasets.
    """

    def __init__(self, datasets):
        super(ConcatDataset, self).__init__(datasets)
        head = datasets[0]
        # Class names are taken from the first member only.
        self.CLASSES = head.CLASSES
        if hasattr(head, "flag"):
            self.flag = np.concatenate(
                [datasets[pos].flag for pos in range(len(datasets))]
            )
def build_dataloader(
    dataset, batch_size, workers_per_gpu, num_gpus=1, dist=True, **kwargs
):
    """Create the ``DataLoader`` used for training or evaluation.

    Args:
        dataset: dataset to load from.
        batch_size: samples per GPU.
        workers_per_gpu: loader worker processes per GPU.
        num_gpus: number of GPUs driven by this process; only used when
            ``dist`` is False, where it scales batch size and worker count.
        dist: whether distributed training is active. If so, rank and world
            size come from ``get_dist_info()`` and a distributed sampler is
            used.
        **kwargs: only ``shuffle`` (default True) is read.

    Returns:
        A ``torch.utils.data.DataLoader`` that collates with ``collate_kitti``.
    """
    shuffle = kwargs.get("shuffle", True)
    if dist:
        rank, world_size = get_dist_info()
        if shuffle:
            sampler = DistributedGroupSampler(dataset, batch_size, world_size, rank)
        else:
            sampler = DistributedSampler(dataset, world_size, rank, shuffle=False)
        num_workers = workers_per_gpu
    else:
        # Single-process mode uses no custom sampler (the former GroupSampler
        # assignment was immediately overwritten with None); ordering is left
        # to the DataLoader's own ``shuffle`` flag below.
        sampler = None
        batch_size = num_gpus * batch_size
        num_workers = num_gpus * workers_per_gpu

    # TODO change pin_memory
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        # DataLoader forbids shuffle=True together with a sampler. Without a
        # sampler, honour the caller's request: previously this was always
        # True, so shuffle=False was silently ignored in non-distributed runs.
        shuffle=(sampler is None and bool(shuffle)),
        num_workers=num_workers,
        collate_fn=collate_kitti,
        pin_memory=False,
    )

    return data_loader
+ Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + """ + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + self.shuffle = shuffle + + def __iter__(self): + if self.shuffle: + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + indices += indices[: (self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank : self.total_size : self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch + + +class DistributedSampler(_DistributedSampler): + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + super().__init__(dataset, num_replicas=num_replicas, rank=rank) + self.shuffle = shuffle + + def __iter__(self): + # deterministically shuffle based on epoch + if self.shuffle: + g = torch.Generator() + g.manual_seed(self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = 
class GroupSampler(Sampler):
    """Sampler that yields batches whose samples all share one group flag.

    Each group (value of ``dataset.flag``) is shuffled and padded to a
    multiple of ``samples_per_gpu``; the resulting batches are then emitted in
    random order.

    Args:
        dataset: dataset exposing a ``flag`` array with one group id per item.
        samples_per_gpu: batch size the groups are padded to.
    """

    def __init__(self, dataset, samples_per_gpu=1):
        assert hasattr(dataset, "flag")
        self.dataset = dataset
        self.samples_per_gpu = samples_per_gpu
        self.flag = dataset.flag.astype(np.int64)
        self.group_sizes = np.bincount(self.flag)
        self.num_samples = sum(
            self._padded_size(size) for size in self.group_sizes
        )

    def _padded_size(self, size):
        """Round ``size`` up to the next multiple of ``samples_per_gpu``."""
        return int(np.ceil(size / self.samples_per_gpu)) * self.samples_per_gpu

    def __iter__(self):
        groups = []
        for i, size in enumerate(self.group_sizes):
            if size == 0:
                continue
            indice = np.where(self.flag == i)[0]
            assert len(indice) == size
            np.random.shuffle(indice)
            # np.resize cycles through the shuffled indices until the padded
            # length is reached. Padding with ``indice[:num_extra]`` came up
            # short whenever a group had fewer members than the padding
            # needed (e.g. 1 sample, batch of 4) and tripped the final
            # assertion; for larger groups the result is unchanged.
            groups.append(np.resize(indice, self._padded_size(size)))
        indices = np.concatenate(groups)

        # Shuffle whole batches so every batch stays within a single group.
        num_batches = len(indices) // self.samples_per_gpu
        indices = np.concatenate(
            [
                indices[b * self.samples_per_gpu : (b + 1) * self.samples_per_gpu]
                for b in np.random.permutation(range(num_batches))
            ]
        )
        indices = indices.astype(np.int64).tolist()
        assert len(indices) == self.num_samples
        return iter(indices)

    def __len__(self):
        return self.num_samples
+ Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + """ + + def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None): + _rank, _num_replicas = get_dist_info() + if num_replicas is None: + num_replicas = _num_replicas + if rank is None: + rank = _rank + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + + assert hasattr(self.dataset, "flag") + self.flag = self.dataset.flag + self.group_sizes = np.bincount(self.flag) + + self.num_samples = 0 + for i, j in enumerate(self.group_sizes): + self.num_samples += ( + int( + math.ceil( + self.group_sizes[i] + * 1.0 + / self.samples_per_gpu + / self.num_replicas + ) + ) + * self.samples_per_gpu + ) + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + for i, size in enumerate(self.group_sizes): + if size > 0: + indice = np.where(self.flag == i)[0] + assert len(indice) == size + indice = indice[list(torch.randperm(int(size), generator=g))].tolist() + extra = int( + math.ceil(size * 1.0 / self.samples_per_gpu / self.num_replicas) + ) * self.samples_per_gpu * self.num_replicas - len(indice) + indice += indice[:extra] + indices += indice + + assert len(indices) == self.total_size + + indices = [ + indices[j] + for i in list( + torch.randperm(len(indices) // self.samples_per_gpu, generator=g) + ) + for j in range(i * self.samples_per_gpu, (i + 1) * self.samples_per_gpu) + ] + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset : offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + 
# The nuScenes devkit is treated as optional when this module is imported:
# if it is missing, a notice is printed and the names below stay undefined,
# so helpers that use them fail only when actually called.
# Only ImportError is caught. A bare ``except`` would also swallow
# KeyboardInterrupt/SystemExit and hide unrelated failures raised while the
# devkit is being imported.
try:
    from nuscenes import NuScenes
    from nuscenes.utils import splits
    from nuscenes.utils.data_classes import Box
    from nuscenes.utils.geometry_utils import transform_matrix
    from nuscenes.eval.detection.config import config_factory
    from nuscenes.eval.detection.evaluate import NuScenesEval
except ImportError:
    print("nuScenes devkit not Found!")
"construction_vehicle", + "vehicle.emergency.ambulance": "ignore", + "vehicle.emergency.police": "ignore", + "vehicle.trailer": "trailer", + "movable_object.barrier": "barrier", + "movable_object.trafficcone": "traffic_cone", + "movable_object.pushable_pullable": "ignore", + "movable_object.debris": "ignore", + "static_object.bicycle_rack": "ignore", +} + +cls_attr_dist = { + "barrier": { + "cycle.with_rider": 0, + "cycle.without_rider": 0, + "pedestrian.moving": 0, + "pedestrian.sitting_lying_down": 0, + "pedestrian.standing": 0, + "vehicle.moving": 0, + "vehicle.parked": 0, + "vehicle.stopped": 0, + }, + "bicycle": { + "cycle.with_rider": 2791, + "cycle.without_rider": 8946, + "pedestrian.moving": 0, + "pedestrian.sitting_lying_down": 0, + "pedestrian.standing": 0, + "vehicle.moving": 0, + "vehicle.parked": 0, + "vehicle.stopped": 0, + }, + "bus": { + "cycle.with_rider": 0, + "cycle.without_rider": 0, + "pedestrian.moving": 0, + "pedestrian.sitting_lying_down": 0, + "pedestrian.standing": 0, + "vehicle.moving": 9092, + "vehicle.parked": 3294, + "vehicle.stopped": 3881, + }, + "car": { + "cycle.with_rider": 0, + "cycle.without_rider": 0, + "pedestrian.moving": 0, + "pedestrian.sitting_lying_down": 0, + "pedestrian.standing": 0, + "vehicle.moving": 114304, + "vehicle.parked": 330133, + "vehicle.stopped": 46898, + }, + "construction_vehicle": { + "cycle.with_rider": 0, + "cycle.without_rider": 0, + "pedestrian.moving": 0, + "pedestrian.sitting_lying_down": 0, + "pedestrian.standing": 0, + "vehicle.moving": 882, + "vehicle.parked": 11549, + "vehicle.stopped": 2102, + }, + "ignore": { + "cycle.with_rider": 307, + "cycle.without_rider": 73, + "pedestrian.moving": 0, + "pedestrian.sitting_lying_down": 0, + "pedestrian.standing": 0, + "vehicle.moving": 165, + "vehicle.parked": 400, + "vehicle.stopped": 102, + }, + "motorcycle": { + "cycle.with_rider": 4233, + "cycle.without_rider": 8326, + "pedestrian.moving": 0, + "pedestrian.sitting_lying_down": 0, + 
def _second_det_to_nusc_box(detection):
    """Convert one frame of detector output into nuScenes ``Box`` objects.

    Args:
        detection: dict with tensors ``"box3d_lidar"`` (one row per box),
            ``"scores"`` and ``"label_preds"``.
            Row layout as used below: columns 0:3 centre, 3:6 size,
            6:8 planar velocity, last column rotation about the z axis.
            # assumes rows have at least 9 columns — TODO confirm with caller

    Returns:
        list of ``Box``, one per input row, in input order.
    """
    # ``.numpy()`` on a tensor that already lives on the CPU returns a view
    # sharing its memory. Copy before the in-place yaw rewrite below so the
    # caller's ``detection["box3d_lidar"]`` is not silently modified (and a
    # second call on the same dict does not convert the angle twice).
    box3d = detection["box3d_lidar"].detach().cpu().numpy().copy()
    scores = detection["scores"].detach().cpu().numpy()
    labels = detection["label_preds"].detach().cpu().numpy()

    # Re-express the rotation for the Box constructor.
    # NOTE(review): presumably detector yaw -> nuScenes yaw convention.
    box3d[:, -1] = -box3d[:, -1] - np.pi / 2

    box_list = []
    for i in range(box3d.shape[0]):
        quat = Quaternion(axis=[0, 0, 1], radians=box3d[i, -1])
        # Boxes carry a 3-D velocity; the vertical component is set to zero.
        velocity = (*box3d[i, 6:8], 0.0)
        box = Box(
            box3d[i, :3],
            box3d[i, 3:6],
            quat,
            label=labels[i],
            score=scores[i],
            velocity=velocity,
        )
        box_list.append(box)
    return box_list
def _get_available_scenes(nusc):
    """Return the scenes whose first LIDAR_TOP file exists on disk.

    For every scene only the lidar sample data of its first sample is looked
    up; the scene is kept when that file path exists. Totals before and after
    filtering are printed.

    Args:
        nusc: nuScenes-like handle providing ``scene``, ``get(table, token)``
            and ``get_sample_data(token)``.

    Returns:
        list of scene records, in the order of ``nusc.scene``.
    """
    print("total scene num:", len(nusc.scene))

    present = []
    for scene in nusc.scene:
        scene_rec = nusc.get("scene", scene["token"])
        first_sample = nusc.get("sample", scene_rec["first_sample_token"])
        lidar_rec = nusc.get("sample_data", first_sample["data"]["LIDAR_TOP"])
        # Only the path is needed; boxes and intrinsics are discarded.
        lidar_path, _, _ = nusc.get_sample_data(lidar_rec["token"])
        if Path(lidar_path).exists():
            present.append(scene)

    print("exist scene num:", len(present))
    return present
+ :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get("sample_data", sample_data_token) + cs_record = nusc.get("calibrated_sensor", sd_record["calibrated_sensor_token"]) + sensor_record = nusc.get("sensor", cs_record["sensor_token"]) + pose_record = nusc.get("ego_pose", sd_record["ego_pose_token"]) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record["modality"] == "camera": + cam_intrinsic = np.array(cs_record["camera_intrinsic"]) + else: + cam_intrinsic = None + + # Retrieve all sample annotations and map to sensor coordinate system. + if selected_anntokens is not None: + boxes = list(map(nusc.get_box, selected_anntokens)) + else: + boxes = nusc.get_boxes(sample_data_token) + + # Make list of Box objects including coord system transforms. + box_list = [] + for box in boxes: + box.velocity = nusc.box_velocity(box.token) + # Move box to ego vehicle coord system + box.translate(-np.array(pose_record["translation"])) + box.rotate(Quaternion(pose_record["rotation"]).inverse) + + # Move box to sensor coord system + box.translate(-np.array(cs_record["translation"])) + box.rotate(Quaternion(cs_record["rotation"]).inverse) + + box_list.append(box) + + return data_path, box_list, cam_intrinsic + +CAM_CHANS = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_FRONT_LEFT'] + + +def get_lidar_to_image_transform(nusc, pointsensor, camera_sensor): + tms = [] + intrinsics = [] + cam_paths = [] + for chan in CAM_CHANS: + cam = camera_sensor[chan] + + # Points live in the point sensor frame. So they need to be transformed via global to the image plane. + # First step: transform the point-cloud to the ego vehicle frame for the timestamp of the sweep. 
+ lidar_cs_record = nusc.get('calibrated_sensor', pointsensor['calibrated_sensor_token']) + car_from_lidar = transform_matrix( + lidar_cs_record["translation"], Quaternion(lidar_cs_record["rotation"]), inverse=False + ) + + # Second step: transform to the global frame. + lidar_poserecord = nusc.get('ego_pose', pointsensor['ego_pose_token']) + global_from_car = transform_matrix( + lidar_poserecord["translation"], Quaternion(lidar_poserecord["rotation"]), inverse=False, + ) + + # Third step: transform into the ego vehicle frame for the timestamp of the image. + cam_poserecord = nusc.get('ego_pose', cam['ego_pose_token']) + car_from_global = transform_matrix( + cam_poserecord["translation"], + Quaternion(cam_poserecord["rotation"]), + inverse=True, + ) + + # Fourth step: transform into the camera. + cam_cs_record = nusc.get('calibrated_sensor', cam['calibrated_sensor_token']) + cam_from_car = transform_matrix( + cam_cs_record["translation"], Quaternion(cam_cs_record["rotation"]), inverse=True + ) + + tm = reduce( + np.dot, + [cam_from_car, car_from_global, global_from_car, car_from_lidar], + ) + + cam_path, _, intrinsic = nusc.get_sample_data(cam['token']) + + tms.append(tm) + intrinsics.append(intrinsic) + cam_paths.append(cam_path ) + + return tms, intrinsics, cam_paths + +def find_closet_camera_tokens(nusc, pointsensor, ref_sample): + lidar_timestamp = pointsensor["timestamp"] + + min_cams = {} + + for chan in CAM_CHANS: + camera_token = ref_sample['data'][chan] + + cam = nusc.get('sample_data', camera_token) + min_diff = abs(lidar_timestamp - cam['timestamp']) + min_cam = cam + + for i in range(6): # nusc allows at most 6 previous camera frames + if cam['prev'] == "": + break + + cam = nusc.get('sample_data', cam['prev']) + cam_timestamp = cam['timestamp'] + + diff = abs(lidar_timestamp-cam_timestamp) + + if (diff < min_diff): + min_diff = diff + min_cam = cam + + min_cams[chan] = min_cam + + return min_cams + + +def _fill_trainval_infos(nusc, train_scenes, 
val_scenes, test=False, nsweeps=10, filter_zero=True): + from nuscenes.utils.geometry_utils import transform_matrix + + train_nusc_infos = [] + val_nusc_infos = [] + + ref_chan = "LIDAR_TOP" # The radar channel from which we track back n sweeps to aggregate the point cloud. + chan = "LIDAR_TOP" # The reference channel of the current sample_rec that the point clouds are mapped to. + + for sample in tqdm(nusc.sample): + """ Manual save info["sweeps"] """ + # Get reference pose and timestamp + # ref_chan == "LIDAR_TOP" + ref_sd_token = sample["data"][ref_chan] + ref_sd_rec = nusc.get("sample_data", ref_sd_token) + ref_cs_rec = nusc.get( + "calibrated_sensor", ref_sd_rec["calibrated_sensor_token"] + ) + ref_pose_rec = nusc.get("ego_pose", ref_sd_rec["ego_pose_token"]) + ref_time = 1e-6 * ref_sd_rec["timestamp"] + + ref_lidar_path, ref_boxes, _ = get_sample_data(nusc, ref_sd_token) + + ref_cam_front_token = sample["data"]["CAM_FRONT"] + ref_cam_path, _, ref_cam_intrinsic = nusc.get_sample_data(ref_cam_front_token) + + # Homogeneous transform from ego car frame to reference frame + ref_from_car = transform_matrix( + ref_cs_rec["translation"], Quaternion(ref_cs_rec["rotation"]), inverse=True + ) + + # Homogeneous transformation matrix from global to _current_ ego car frame + car_from_global = transform_matrix( + ref_pose_rec["translation"], + Quaternion(ref_pose_rec["rotation"]), + inverse=True, + ) + + ref_cams = {} + # get all camera sensor data + for cam_chan in CAM_CHANS: + camera_token = sample['data'][cam_chan] + cam = nusc.get('sample_data', camera_token) + + ref_cams[cam_chan] = cam + + # get camera info for point painting + all_cams_from_lidar, all_cams_intrinsic, all_cams_path = get_lidar_to_image_transform(nusc, pointsensor=ref_sd_rec, camera_sensor=ref_cams) + + info = { + "lidar_path": ref_lidar_path, + "cam_front_path": ref_cam_path, + "cam_intrinsic": ref_cam_intrinsic, + "token": sample["token"], + "sweeps": [], + "ref_from_car": ref_from_car, + 
"car_from_global": car_from_global, + "timestamp": ref_time, + "all_cams_from_lidar": all_cams_from_lidar, + "all_cams_intrinsic": all_cams_intrinsic, + "all_cams_path": all_cams_path + } + + sample_data_token = sample["data"][chan] + curr_sd_rec = nusc.get("sample_data", sample_data_token) + sweeps = [] + while len(sweeps) < nsweeps - 1: + if curr_sd_rec["prev"] == "": + if len(sweeps) == 0: + sweep = { + "lidar_path": ref_lidar_path, + "sample_data_token": curr_sd_rec["token"], + "transform_matrix": None, + "time_lag": curr_sd_rec["timestamp"] * 0, + "all_cams_from_lidar": all_cams_from_lidar, + "all_cams_intrinsic": all_cams_intrinsic, + "all_cams_path": all_cams_path + } + sweeps.append(sweep) + else: + sweeps.append(sweeps[-1]) + else: + curr_sd_rec = nusc.get("sample_data", curr_sd_rec["prev"]) + + # get nearest camera frame data + cam_data = find_closet_camera_tokens(nusc, curr_sd_rec, ref_sample=sample) + cur_cams_from_lidar, cur_cams_intrinsic, cur_cams_path = get_lidar_to_image_transform(nusc, pointsensor=curr_sd_rec, camera_sensor=cam_data) + + # Get past pose + current_pose_rec = nusc.get("ego_pose", curr_sd_rec["ego_pose_token"]) + global_from_car = transform_matrix( + current_pose_rec["translation"], + Quaternion(current_pose_rec["rotation"]), + inverse=False, + ) + + # Homogeneous transformation matrix from sensor coordinate frame to ego car frame. 
+ current_cs_rec = nusc.get( + "calibrated_sensor", curr_sd_rec["calibrated_sensor_token"] + ) + car_from_current = transform_matrix( + current_cs_rec["translation"], + Quaternion(current_cs_rec["rotation"]), + inverse=False, + ) + + tm = reduce( + np.dot, + [ref_from_car, car_from_global, global_from_car, car_from_current], + ) + + lidar_path = nusc.get_sample_data_path(curr_sd_rec["token"]) + + time_lag = ref_time - 1e-6 * curr_sd_rec["timestamp"] + + sweep = { + "lidar_path": lidar_path, + "sample_data_token": curr_sd_rec["token"], + "transform_matrix": tm, + "global_from_car": global_from_car, + "car_from_current": car_from_current, + "time_lag": time_lag, + "all_cams_from_lidar": cur_cams_from_lidar, + "all_cams_intrinsic": cur_cams_intrinsic, + "all_cams_path": cur_cams_path + } + sweeps.append(sweep) + + info["sweeps"] = sweeps + + assert ( + len(info["sweeps"]) == nsweeps - 1 + ) + + if not test: + annotations = [ + nusc.get("sample_annotation", token) for token in sample["anns"] + ] + + mask = np.array([(anno['num_lidar_pts'] + anno['num_radar_pts'])>0 for anno in annotations], dtype=bool).reshape(-1) + + locs = np.array([b.center for b in ref_boxes]).reshape(-1, 3) + dims = np.array([b.wlh for b in ref_boxes]).reshape(-1, 3) + # rots = np.array([b.orientation.yaw_pitch_roll[0] for b in ref_boxes]).reshape(-1, 1) + velocity = np.array([b.velocity for b in ref_boxes]).reshape(-1, 3) + rots = np.array([quaternion_yaw(b.orientation) for b in ref_boxes]).reshape( + -1, 1 + ) + names = np.array([b.name for b in ref_boxes]) + tokens = np.array([b.token for b in ref_boxes]) + gt_boxes = np.concatenate( + [locs, dims, velocity[:, :2], -rots - np.pi / 2], axis=1 + ) + # gt_boxes = np.concatenate([locs, dims, rots], axis=1) + + assert len(annotations) == len(gt_boxes) == len(velocity) + + if not filter_zero: + info["gt_boxes"] = gt_boxes + info["gt_boxes_velocity"] = velocity + info["gt_names"] = np.array([general_to_detection[name] for name in names]) + 
info["gt_boxes_token"] = tokens + else: + info["gt_boxes"] = gt_boxes[mask, :] + info["gt_boxes_velocity"] = velocity[mask, :] + info["gt_names"] = np.array([general_to_detection[name] for name in names])[mask] + info["gt_boxes_token"] = tokens[mask] + + if sample["scene_token"] in train_scenes: + train_nusc_infos.append(info) + else: + val_nusc_infos.append(info) + + return train_nusc_infos, val_nusc_infos + + +def quaternion_yaw(q: Quaternion) -> float: + """ + Calculate the yaw angle from a quaternion. + Note that this only works for a quaternion that represents a box in lidar or global coordinate frame. + It does not work for a box in the camera frame. + :param q: Quaternion of interest. + :return: Yaw angle in radians. + """ + + # Project into xy plane. + v = np.dot(q.rotation_matrix, np.array([1, 0, 0])) + + # Measure yaw using arctan. + yaw = np.arctan2(v[1], v[0]) + + return yaw + + +def create_nuscenes_infos(root_path, version="v1.0-trainval", nsweeps=10, filter_zero=True): + nusc = NuScenes(version=version, dataroot=root_path, verbose=True) + available_vers = ["v1.0-trainval", "v1.0-test", "v1.0-mini"] + assert version in available_vers + if version == "v1.0-trainval": + train_scenes = splits.train + # random.shuffle(train_scenes) + # train_scenes = train_scenes[:int(len(train_scenes)*0.2)] + val_scenes = splits.val + elif version == "v1.0-test": + train_scenes = splits.test + val_scenes = [] + elif version == "v1.0-mini": + train_scenes = splits.mini_train + val_scenes = splits.mini_val + else: + raise ValueError("unknown") + test = "test" in version + root_path = Path(root_path) + # filter exist scenes. you may only download part of dataset. 
+ available_scenes = _get_available_scenes(nusc) + available_scene_names = [s["name"] for s in available_scenes] + train_scenes = list(filter(lambda x: x in available_scene_names, train_scenes)) + val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes)) + train_scenes = set( + [ + available_scenes[available_scene_names.index(s)]["token"] + for s in train_scenes + ] + ) + val_scenes = set( + [available_scenes[available_scene_names.index(s)]["token"] for s in val_scenes] + ) + if test: + print(f"test scene: {len(train_scenes)}") + else: + print(f"train scene: {len(train_scenes)}, val scene: {len(val_scenes)}") + + train_nusc_infos, val_nusc_infos = _fill_trainval_infos( + nusc, train_scenes, val_scenes, test, nsweeps=nsweeps, filter_zero=filter_zero + ) + + if test: + print(f"test sample: {len(train_nusc_infos)}") + with open( + root_path / "infos_test_{:02d}sweeps_withvelo.pkl".format(nsweeps), "wb" + ) as f: + pickle.dump(train_nusc_infos, f) + else: + print( + f"train sample: {len(train_nusc_infos)}, val sample: {len(val_nusc_infos)}" + ) + with open( + root_path / "infos_train_{:02d}sweeps_withvelo_filter_{}.pkl".format(nsweeps, filter_zero), "wb" + ) as f: + pickle.dump(train_nusc_infos, f) + with open( + root_path / "infos_val_{:02d}sweeps_withvelo_filter_{}.pkl".format(nsweeps, filter_zero), "wb" + ) as f: + pickle.dump(val_nusc_infos, f) + + +def eval_main(nusc, eval_version, res_path, eval_set, output_dir): + # nusc = NuScenes(version=version, dataroot=str(root_path), verbose=True) + cfg = config_factory(eval_version) + + nusc_eval = NuScenesEval( + nusc, + config=cfg, + result_path=res_path, + eval_set=eval_set, + output_dir=output_dir, + verbose=True, + ) + metrics_summary = nusc_eval.main(plot_examples=10,) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/datasets/nuscenes/nuscenes.py b/cv/3d_detection/centerpoint/pytorch/det3d/datasets/nuscenes/nuscenes.py new file mode 100644 index 
0000000000000000000000000000000000000000..bd3528e83c799bef6453ac47ce2e67fdd2d919aa --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/datasets/nuscenes/nuscenes.py @@ -0,0 +1,331 @@ +import sys +import pickle +import json +import random +import operator +import numpy as np + +from functools import reduce +from pathlib import Path +from copy import deepcopy + +try: + from nuscenes.nuscenes import NuScenes + from nuscenes.eval.detection.config import config_factory +except: + print("nuScenes devkit not found!") + +from det3d.datasets.custom import PointCloudDataset +from det3d.datasets.nuscenes.nusc_common import ( + general_to_detection, + cls_attr_dist, + _second_det_to_nusc_box, + _lidar_nusc_box_to_global, + eval_main +) +from det3d.datasets.registry import DATASETS + + +@DATASETS.register_module +class NuScenesDataset(PointCloudDataset): + NumPointFeatures = 5 # x, y, z, intensity, ring_index + + def __init__( + self, + info_path, + root_path, + nsweeps=0, # here set to zero to catch unset nsweep + cfg=None, + pipeline=None, + class_names=None, + test_mode=False, + version="v1.0-trainval", + load_interval=1, + **kwargs, + ): + self.load_interval = load_interval + super(NuScenesDataset, self).__init__( + root_path, info_path, pipeline, test_mode=test_mode, class_names=class_names + ) + + self.nsweeps = nsweeps + assert self.nsweeps > 0, "At least input one sweep please!" 
+ print(self.nsweeps) + + self._info_path = info_path + self._class_names = class_names + + if not hasattr(self, "_nusc_infos"): + self.load_infos(self._info_path) + + self._num_point_features = NuScenesDataset.NumPointFeatures + self._name_mapping = general_to_detection + + self.virtual = kwargs.get('virtual', False) + if self.virtual: + self._num_point_features = 16 + + self.version = version + self.eval_version = "detection_cvpr_2019" + + def reset(self): + self.logger.info(f"re-sample {self.frac} frames from full set") + random.shuffle(self._nusc_infos_all) + self._nusc_infos = self._nusc_infos_all[: self.frac] + + def load_infos(self, info_path): + + with open(self._info_path, "rb") as f: + _nusc_infos_all = pickle.load(f) + + _nusc_infos_all = _nusc_infos_all[::self.load_interval] + + if not self.test_mode: # if training + self.frac = int(len(_nusc_infos_all) * 0.25) + + _cls_infos = {name: [] for name in self._class_names} + for info in _nusc_infos_all: + for name in set(info["gt_names"]): + if name in self._class_names: + _cls_infos[name].append(info) + + duplicated_samples = sum([len(v) for _, v in _cls_infos.items()]) + _cls_dist = {k: len(v) / max(duplicated_samples, 1) for k, v in _cls_infos.items()} + + self._nusc_infos = [] + + frac = 1.0 / len(self._class_names) + ratios = [frac / v for v in _cls_dist.values()] + + for cls_infos, ratio in zip(list(_cls_infos.values()), ratios): + self._nusc_infos += np.random.choice( + cls_infos, int(len(cls_infos) * ratio) + ).tolist() + + _cls_infos = {name: [] for name in self._class_names} + for info in self._nusc_infos: + for name in set(info["gt_names"]): + if name in self._class_names: + _cls_infos[name].append(info) + + _cls_dist = { + k: len(v) / len(self._nusc_infos) for k, v in _cls_infos.items() + } + else: + if isinstance(_nusc_infos_all, dict): + self._nusc_infos = [] + for v in _nusc_infos_all.values(): + self._nusc_infos.extend(v) + else: + self._nusc_infos = _nusc_infos_all + + def __len__(self): + + 
if not hasattr(self, "_nusc_infos"): + self.load_infos(self._info_path) + + return len(self._nusc_infos) + + @property + def ground_truth_annotations(self): + if "gt_boxes" not in self._nusc_infos[0]: + return None + cls_range_map = config_factory(self.eval_version).serialize()['class_range'] + gt_annos = [] + for info in self._nusc_infos: + gt_names = np.array(info["gt_names"]) + gt_boxes = info["gt_boxes"] + mask = np.array([n != "ignore" for n in gt_names], dtype=np.bool_) + gt_names = gt_names[mask] + gt_boxes = gt_boxes[mask] + # det_range = np.array([cls_range_map[n] for n in gt_names_mapped]) + det_range = np.array([cls_range_map[n] for n in gt_names]) + det_range = det_range[..., np.newaxis] @ np.array([[-1, -1, 1, 1]]) + mask = (gt_boxes[:, :2] >= det_range[:, :2]).all(1) + mask &= (gt_boxes[:, :2] <= det_range[:, 2:]).all(1) + N = int(np.sum(mask)) + gt_annos.append( + { + "bbox": np.tile(np.array([[0, 0, 50, 50]]), [N, 1]), + "alpha": np.full(N, -10), + "occluded": np.zeros(N), + "truncated": np.zeros(N), + "name": gt_names[mask], + "location": gt_boxes[mask][:, :3], + "dimensions": gt_boxes[mask][:, 3:6], + "rotation_y": gt_boxes[mask][:, 6], + "token": info["token"], + } + ) + return gt_annos + + def get_sensor_data(self, idx): + + info = self._nusc_infos[idx] + + res = { + "lidar": { + "type": "lidar", + "points": None, + "nsweeps": self.nsweeps, + # "ground_plane": -gp[-1] if with_gp else None, + "annotations": None, + }, + "metadata": { + "image_prefix": self._root_path, + "num_point_features": self._num_point_features, + "token": info["token"], + }, + "calib": None, + "cam": {}, + "mode": "val" if self.test_mode else "train", + "virtual": self.virtual + } + + data, _ = self.pipeline(res, info) + + return data + + def __getitem__(self, idx): + return self.get_sensor_data(idx) + + def evaluation(self, detections, output_dir=None, testset=False): + version = self.version + eval_set_map = { + "v1.0-mini": "mini_val", + "v1.0-trainval": "val", + 
"v1.0-test": "test", + } + + if not testset: + dets = [] + gt_annos = self.ground_truth_annotations + assert gt_annos is not None + + miss = 0 + for gt in gt_annos: + try: + dets.append(detections[gt["token"]]) + except Exception: + miss += 1 + + assert miss == 0 + else: + dets = [v for _, v in detections.items()] + assert len(detections) == 6008 + + nusc_annos = { + "results": {}, + "meta": None, + } + + nusc = NuScenes(version=version, dataroot=str(self._root_path), verbose=True) + + mapped_class_names = [] + for n in self._class_names: + if n in self._name_mapping: + mapped_class_names.append(self._name_mapping[n]) + else: + mapped_class_names.append(n) + + for det in dets: + annos = [] + boxes = _second_det_to_nusc_box(det) + boxes = _lidar_nusc_box_to_global(nusc, boxes, det["metadata"]["token"]) + for i, box in enumerate(boxes): + name = mapped_class_names[box.label] + if np.sqrt(box.velocity[0] ** 2 + box.velocity[1] ** 2) > 0.2: + if name in [ + "car", + "construction_vehicle", + "bus", + "truck", + "trailer", + ]: + attr = "vehicle.moving" + elif name in ["bicycle", "motorcycle"]: + attr = "cycle.with_rider" + else: + attr = None + else: + if name in ["pedestrian"]: + attr = "pedestrian.standing" + elif name in ["bus"]: + attr = "vehicle.stopped" + else: + attr = None + + nusc_anno = { + "sample_token": det["metadata"]["token"], + "translation": box.center.tolist(), + "size": box.wlh.tolist(), + "rotation": box.orientation.elements.tolist(), + "velocity": box.velocity[:2].tolist(), + "detection_name": name, + "detection_score": box.score, + "attribute_name": attr + if attr is not None + else max(cls_attr_dist[name].items(), key=operator.itemgetter(1))[ + 0 + ], + } + annos.append(nusc_anno) + nusc_annos["results"].update({det["metadata"]["token"]: annos}) + + nusc_annos["meta"] = { + "use_camera": False, + "use_lidar": True, + "use_radar": False, + "use_map": False, + "use_external": False, + } + + name = self._info_path.split("/")[-1].split(".")[0] + 
res_path = str(Path(output_dir) / Path(name + ".json")) + with open(res_path, "w") as f: + json.dump(nusc_annos, f) + + print(f"Finish generate predictions for testset, save to {res_path}") + + if not testset: + eval_main( + nusc, + self.eval_version, + res_path, + eval_set_map[self.version], + output_dir, + ) + + with open(Path(output_dir) / "metrics_summary.json", "r") as f: + metrics = json.load(f) + + detail = {} + result = f"Nusc {version} Evaluation\n" + for name in mapped_class_names: + detail[name] = {} + for k, v in metrics["label_aps"][name].items(): + detail[name][f"dist@{k}"] = v + threshs = ", ".join(list(metrics["label_aps"][name].keys())) + scores = list(metrics["label_aps"][name].values()) + mean = sum(scores) / len(scores) + scores = ", ".join([f"{s * 100:.2f}" for s in scores]) + result += f"{name} Nusc dist AP@{threshs}\n" + result += scores + result += f" mean AP: {mean}" + result += "\n" + res_nusc = { + "results": {"nusc": result}, + "detail": {"nusc": detail}, + } + else: + res_nusc = None + + if res_nusc is not None: + res = { + "results": {"nusc": res_nusc["results"]["nusc"],}, + "detail": {"eval.nusc": res_nusc["detail"]["nusc"],}, + } + else: + res = None + + return res, None diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/datasets/pipelines/__init__.py b/cv/3d_detection/centerpoint/pytorch/det3d/datasets/pipelines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4cef197bd8011858f79ab159c3f7fe133e38bc05 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/datasets/pipelines/__init__.py @@ -0,0 +1,25 @@ +from .compose import Compose +from .formating import Reformat + +# from .loading import LoadAnnotations, LoadImageFromFile, LoadProposals +from .loading import * +from .test_aug import DoubleFlip +from .preprocess import Preprocess, Voxelization + +__all__ = [ + "Compose", + "to_tensor", + "ToTensor", + "ImageToTensor", + "ToDataContainer", + "Transpose", + "Collect", + 
"LoadImageAnnotations", + "LoadImageFromFile", + "LoadProposals", + "PhotoMetricDistortion", + "Preprocess", + "Voxelization", + "AssignTarget", + "AssignLabel" +] diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/datasets/pipelines/compose.py b/cv/3d_detection/centerpoint/pytorch/det3d/datasets/pipelines/compose.py new file mode 100644 index 0000000000000000000000000000000000000000..f9856adb70f2e7c08b911dfb904c8335e563df21 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/datasets/pipelines/compose.py @@ -0,0 +1,37 @@ +import collections + +from det3d.utils import build_from_cfg +from ..registry import PIPELINES + + +@PIPELINES.register_module +class Compose(object): + def __init__(self, transforms): + assert isinstance(transforms, collections.abc.Sequence) + self.transforms = [] + for transform in transforms: + if isinstance(transform, dict): + if transform['type'] == 'Empty': + continue + transform = build_from_cfg(transform, PIPELINES) + self.transforms.append(transform) + elif callable(transform): + self.transforms.append(transform) + else: + raise TypeError("transform must be callable or a dict") + + def __call__(self, res, info): + for t in self.transforms: + res, info = t(res, info) + if res is None: + return None + return res, info + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string + diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/datasets/pipelines/formating.py b/cv/3d_detection/centerpoint/pytorch/det3d/datasets/pipelines/formating.py new file mode 100644 index 0000000000000000000000000000000000000000..11783b5b25e95c0d3fafa9ee4ee61bca5125c07a --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/datasets/pipelines/formating.py @@ -0,0 +1,90 @@ +from det3d import torchie +import numpy as np +import torch + +from ..registry import PIPELINES + + +class DataBundle(object): 
+ def __init__(self, data): + self.data = data + + +@PIPELINES.register_module +class Reformat(object): + def __init__(self, **kwargs): + double_flip = kwargs.get('double_flip', False) + self.double_flip = double_flip + + def __call__(self, res, info): + meta = res["metadata"] + points = res["lidar"]["points"] + + data_bundle = dict( + metadata=meta + ) + if points is not None: + data_bundle.update(points=points) + + if 'voxels' in res["lidar"]: + voxels = res["lidar"]["voxels"] + + data_bundle.update( + voxels=voxels["voxels"], + shape=voxels["shape"], + num_points=voxels["num_points"], + num_voxels=voxels["num_voxels"], + coordinates=voxels["coordinates"], + ) + + if res["mode"] == "train": + data_bundle.update(res["lidar"]["targets"]) + elif res["mode"] == "val": + data_bundle.update(dict(metadata=meta, )) + + if self.double_flip: + # y axis + yflip_points = res["lidar"]["yflip_points"] + yflip_voxels = res["lidar"]["yflip_voxels"] + yflip_data_bundle = dict( + metadata=meta, + points=yflip_points, + voxels=yflip_voxels["voxels"], + shape=yflip_voxels["shape"], + num_points=yflip_voxels["num_points"], + num_voxels=yflip_voxels["num_voxels"], + coordinates=yflip_voxels["coordinates"], + ) + + # x axis + xflip_points = res["lidar"]["xflip_points"] + xflip_voxels = res["lidar"]["xflip_voxels"] + xflip_data_bundle = dict( + metadata=meta, + points=xflip_points, + voxels=xflip_voxels["voxels"], + shape=xflip_voxels["shape"], + num_points=xflip_voxels["num_points"], + num_voxels=xflip_voxels["num_voxels"], + coordinates=xflip_voxels["coordinates"], + ) + # double axis flip + double_flip_points = res["lidar"]["double_flip_points"] + double_flip_voxels = res["lidar"]["double_flip_voxels"] + double_flip_data_bundle = dict( + metadata=meta, + points=double_flip_points, + voxels=double_flip_voxels["voxels"], + shape=double_flip_voxels["shape"], + num_points=double_flip_voxels["num_points"], + num_voxels=double_flip_voxels["num_voxels"], + 
coordinates=double_flip_voxels["coordinates"], + ) + + return [data_bundle, yflip_data_bundle, xflip_data_bundle, double_flip_data_bundle], info + + + return data_bundle, info + + + diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/datasets/pipelines/loading.py b/cv/3d_detection/centerpoint/pytorch/det3d/datasets/pipelines/loading.py new file mode 100644 index 0000000000000000000000000000000000000000..5baa3dc41aa0061cfb4ec8ad7c786dc6c95a7bbc --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/datasets/pipelines/loading.py @@ -0,0 +1,209 @@ +import os.path as osp +import warnings +import numpy as np +from functools import reduce + +import pycocotools.mask as maskUtils + +from pathlib import Path +from copy import deepcopy +from det3d import torchie +from det3d.core import box_np_ops +import pickle +import os +from ..registry import PIPELINES + +def _dict_select(dict_, inds): + for k, v in dict_.items(): + if isinstance(v, dict): + _dict_select(v, inds) + else: + dict_[k] = v[inds] + +def read_file(path, tries=2, num_point_feature=4, virtual=False): + if virtual: + # WARNING: hard coded for nuScenes + points = np.fromfile(path, dtype=np.float32).reshape(-1, 5)[:, :num_point_feature] + tokens = path.split('/') + seg_path = os.path.join(*tokens[:-2], tokens[-2]+"_VIRTUAL", tokens[-1]+'.pkl.npy') + data_dict = np.load(seg_path, allow_pickle=True).item() + + # remove reflectance as other virtual points don't have this value + virtual_points1 = data_dict['real_points'][:, [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]] + virtual_points2 = data_dict['virtual_points'] + + points = np.concatenate([points, np.ones([points.shape[0], 15-num_point_feature])], axis=1) + virtual_points1 = np.concatenate([virtual_points1, np.zeros([virtual_points1.shape[0], 1])], axis=1) + virtual_points2 = np.concatenate([virtual_points2, -1 * np.ones([virtual_points2.shape[0], 1])], axis=1) + points = np.concatenate([points, virtual_points1, virtual_points2], 
axis=0).astype(np.float32) + else: + points = np.fromfile(path, dtype=np.float32).reshape(-1, 5)[:, :num_point_feature] + + return points + + +def remove_close(points, radius: float) -> None: + """ + Removes point too close within a certain radius from origin. + :param radius: Radius below which points are removed. + """ + x_filt = np.abs(points[0, :]) < radius + y_filt = np.abs(points[1, :]) < radius + not_close = np.logical_not(np.logical_and(x_filt, y_filt)) + points = points[:, not_close] + return points + + +def read_sweep(sweep, virtual=False): + min_distance = 1.0 + points_sweep = read_file(str(sweep["lidar_path"]), virtual=virtual).T + points_sweep = remove_close(points_sweep, min_distance) + + nbr_points = points_sweep.shape[1] + if sweep["transform_matrix"] is not None: + points_sweep[:3, :] = sweep["transform_matrix"].dot( + np.vstack((points_sweep[:3, :], np.ones(nbr_points))) + )[:3, :] + curr_times = sweep["time_lag"] * np.ones((1, points_sweep.shape[1])) + + return points_sweep.T, curr_times.T + +def read_single_waymo(obj): + points_xyz = obj["lidars"]["points_xyz"] + points_feature = obj["lidars"]["points_feature"] + + # normalize intensity + points_feature[:, 0] = np.tanh(points_feature[:, 0]) + + points = np.concatenate([points_xyz, points_feature], axis=-1) + + return points + +def read_single_waymo_sweep(sweep): + obj = get_obj(sweep['path']) + + points_xyz = obj["lidars"]["points_xyz"] + points_feature = obj["lidars"]["points_feature"] + + # normalize intensity + points_feature[:, 0] = np.tanh(points_feature[:, 0]) + points_sweep = np.concatenate([points_xyz, points_feature], axis=-1).T # 5 x N + + nbr_points = points_sweep.shape[1] + + if sweep["transform_matrix"] is not None: + points_sweep[:3, :] = sweep["transform_matrix"].dot( + np.vstack((points_sweep[:3, :], np.ones(nbr_points))) + )[:3, :] + + curr_times = sweep["time_lag"] * np.ones((1, points_sweep.shape[1])) + + return points_sweep.T, curr_times.T + + +def get_obj(path): + with 
@PIPELINES.register_module
class LoadPointCloudFromFile(object):
    """Pipeline step that loads the key-frame point cloud plus its sweeps.

    Supports ``"NuScenesDataset"`` and ``"WaymoDataset"``; any other dataset
    name makes ``__call__`` raise ``NotImplementedError``.  Results are
    written into ``res["lidar"]`` under ``"points"`` and, when sweeps are
    stacked, also under ``"times"`` and ``"combined"``.
    """

    def __init__(self, dataset="KittiDataset", **kwargs):
        self.type = dataset
        self.random_select = kwargs.get("random_select", False)
        self.npoints = kwargs.get("npoints", 16834)

    @staticmethod
    def _store_stacked(lidar, point_chunks, time_chunks):
        """Concatenate per-sweep chunks and store points / times / combined."""
        stacked_points = np.concatenate(point_chunks, axis=0)
        stacked_times = np.concatenate(time_chunks, axis=0).astype(stacked_points.dtype)

        lidar["points"] = stacked_points
        lidar["times"] = stacked_times
        # "combined" is the points with the time column appended on the right.
        lidar["combined"] = np.hstack([stacked_points, stacked_times])

    def _load_nuscenes(self, res, info):
        """Read the key frame and ``nsweeps - 1`` sweeps, visited in random order."""
        sweep_count = res["lidar"]["nsweeps"]

        key_frame = read_file(str(Path(info["lidar_path"])), virtual=res["virtual"])

        point_chunks = [key_frame]
        # The key frame has zero time lag.
        time_chunks = [np.zeros((key_frame.shape[0], 1))]

        sweeps = info["sweeps"]
        assert (sweep_count - 1) == len(sweeps), \
            "nsweeps {} should equal to list length {}.".format(sweep_count, len(sweeps))

        # Sampling without replacement over all sweeps only randomizes the order.
        for sweep_idx in np.random.choice(len(sweeps), sweep_count - 1, replace=False):
            sweep_points, sweep_times = read_sweep(sweeps[sweep_idx], virtual=res["virtual"])
            point_chunks.append(sweep_points)
            time_chunks.append(sweep_times)

        self._store_stacked(res["lidar"], point_chunks, time_chunks)

    def _load_waymo(self, res, info):
        """Read the key frame from its pickle and, if ``nsweeps > 1``, the sweeps in order."""
        frame_path = info['path']
        sweep_count = res["lidar"]["nsweeps"]

        key_frame = read_single_waymo(get_obj(frame_path))
        res["lidar"]["points"] = key_frame

        # Single-sweep case: only "points" is provided, no "times"/"combined".
        if not (sweep_count > 1):
            return

        point_chunks = [key_frame]
        time_chunks = [np.zeros((key_frame.shape[0], 1))]

        sweeps = info["sweeps"]
        assert (sweep_count - 1) == len(sweeps), \
            "nsweeps {} should be equal to the list length {}.".format(sweep_count, len(sweeps))

        for sweep_idx in range(sweep_count - 1):
            sweep_points, sweep_times = read_single_waymo_sweep(sweeps[sweep_idx])
            point_chunks.append(sweep_points)
            time_chunks.append(sweep_times)

        self._store_stacked(res["lidar"], point_chunks, time_chunks)

    def __call__(self, res, info):
        """Populate ``res["lidar"]`` for the configured dataset and return ``(res, info)``."""
        res["type"] = self.type

        if self.type == "NuScenesDataset":
            self._load_nuscenes(res, info)
        elif self.type == "WaymoDataset":
            self._load_waymo(res, info)
        else:
            raise NotImplementedError

        return res, info
@PIPELINES.register_module
class Preprocess(object):
    """Pipeline step that picks the input points and prepares ground truth.

    In ``"train"`` mode the annotations are filtered by class name (and
    optionally by the number of points inside each box), optionally extended
    with database-sampled objects, and the scene is randomly flipped, rotated,
    scaled and translated.  With ``no_augmentation`` set, only the class-name
    filtering and class-id encoding are performed.  In every other mode the
    points are passed through (optionally shuffled) and annotations are left
    untouched.
    """

    def __init__(self, cfg=None, **kwargs):
        self.shuffle_points = cfg.shuffle_points
        self.min_points_in_gt = cfg.get("min_points_in_gt", -1)

        self.mode = cfg.mode
        if self.mode == "train":
            self.global_rotation_noise = cfg.global_rot_noise
            self.global_scaling_noise = cfg.global_scale_noise
            self.global_translate_std = cfg.get('global_translate_std', 0)
            self.class_names = cfg.class_names
            if cfg.db_sampler is not None:
                self.db_sampler = build_dbsampler(cfg.db_sampler)
            else:
                self.db_sampler = None

            self.npoints = cfg.get("npoints", -1)

        self.no_augmentation = cfg.get('no_augmentation', False)

    def _names_to_class_ids(self, names):
        """Map class names to 1-based indices into ``self.class_names``."""
        return np.array(
            [self.class_names.index(n) + 1 for n in names],
            dtype=np.int32,
        )

    def __call__(self, res, info):
        """Preprocess one sample.

        Args:
            res: Sample dict; reads ``res["type"]``, ``res["lidar"]`` and, when
                database sampling is active, ``res["metadata"]``.
            info: Per-sample info, returned unchanged.

        Returns:
            ``(res, info)`` with ``res["mode"]`` and ``res["lidar"]["points"]``
            set, and ``res["lidar"]["annotations"]`` replaced by the processed
            ground-truth dict in train mode.

        Raises:
            NotImplementedError: If ``res["type"]`` is not a supported dataset.
        """
        res["mode"] = self.mode

        if res["type"] in ["WaymoDataset"]:
            # Waymo only carries "combined" when more than one sweep was loaded.
            if "combined" in res["lidar"]:
                points = res["lidar"]["combined"]
            else:
                points = res["lidar"]["points"]
        elif res["type"] in ["NuScenesDataset"]:
            points = res["lidar"]["combined"]
        else:
            raise NotImplementedError

        if self.mode == "train":
            anno_dict = res["lidar"]["annotations"]

            gt_dict = {
                "gt_boxes": anno_dict["boxes"],
                "gt_names": np.array(anno_dict["names"]).reshape(-1),
            }

        if self.mode == "train" and not self.no_augmentation:
            # Despite its name, drop_arrays_by_name returns the indices to KEEP.
            selected = drop_arrays_by_name(
                gt_dict["gt_names"], ["DontCare", "ignore", "UNKNOWN"]
            )

            _dict_select(gt_dict, selected)

            if self.min_points_in_gt > 0:
                point_counts = box_np_ops.points_count_rbbox(
                    points, gt_dict["gt_boxes"]
                )
                # Fixed: the threshold lives on the instance; the bare name
                # ``min_points_in_gt`` raised NameError whenever this ran.
                mask = point_counts >= self.min_points_in_gt
                _dict_select(gt_dict, mask)

            gt_boxes_mask = np.array(
                [n in self.class_names for n in gt_dict["gt_names"]], dtype=np.bool_
            )

            if self.db_sampler:
                sampled_dict = self.db_sampler.sample_all(
                    res["metadata"]["image_prefix"],
                    gt_dict["gt_boxes"],
                    gt_dict["gt_names"],
                    res["metadata"]["num_point_features"],
                    False,
                    gt_group_ids=None,
                    calib=None,
                    road_planes=None
                )

                if sampled_dict is not None:
                    sampled_gt_names = sampled_dict["gt_names"]
                    sampled_gt_boxes = sampled_dict["gt_boxes"]
                    sampled_points = sampled_dict["points"]
                    sampled_gt_masks = sampled_dict["gt_masks"]
                    gt_dict["gt_names"] = np.concatenate(
                        [gt_dict["gt_names"], sampled_gt_names], axis=0
                    )
                    gt_dict["gt_boxes"] = np.concatenate(
                        [gt_dict["gt_boxes"], sampled_gt_boxes]
                    )
                    gt_boxes_mask = np.concatenate(
                        [gt_boxes_mask, sampled_gt_masks], axis=0
                    )

                    # Sampled object points are placed in front of the scene points.
                    points = np.concatenate([sampled_points, points], axis=0)

            _dict_select(gt_dict, gt_boxes_mask)

            gt_dict["gt_classes"] = self._names_to_class_ids(gt_dict["gt_names"])

            gt_dict["gt_boxes"], points = prep.random_flip_both(gt_dict["gt_boxes"], points)

            gt_dict["gt_boxes"], points = prep.global_rotation(
                gt_dict["gt_boxes"], points, rotation=self.global_rotation_noise
            )
            gt_dict["gt_boxes"], points = prep.global_scaling_v2(
                gt_dict["gt_boxes"], points, *self.global_scaling_noise
            )
            gt_dict["gt_boxes"], points = prep.global_translate_(
                gt_dict["gt_boxes"], points, noise_translate_std=self.global_translate_std
            )
        elif self.mode == "train":
            # Train mode with no_augmentation: only keep known classes and
            # encode them.  The mode check matters because gt_dict and
            # class_names exist only in train mode; previously a non-train
            # config with no_augmentation=True crashed here.
            gt_boxes_mask = np.array(
                [n in self.class_names for n in gt_dict["gt_names"]], dtype=np.bool_
            )
            _dict_select(gt_dict, gt_boxes_mask)

            gt_dict["gt_classes"] = self._names_to_class_ids(gt_dict["gt_names"])

        if self.shuffle_points:
            # In-place shuffle along the first axis.
            np.random.shuffle(points)

        res["lidar"]["points"] = points

        if self.mode == "train":
            res["lidar"]["annotations"] = gt_dict

        return res, info
cfg.max_voxel_num + + self.double_flip = cfg.get('double_flip', False) + + self.voxel_generator = VoxelGenerator( + voxel_size=self.voxel_size, + point_cloud_range=self.range, + max_num_points=self.max_points_in_voxel, + max_voxels=self.max_voxel_num[0], + ) + + def __call__(self, res, info): + voxel_size = self.voxel_generator.voxel_size + pc_range = self.voxel_generator.point_cloud_range + grid_size = self.voxel_generator.grid_size + + if res["mode"] == "train": + gt_dict = res["lidar"]["annotations"] + bv_range = pc_range[[0, 1, 3, 4]] + mask = prep.filter_gt_box_outside_range(gt_dict["gt_boxes"], bv_range) + _dict_select(gt_dict, mask) + + res["lidar"]["annotations"] = gt_dict + max_voxels = self.max_voxel_num[0] + else: + max_voxels = self.max_voxel_num[1] + + voxels, coordinates, num_points = self.voxel_generator.generate( + res["lidar"]["points"], max_voxels=max_voxels + ) + num_voxels = np.array([voxels.shape[0]], dtype=np.int64) + + res["lidar"]["voxels"] = dict( + voxels=voxels, + coordinates=coordinates, + num_points=num_points, + num_voxels=num_voxels, + shape=grid_size, + range=pc_range, + size=voxel_size + ) + + double_flip = self.double_flip and (res["mode"] != 'train') + + if double_flip: + flip_voxels, flip_coordinates, flip_num_points = self.voxel_generator.generate( + res["lidar"]["yflip_points"] + ) + flip_num_voxels = np.array([flip_voxels.shape[0]], dtype=np.int64) + + res["lidar"]["yflip_voxels"] = dict( + voxels=flip_voxels, + coordinates=flip_coordinates, + num_points=flip_num_points, + num_voxels=flip_num_voxels, + shape=grid_size, + range=pc_range, + size=voxel_size + ) + + flip_voxels, flip_coordinates, flip_num_points = self.voxel_generator.generate( + res["lidar"]["xflip_points"] + ) + flip_num_voxels = np.array([flip_voxels.shape[0]], dtype=np.int64) + + res["lidar"]["xflip_voxels"] = dict( + voxels=flip_voxels, + coordinates=flip_coordinates, + num_points=flip_num_points, + num_voxels=flip_num_voxels, + shape=grid_size, + 
def merge_multi_group_label(gt_classes, num_classes_by_task):
    """Shift per-task class ids into one global id space and concatenate them.

    Task ``k``'s labels are offset by the total number of classes of all
    preceding tasks.  The offset is applied in place, so the entries of
    ``gt_classes`` are modified for the caller as well.

    Args:
        gt_classes: List with one label array per task.
        num_classes_by_task: Number of classes of each task, same order.

    Returns:
        All shifted label arrays concatenated along axis 0.
    """
    label_offset = 0

    for task_idx, task_labels in enumerate(gt_classes):
        task_labels += label_offset
        gt_classes[task_idx] = task_labels
        label_offset += num_classes_by_task[task_idx]

    return np.concatenate(gt_classes, axis=0)
feature_map_size = grid_size[:2] // self.out_size_factor + + gt_dict = res["lidar"]["annotations"] + + # reorganize the gt_dict by tasks + task_masks = [] + flag = 0 + for class_name in class_names_by_task: + task_masks.append( + [ + np.where( + gt_dict["gt_classes"] == class_name.index(i) + 1 + flag + ) + for i in class_name + ] + ) + flag += len(class_name) + + task_boxes = [] + task_classes = [] + task_names = [] + flag2 = 0 + for idx, mask in enumerate(task_masks): + task_box = [] + task_class = [] + task_name = [] + for m in mask: + task_box.append(gt_dict["gt_boxes"][m]) + task_class.append(gt_dict["gt_classes"][m] - flag2) + task_name.append(gt_dict["gt_names"][m]) + task_boxes.append(np.concatenate(task_box, axis=0)) + task_classes.append(np.concatenate(task_class)) + task_names.append(np.concatenate(task_name)) + flag2 += len(mask) + + for task_box in task_boxes: + # limit rad to [-pi, pi] + task_box[:, -1] = box_np_ops.limit_period( + task_box[:, -1], offset=0.5, period=np.pi * 2 + ) + + # print(gt_dict.keys()) + gt_dict["gt_classes"] = task_classes + gt_dict["gt_names"] = task_names + gt_dict["gt_boxes"] = task_boxes + + res["lidar"]["annotations"] = gt_dict + + draw_gaussian = draw_umich_gaussian + + hms, anno_boxs, inds, masks, cats = [], [], [], [], [] + + for idx, task in enumerate(self.tasks): + hm = np.zeros((len(class_names_by_task[idx]), feature_map_size[1], feature_map_size[0]), + dtype=np.float32) + + if res['type'] == 'NuScenesDataset': + # [reg, hei, dim, vx, vy, rots, rotc] + anno_box = np.zeros((max_objs, 10), dtype=np.float32) + elif res['type'] == 'WaymoDataset': + anno_box = np.zeros((max_objs, 10), dtype=np.float32) + else: + raise NotImplementedError("Only Support nuScene for Now!") + + ind = np.zeros((max_objs), dtype=np.int64) + mask = np.zeros((max_objs), dtype=np.uint8) + cat = np.zeros((max_objs), dtype=np.int64) + + num_objs = min(gt_dict['gt_boxes'][idx].shape[0], max_objs) + + for k in range(num_objs): + cls_id = 
gt_dict['gt_classes'][idx][k] - 1 + + w, l, h = gt_dict['gt_boxes'][idx][k][3], gt_dict['gt_boxes'][idx][k][4], \ + gt_dict['gt_boxes'][idx][k][5] + w, l = w / voxel_size[0] / self.out_size_factor, l / voxel_size[1] / self.out_size_factor + if w > 0 and l > 0: + radius = gaussian_radius((l, w), min_overlap=self.gaussian_overlap) + radius = max(self._min_radius, int(radius)) + + # be really careful for the coordinate system of your box annotation. + x, y, z = gt_dict['gt_boxes'][idx][k][0], gt_dict['gt_boxes'][idx][k][1], \ + gt_dict['gt_boxes'][idx][k][2] + + coor_x, coor_y = (x - pc_range[0]) / voxel_size[0] / self.out_size_factor, \ + (y - pc_range[1]) / voxel_size[1] / self.out_size_factor + + ct = np.array( + [coor_x, coor_y], dtype=np.float32) + ct_int = ct.astype(np.int32) + + # throw out not in range objects to avoid out of array area when creating the heatmap + if not (0 <= ct_int[0] < feature_map_size[0] and 0 <= ct_int[1] < feature_map_size[1]): + continue + + draw_gaussian(hm[cls_id], ct, radius) + + new_idx = k + x, y = ct_int[0], ct_int[1] + + cat[new_idx] = cls_id + ind[new_idx] = y * feature_map_size[0] + x + mask[new_idx] = 1 + + if res['type'] == 'NuScenesDataset': + vx, vy = gt_dict['gt_boxes'][idx][k][6:8] + rot = gt_dict['gt_boxes'][idx][k][8] + anno_box[new_idx] = np.concatenate( + (ct - (x, y), z, np.log(gt_dict['gt_boxes'][idx][k][3:6]), + np.array(vx), np.array(vy), np.sin(rot), np.cos(rot)), axis=None) + elif res['type'] == 'WaymoDataset': + vx, vy = gt_dict['gt_boxes'][idx][k][6:8] + rot = gt_dict['gt_boxes'][idx][k][-1] + anno_box[new_idx] = np.concatenate( + (ct - (x, y), z, np.log(gt_dict['gt_boxes'][idx][k][3:6]), + np.array(vx), np.array(vy), np.sin(rot), np.cos(rot)), axis=None) + else: + raise NotImplementedError("Only Support Waymo and nuScene for Now") + + hms.append(hm) + anno_boxs.append(anno_box) + masks.append(mask) + inds.append(ind) + cats.append(cat) + + # used for two stage code + boxes = flatten(gt_dict['gt_boxes']) + 
@PIPELINES.register_module
class DoubleFlip(object):
    """Test-time augmentation step that adds mirrored copies of the points.

    Three copies of ``res["lidar"]["points"]`` are stored next to the
    original: column 1 negated, column 0 negated, and both negated.
    """

    def __init__(self):
        pass

    def __call__(self, res, info):
        lidar = res["lidar"]

        # (output key, columns to negate) in the order the copies are produced.
        flip_specs = (
            ('yflip_points', (1,)),
            ('xflip_points', (0,)),
            ("double_flip_points", (0, 1)),
        )

        for out_key, columns in flip_specs:
            mirrored = lidar["points"].copy()
            for col in columns:
                mirrored[:, col] = -mirrored[:, col]
            lidar[out_key] = mirrored

        return res, info
def create_groundtruth_database(
    dataset_class_name,
    data_path,
    info_path=None,
    used_classes=None,
    db_path=None,
    dbinfo_path=None,
    relative_path=True,
    virtual=False,
    **kwargs,
):
    """Cut every ground-truth object out of the dataset and store it on disk.

    For each annotated box the points inside it are translated so the box
    centre is the origin, dumped as raw bytes to
    ``<db_path>/<class name>/<image_idx>_<class name>_<i>.bin`` and described
    by an info dict.  All info dicts, grouped by class name, are pickled to
    ``dbinfo_path``.

    Args:
        dataset_class_name: Dataset key, ``"NUSC"`` or ``"WAYMO"``.
        data_path: Dataset root directory.
        info_path: Info file handed to the dataset constructor.
        used_classes: Optional collection of class names to keep; ``None``
            keeps every class.
        db_path: Output directory for the ``.bin`` files (``str`` or ``Path``).
            Defaults to a name derived from ``nsweeps`` under ``data_path``.
        dbinfo_path: Output pickle path; defaults like ``db_path``.
        relative_path: If true, stored paths start at ``db_path.stem`` instead
            of being the full file path.
        virtual: Forwarded to the dataset (only when ``nsweeps`` is given) and
            reflected in the default output names.
        **kwargs: ``nsweeps`` is the only key read.

    Raises:
        NotImplementedError: If ``dataset_class_name`` is not supported.
    """
    pipeline = [
        {
            "type": "LoadPointCloudFromFile",
            "dataset": dataset_name_map[dataset_class_name],
        },
        {"type": "LoadPointCloudAnnotations", "with_bbox": True},
    ]

    if "nsweeps" in kwargs:
        dataset = get_dataset(dataset_class_name)(
            info_path=info_path,
            root_path=data_path,
            pipeline=pipeline,
            test_mode=True,
            nsweeps=kwargs["nsweeps"],
            virtual=virtual
        )
        nsweeps = dataset.nsweeps
    else:
        dataset = get_dataset(dataset_class_name)(
            info_path=info_path, root_path=data_path, test_mode=True, pipeline=pipeline
        )
        nsweeps = 1

    root_path = Path(data_path)

    if dataset_class_name in ["WAYMO", "NUSC"]:
        suffix = "_virtual" if virtual else ""
        if db_path is None:
            db_path = root_path / f"gt_database_{nsweeps}sweeps_withvelo{suffix}"
        if dbinfo_path is None:
            dbinfo_path = root_path / f"dbinfos_train_{nsweeps}sweeps_withvelo{suffix}.pkl"
    else:
        raise NotImplementedError()

    # Accept plain strings too: .mkdir() and .stem below need a Path.
    db_path = Path(db_path)
    db_path.mkdir(parents=True, exist_ok=True)

    all_db_infos = {}
    group_counter = 0

    for index in tqdm(range(len(dataset))):
        image_idx = index
        # modified to nuscenes
        sensor_data = dataset.get_sensor_data(index)
        if "image_idx" in sensor_data["metadata"]:
            image_idx = sensor_data["metadata"]["image_idx"]

        if nsweeps > 1:
            points = sensor_data["lidar"]["combined"]
        else:
            points = sensor_data["lidar"]["points"]

        annos = sensor_data["lidar"]["annotations"]
        gt_boxes = annos["boxes"]
        names = annos["names"]

        if dataset_class_name == 'WAYMO':
            # waymo dataset contains millions of objects and it is not possible to store
            # all of them into a single folder
            # we randomly sample a few objects for gt augmentation
            # We keep all cyclist as they are rare
            if index % 4 != 0:
                mask = np.logical_not(names == 'VEHICLE')
                names = names[mask]
                gt_boxes = gt_boxes[mask]

            if index % 2 != 0:
                mask = np.logical_not(names == 'PEDESTRIAN')
                names = names[mask]
                gt_boxes = gt_boxes[mask]

        # Maps a frame-local group id to a globally unique one.
        group_dict = {}
        if "group_ids" in annos:
            group_ids = annos["group_ids"]
        else:
            group_ids = np.arange(gt_boxes.shape[0], dtype=np.int64)
        if "difficulty" in annos:
            difficulty = annos["difficulty"]
        else:
            difficulty = np.zeros(gt_boxes.shape[0], dtype=np.int32)

        num_obj = gt_boxes.shape[0]
        if num_obj == 0:
            continue
        point_indices = box_np_ops.points_in_rbbox(points, gt_boxes)
        for i in range(num_obj):
            if used_classes is not None and names[i] not in used_classes:
                continue

            filename = f"{image_idx}_{names[i]}_{i}.bin"
            dirpath = os.path.join(str(db_path), names[i])
            os.makedirs(dirpath, exist_ok=True)

            filepath = os.path.join(dirpath, filename)
            gt_points = points[point_indices[:, i]]
            # Store the object's points relative to its box centre.
            gt_points[:, :3] -= gt_boxes[i, :3]
            # Binary mode: tofile writes raw array bytes, which text mode
            # may corrupt on platforms that translate newlines.
            with open(filepath, "wb") as f:
                try:
                    gt_points.tofile(f)
                except OSError:
                    # Best effort: report the frame and skip its remaining objects.
                    print("process {} files".format(index))
                    break

            if relative_path:
                db_dump_path = os.path.join(db_path.stem, names[i], filename)
            else:
                db_dump_path = str(filepath)

            db_info = {
                "name": names[i],
                "path": db_dump_path,
                "image_idx": image_idx,
                "gt_idx": i,
                "box3d_lidar": gt_boxes[i],
                "num_points_in_gt": gt_points.shape[0],
                "difficulty": difficulty[i],
            }
            local_group_id = group_ids[i]
            if local_group_id not in group_dict:
                group_dict[local_group_id] = group_counter
                group_counter += 1
            db_info["group_id"] = group_dict[local_group_id]
            if "score" in annos:
                db_info["score"] = annos["score"][i]
            all_db_infos.setdefault(names[i], []).append(db_info)

    print("dataset length: ", len(dataset))
    for k, v in all_db_infos.items():
        print(f"load {len(v)} {k} database infos")

    with open(dbinfo_path, "wb") as f:
        pickle.dump(all_db_infos, f)
dataset. + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each + process can pass a DistributedSampler instance as a DataLoader sampler, + and load a subset of the original dataset that is exclusive to it. + .. note:: + Dataset is assumed to be of constant size. + Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + """ + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + self.shuffle = shuffle + + def __iter__(self): + if self.shuffle: + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + indices += indices[: (self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank : self.total_size : self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/datasets/utils/eval.py 
def get_split_parts(num, num_part):
    """Return the chunk sizes used to process ``num`` items piecewise.

    The list holds ``num_part`` entries of ``num // num_part``; when the
    division leaves a remainder, that remainder is appended as one extra
    entry (so the list can have ``num_part + 1`` elements).
    """
    chunk_size, leftover = divmod(num, num_part)
    parts = [chunk_size] * num_part
    if leftover != 0:
        parts.append(leftover)
    return parts
z_axis=1, z_center=1.0 +): + """fast iou algorithm. this function can be used independently to + do result analysis. + Args: + gt_annos: dict, must from get_label_annos() in kitti_common.py + dt_annos: dict, must from get_label_annos() in kitti_common.py + metric: eval type. 0: bbox, 1: bev, 2: 3d + num_parts: int. a parameter for fast calculate algorithm + z_axis: height axis. kitti camera use 1, lidar use 2. + """ + assert len(gt_annos) == len(dt_annos) + total_dt_num = np.stack([len(a["name"]) for a in dt_annos], 0) + total_gt_num = np.stack([len(a["name"]) for a in gt_annos], 0) + num_examples = len(gt_annos) + split_parts = get_split_parts(num_examples, num_parts) + parted_overlaps = [] + example_idx = 0 + bev_axes = list(range(3)) + bev_axes.pop(z_axis) + split_parts = [i for i in split_parts if i != 0] + for num_part in split_parts: + gt_annos_part = gt_annos[example_idx : example_idx + num_part] + dt_annos_part = dt_annos[example_idx : example_idx + num_part] + if metric == 0: + gt_boxes = np.concatenate([a["bbox"] for a in gt_annos_part], 0) + dt_boxes = np.concatenate([a["bbox"] for a in dt_annos_part], 0) + overlap_part = image_box_overlap(gt_boxes, dt_boxes) + elif metric == 1: + loc = np.concatenate([a["location"][:, bev_axes] for a in gt_annos_part], 0) + dims = np.concatenate( + [a["dimensions"][:, bev_axes] for a in gt_annos_part], 0 + ) + rots = np.concatenate([a["rotation_y"] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) + loc = np.concatenate([a["location"][:, bev_axes] for a in dt_annos_part], 0) + dims = np.concatenate( + [a["dimensions"][:, bev_axes] for a in dt_annos_part], 0 + ) + rots = np.concatenate([a["rotation_y"] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) + overlap_part = bev_box_overlap(gt_boxes, dt_boxes).astype(np.float32) + elif metric == 2: + loc = np.concatenate([a["location"] for a in gt_annos_part], 0) + dims = 
@numba.jit(nopython=True)
def compute_statistics_jit(
    overlaps,
    gt_datas,
    dt_datas,
    ignored_gt,
    ignored_det,
    dc_bboxes,
    metric,
    min_overlap,
    thresh=0,
    compute_fp=False,
    compute_aos=False,
):
    """Match detections to ground truths for one sample and count TP / FP / FN.

    Args:
        overlaps: 2-D overlap matrix, read as ``overlaps[det_index, gt_index]``.
        gt_datas: Per-ground-truth rows; column 4 is read as the alpha angle.
        dt_datas: Per-detection rows; columns 0-3 are read as the box, column 4
            as alpha and the last column as the score.
        ignored_gt: Per-ground-truth flag. ``-1`` entries are skipped, ``0``
            entries count towards TP/FN, ``1`` entries can absorb a detection
            without being counted.
        ignored_det: Per-detection flag. ``-1`` entries are never matched,
            ``1`` entries can be matched but are not counted as TP or FP.
        dc_bboxes: Boxes compared against the detections when ``metric == 0``
            (presumably "don't care" regions -- confirm against the caller).
        metric: Only the value ``0`` is treated specially here.
        min_overlap: A match requires an overlap strictly above this value.
        thresh: Score threshold; only applied when ``compute_fp`` is true.
        compute_fp: Enables score thresholding, max-overlap matching and
            false-positive counting.
        compute_aos: Additionally accumulate the orientation similarity.

    Returns:
        ``(tp, fp, fn, similarity, thresholds)`` where ``thresholds`` holds
        the scores of the true positives in match order.  ``similarity`` stays
        ``0`` unless both ``compute_fp`` and ``compute_aos`` are set.
    """

    det_size = dt_datas.shape[0]
    gt_size = gt_datas.shape[0]
    dt_scores = dt_datas[:, -1]
    dt_alphas = dt_datas[:, 4]
    gt_alphas = gt_datas[:, 4]
    dt_bboxes = dt_datas[:, :4]
    # gt_bboxes = gt_datas[:, :4]

    # assigned_detection[j]: detection j is already consumed by some match.
    assigned_detection = [False] * det_size
    # ignored_threshold[j]: detection j scored below `thresh` (compute_fp only).
    ignored_threshold = [False] * det_size
    if compute_fp:
        for i in range(det_size):
            if dt_scores[i] < thresh:
                ignored_threshold[i] = True
    # Sentinel meaning "no detection matched this ground truth yet".
    NO_DETECTION = -10000000
    tp, fp, fn, similarity = 0, 0, 0, 0
    # thresholds = [0.0]
    # delta = [0.0]
    # Fixed-size buffers with a fill index stand in for growing lists.
    thresholds = np.zeros((gt_size,))
    thresh_idx = 0
    delta = np.zeros((gt_size,))
    delta_idx = 0
    for i in range(gt_size):
        if ignored_gt[i] == -1:
            continue
        det_idx = -1
        valid_detection = NO_DETECTION
        max_overlap = 0
        assigned_ignored_det = False

        # Pick the detection to pair with ground truth i.
        for j in range(det_size):
            if ignored_det[j] == -1:
                continue
            if assigned_detection[j]:
                continue
            if ignored_threshold[j]:
                continue
            overlap = overlaps[j, i]
            dt_score = dt_scores[j]
            if (
                not compute_fp
                and (overlap > min_overlap)
                and dt_score > valid_detection
            ):
                # Without FP counting: keep the highest-scoring overlapping detection.
                det_idx = j
                valid_detection = dt_score
            elif (
                compute_fp
                and (overlap > min_overlap)
                and (overlap > max_overlap or assigned_ignored_det)
                and ignored_det[j] == 0
            ):
                # With FP counting: prefer the largest overlap among counted
                # detections; a counted detection also replaces an ignored one.
                max_overlap = overlap
                det_idx = j
                valid_detection = 1
                assigned_ignored_det = False
            elif (
                compute_fp
                and (overlap > min_overlap)
                and (valid_detection == NO_DETECTION)
                and ignored_det[j] == 1
            ):
                # Fall back to an ignored detection only while nothing else matched.
                det_idx = j
                valid_detection = 1
                assigned_ignored_det = True

        if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0:
            fn += 1
        elif (valid_detection != NO_DETECTION) and (
            ignored_gt[i] == 1 or ignored_det[det_idx] == 1
        ):
            # Match involving an ignored ground truth or detection: consume
            # the detection but count nothing.
            assigned_detection[det_idx] = True
        elif valid_detection != NO_DETECTION:
            # only a tp add a threshold.
            tp += 1
            # thresholds.append(dt_scores[det_idx])
            thresholds[thresh_idx] = dt_scores[det_idx]
            thresh_idx += 1
            if compute_aos:
                # delta.append(gt_alphas[i] - dt_alphas[det_idx])
                delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx]
                delta_idx += 1

            assigned_detection[det_idx] = True
    if compute_fp:
        # Every remaining counted, above-threshold detection is a false positive.
        for i in range(det_size):
            if not (
                assigned_detection[i]
                or ignored_det[i] == -1
                or ignored_det[i] == 1
                or ignored_threshold[i]
            ):
                fp += 1
        # nstuff: unassigned detections that overlap a dc_bboxes entry; they
        # are removed from the FP count below.
        nstuff = 0
        if metric == 0:
            overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0)
            for i in range(dc_bboxes.shape[0]):
                for j in range(det_size):
                    if assigned_detection[j]:
                        continue
                    if ignored_det[j] == -1 or ignored_det[j] == 1:
                        continue
                    if ignored_threshold[j]:
                        continue
                    if overlaps_dt_dc[j, i] > min_overlap:
                        assigned_detection[j] = True
                        nstuff += 1
        fp -= nstuff
        if compute_aos:
            # The first `fp` slots stay zero; the rest hold one similarity
            # term (1 + cos(delta)) / 2 per recorded true positive.
            tmp = np.zeros((fp + delta_idx,))
            # tmp = [0] * fp
            for i in range(delta_idx):
                tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0
                # tmp.append((1.0 + np.cos(delta[i])) / 2.0)
            # assert len(tmp) == fp + tp
            # assert len(delta) == tp
            if tp > 0 or fp > 0:
                similarity = np.sum(tmp)
            else:
                # -1 marks "no TP and no FP" for this sample.
                similarity = -1
    return tp, fp, fn, similarity, thresholds[:thresh_idx]
elif criterion == 1: + ua = qbox_area + else: + ua = 1.0 + overlaps[n, k] = iw * ih / ua + return overlaps + + +def bev_box_overlap(boxes, qboxes, criterion=-1, stable=False): + if stable: + riou = box_np_ops.riou_cc(boxes, qboxes) + else: + riou = rotate_iou_gpu_eval(boxes, qboxes, criterion) + return riou + + +@numba.jit(nopython=True, parallel=True) +def box3d_overlap_kernel(boxes, qboxes, rinc, criterion=-1, z_axis=1, z_center=1.0): + """ + z_axis: the z (height) axis. + z_center: unified z (height) center of box. + """ + N, K = boxes.shape[0], qboxes.shape[0] + for i in range(N): + for j in range(K): + if rinc[i, j] > 0: + min_z = min( + boxes[i, z_axis] + boxes[i, z_axis + 3] * (1 - z_center), + qboxes[j, z_axis] + qboxes[j, z_axis + 3] * (1 - z_center), + ) + max_z = max( + boxes[i, z_axis] - boxes[i, z_axis + 3] * z_center, + qboxes[j, z_axis] - qboxes[j, z_axis + 3] * z_center, + ) + iw = min_z - max_z + if iw > 0: + area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5] + area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5] + inc = iw * rinc[i, j] + if criterion == -1: + ua = area1 + area2 - inc + elif criterion == 0: + ua = area1 + elif criterion == 1: + ua = area2 + else: + ua = 1.0 + rinc[i, j] = inc / ua + else: + rinc[i, j] = 0.0 + + +def box3d_overlap(boxes, qboxes, criterion=-1, z_axis=1, z_center=1.0): + """kitti camera format z_axis=1. 
+ """ + bev_axes = list(range(7)) + bev_axes.pop(z_axis + 3) + bev_axes.pop(z_axis) + rinc = rotate_iou_gpu_eval(boxes[:, bev_axes], qboxes[:, bev_axes], 2) + box3d_overlap_kernel(boxes, qboxes, rinc, criterion, z_axis, z_center) + return rinc diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/datasets/utils/oss.py b/cv/3d_detection/centerpoint/pytorch/det3d/datasets/utils/oss.py new file mode 100644 index 0000000000000000000000000000000000000000..66773b72009d0ee0d83183371747700e7752a525 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/datasets/utils/oss.py @@ -0,0 +1,575 @@ +"""\ +This module offser helpers for OSS operation. + +Basic Use +---------- +Create an :class:`OSSPath` object:: + + >>> p = OSSPath('s3://mybucket/myprefix/mykey.bin') + OSSPath('s3://mybucket/myprefix/mykey.bin') + >>> OSSPath() / "mybucket" / "myprefix" / "mykey.bin" + OSSPath('s3://mybucket/myprefix/mykey.bin') + + +Querying object properies:: + + >>> p.exists() + True + >>> p.is_dir() + False + >>> p.is_file() + True + >>> p.get_size() + 256 + +Access path properties:: + + >>> p.bucket + "mybucket" + >>> p.key + "myprefix/mykey.bin" + >>> p.name + "mykey.bin" + >>> p.stem + "mykey" + >> p.suffix + ".bin" + >> p.suffixes + [".bin"] + >>> p.parent + OSSPath('s3://mybucket/myprefix') + >>> p.root + OSSPath('s3://mybucket') + +Uploading content to an object:: + + >>> p.put(b"some bytes\n") + True + +Uploading file to an object:: + + >>> p.put(open('/path/some/image.jpg', 'rb')) + +Reading an object:: + + >>> f = p.download() + >>> f.read() + b"some bytes" + >>> p.download(encoding='utf-8') + >>> f.read() + "some bytes" + +Deleting an object:: + + >>> p.delete() + True + +Path manipulations:: + + >>> p = OSSPath('s3://mybucket/myprefix/mykey.bin') + >>> p.with_name('mykey2.bin') + OSSPath("s3://mybucket/myprefix/mykey2.bin") + >>> p.with_suffix('.txt') + OSSPath("s3://mybucket/myprefix/mykey.txt") + >>> p.with_bucket('some_bucket') + 
def get_site():
    """Return the site label taken from this host's FQDN, or ``None``.

    The label is the dotted component that directly precedes a trailing
    ``.brainpp.cn`` in ``socket.getfqdn()``.
    """
    m = re.search(r"([^.]+)\.brainpp\.cn$", socket.getfqdn())
    if m:
        return m.group(1)
    return None


# NOTE(review): on hosts outside *.brainpp.cn get_site() is None, so the
# fallback becomes "http://oss.None.brainpp.cn"; export OSS_ENDPOINT there.
OSS_ENDPOINT = os.getenv(
    "OSS_ENDPOINT", default="http://oss.{}.brainpp.cn".format(get_site()),
)


class OSSPath:
    """pathlib-flavoured handle on an ``s3://bucket/key`` location.

    An instance stores a boto3 S3 client, a bucket name and the key split into
    components.  Pure path manipulations (``parent``, ``with_name``, ``/`` ...)
    never talk to the server; the query and transfer methods do.
    """

    __slots__ = ("_client", "bucket", "_key_parts")

    def __new__(cls, s3url: Optional[str] = None, endpoint_url=OSS_ENDPOINT):
        """Create a path from ``s3url`` with a fresh boto3 client.

        :param s3url: ``s3://bucket/key`` string; ``None`` gives an empty path
            with neither bucket nor key.
        :param endpoint_url: forwarded to ``boto3.client``.
        :raises ValueError: if ``s3url`` does not start with ``s3://``.
        """
        _client = boto3.client("s3", endpoint_url=endpoint_url)
        bucket, parts = cls._parse_s3url(s3url)
        return cls._create(_client, bucket, parts)

    @classmethod
    def _parse_s3url(cls, s3url: Optional[str] = None):
        """Split ``s3url`` into ``(bucket, key_parts)``."""
        if s3url is None:
            return "", ()

        if not s3url.startswith("s3://"):
            raise ValueError(
                "s3url must be formated as 's3:///path/to/object'"
            )

        r = urlparse(s3url)
        key = r.path.lstrip("/")  # remove the leading /
        return r.netloc, PosixPath(key).parts

    @classmethod
    def _create(cls, client, bucket: str, key_parts: Tuple[str, ...]):
        """Low-level constructor that reuses an existing client."""
        assert isinstance(key_parts, tuple)
        self = object.__new__(cls)
        self._client = client
        self.bucket = bucket
        self._key_parts = key_parts
        return self

    # ------------------------------------------------------------------
    # internal helpers
    # ------------------------------------------------------------------
    def _dir_prefix(self) -> str:
        """Key with a trailing "/", or "" for the bucket root.

        The empty case matters: indexing ``key[-1]`` on the bucket root used
        to raise IndexError in every directory-level operation.
        """
        key = self.key
        if key and not key.endswith("/"):
            key += "/"
        return key

    @staticmethod
    def _next_marker(resp):
        """Marker for the page following ``resp``, or ``None`` if unknown.

        S3 only includes ``NextMarker`` when a delimiter was requested; for
        other truncated listings the last returned key is the marker.
        """
        marker = resp.get("NextMarker")
        if marker:
            return marker
        contents = resp.get("Contents")
        if contents:
            return contents[-1]["Key"]
        return None

    @staticmethod
    def _clip_batch_size(batch_size):
        """Limit ``batch_size`` to 1000, telling the user when clipped."""
        if batch_size > 1000:
            print(
                "At most 1000 keys can be operated at once. Clipping batch_size to 1000."
            )
            batch_size = 1000
        return batch_size

    # ------------------------------------------------------------------
    # pure path properties
    # ------------------------------------------------------------------
    @property
    def key(self) -> str:
        """The object key: all components joined with "/"."""
        return "/".join(self._key_parts)

    @property
    def parent(self):
        """The logical parent of the path (the path itself at the root)."""
        if not len(self._key_parts):
            return self

        return self._create(self._client, self.bucket, self._key_parts[:-1])

    @property
    def root(self):
        """The bucket-level path (same bucket, empty key)."""
        return self._create(self._client, self.bucket, key_parts=())

    @property
    def name(self):
        """The final key component, or "" when the key is empty."""
        if len(self._key_parts) < 1:
            return ""
        return self._key_parts[-1]

    @property
    def suffix(self):
        """The final component's last suffix, if any."""
        name = self.name
        i = name.rfind(".")
        if 0 < i < len(name) - 1:
            return name[i:]
        return ""

    @property
    def suffixes(self):
        """A list of the final component's suffixes, if any."""
        name = self.name
        if name.endswith("."):
            return []
        name = name.lstrip(".")
        return ["." + suffix for suffix in name.split(".")[1:]]

    @property
    def stem(self):
        """The final path component, minus its last suffix."""
        name = self.name
        i = name.rfind(".")
        if 0 < i < len(name) - 1:
            return name[:i]
        return name

    @property
    def parts(self):
        """An object providing sequence-like access to the
        components in the filesystem path."""
        return self._key_parts

    # ------------------------------------------------------------------
    # dunder protocol
    # ------------------------------------------------------------------
    def __str__(self) -> str:
        return "s3://{}/{}".format(self.bucket, self.key)

    def __eq__(self, other):
        if not isinstance(other, OSSPath):
            return False
        return self.bucket == other.bucket and self.key == other.key

    def __hash__(self):
        return hash(str(self))

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__, str(self))

    # Ordering compares the string form; non-OSSPath operands keep raising
    # NotImplementedError exactly as before.
    def __lt__(self, other):
        if not isinstance(other, OSSPath):
            raise NotImplementedError()
        return str(self) < str(other)

    def __le__(self, other):
        if not isinstance(other, OSSPath):
            raise NotImplementedError()
        return str(self) <= str(other)

    def __gt__(self, other):
        if not isinstance(other, OSSPath):
            raise NotImplementedError()
        return str(self) > str(other)

    def __ge__(self, other):
        if not isinstance(other, OSSPath):
            raise NotImplementedError()
        return str(self) >= str(other)

    # ------------------------------------------------------------------
    # path manipulation
    # ------------------------------------------------------------------
    def with_name(self, name):
        """Return a new path with the file name changed.

        :raises ValueError: if this path has no name, or ``name`` contains a
            "/", a URL scheme or a network location.
        """
        if not self.name:
            raise ValueError("%r has an empty name" % (self,))

        r = urlparse(name)
        # The old test `not (scheme == "" and netloc == "" or "/" in name)`
        # let any name containing "/" through; reject all three cases.
        if r.scheme or r.netloc or "/" in name:
            raise ValueError("invalid name %r" % (name,))

        return self._create(self._client, self.bucket, self._key_parts[:-1] + (name,))

    def with_suffix(self, suffix):
        """Return a new path with the file suffix changed.  If the path
        has no suffix, add given suffix.  If the given suffix is an empty
        string, remove the suffix from the path.
        """
        if "/" in suffix:
            raise ValueError("Invalid suffix %r" % (suffix,))
        if suffix and not suffix.startswith(".") or suffix == ".":
            raise ValueError("Invalid suffix %r" % (suffix,))
        name = self.name
        if not name:
            raise ValueError("%r has an empty name" % (self,))
        old_suffix = self.suffix
        if not old_suffix:
            name = name + suffix
        else:
            name = name[: -len(old_suffix)] + suffix
        return self._create(self._client, self.bucket, self._key_parts[:-1] + (name,))

    def with_bucket(self, bucket):
        """Return a new path with the same key in another bucket."""
        if not isinstance(bucket, str):
            raise ValueError("bucket be string")

        bucket = bucket.strip("/")
        if not bucket:
            raise ValueError("bucket must not be empty")
        if "/" in bucket:
            raise ValueError("bucket_name must not contain '/'")
        return self._create(self._client, bucket, self._key_parts)

    def _make_child(self, args: Iterable[str]):
        """Append ``args`` to this path.

        On a bucket-less path the first component of the first argument is
        consumed as the bucket name.
        """
        if not self.bucket:
            bucket, *rest_args = args
            bucket = bucket.lstrip("/")
            bucket, *rest_parts = PosixPath(bucket).parts
            return self.with_bucket(bucket)._make_child(rest_parts + rest_args)

        parts = list(self._key_parts)
        for item in args:
            if not isinstance(item, str):
                raise ValueError("child must be string")
            item = item.lstrip("/")  # remove leading '/'
            if not item:
                raise ValueError("child must not be empty")
            parts.extend(PosixPath(item).parts)

        return self._create(self._client, self.bucket, tuple(parts))

    def joinpath(self, *args):
        """Combine this path with one or several string arguments and return
        the resulting sub-path.
        """
        return self._make_child(args)

    def __truediv__(self, key):
        return self._make_child((key,))

    def __rtruediv__(self, key):
        # `raise NotImplemented` is itself a TypeError; returning the sentinel
        # lets Python produce the regular unsupported-operand error.
        return NotImplemented

    # ------------------------------------------------------------------
    # server queries
    # ------------------------------------------------------------------
    def is_dir(self):
        """Whether this path is an existing bucket or a non-empty prefix.

        :raises ClientError: for any bucket lookup failure other than 404.
        """
        if not self.bucket:
            return False

        if not self.key:
            # key empty, return whether bucket exists
            try:
                self._client.head_bucket(Bucket=self.bucket)
            except ClientError as e:
                if e.response["Error"]["Code"] == "404":
                    return False
                raise
            return True

        resp = self._client.list_objects(
            Bucket=self.bucket, Delimiter="/", Prefix=self._dir_prefix()
        )
        return "CommonPrefixes" in resp or "Contents" in resp

    def is_file(self):
        """Whether an object exists at exactly this key.

        :raises ClientError: for any lookup failure other than 404.
        """
        if not self.bucket:
            return False
        if not self.key:
            return False
        try:
            self._client.head_object(Bucket=self.bucket, Key=self.key)
        except ClientError as e:
            if e.response["Error"]["Code"] == "404":
                return False
            raise
        return True

    def exists(self):
        """Whether this path is an existing directory or object."""
        if not self.bucket:
            return False
        return bool(self.is_dir() or self.is_file())

    def get_size(self):
        """Object size in bytes; 0 for a directory, -1 if nothing exists."""
        if not self.bucket:
            return -1
        if self.is_dir():
            return 0
        if not self.is_file():
            return -1

        key = self.key.lstrip("/")
        return self._client.head_object(Bucket=self.bucket, Key=key)["ContentLength"]

    def list_all(self, batch_size=1000):
        """\
        List all subkeys
        :returns: Iterator[OSSPath]
        """
        if not self.is_dir():
            return

        batch_size = self._clip_batch_size(batch_size)
        prefix = self._dir_prefix()

        marker = None
        while True:
            request = dict(
                Bucket=self.bucket, Delimiter="", Prefix=prefix, MaxKeys=batch_size,
            )
            if marker:
                request["Marker"] = marker

            resp = self._client.list_objects(**request)

            for p in resp.get("Contents", []):
                yield self.root / p["Key"]

            if not resp["IsTruncated"]:
                break

            print(
                "More than {} objects are found under {}, you should avoid putting too many small objects!".format(
                    batch_size, self
                )
            )
            marker = self._next_marker(resp)
            if marker is None:
                # No way to resume; stop instead of re-reading page one forever.
                break

    def walk(self, topdown=True, recursive=True, batch_size=1000):
        """\
        Generate path tree by walking either top-down or bottom-up just like :func:`os.walk`.
        For each prefix in the tree, it yields a 3-tuple (subtree-root, subdirs, subfiles).

        If optional argument *topdown* is True or not specified, the triple for a directory
        is generated before the triples for any subdirectories. If *topdown* is False,
        the triple for a directory is generated after its subdirectories.

        If *recursive* is set to False, it only yields the top level subdirectories and subfiles.

        *batch_size* is the maximum keys that OSS returns in one request-response,
        and it cannot be set larger than 1000.
        """
        if not self.is_dir():
            return

        batch_size = self._clip_batch_size(batch_size)
        prefix = self._dir_prefix()

        dirs, files = [], []
        marker = None
        while True:
            request = dict(
                Bucket=self.bucket, Delimiter="/", Prefix=prefix, MaxKeys=batch_size,
            )
            if marker:
                request["Marker"] = marker

            resp = self._client.list_objects(**request)

            dirs += [self.root / p["Prefix"] for p in resp.get("CommonPrefixes", [])]
            files += [self.root / p["Key"] for p in resp.get("Contents", [])]

            if not resp["IsTruncated"]:
                break

            print(
                "More than {} objects are found under {}, you should avoid putting too many small objects!".format(
                    batch_size, self
                )
            )
            marker = self._next_marker(resp)
            if marker is None:
                break

        if topdown:
            yield self, dirs, files

        if recursive:
            for subdir in dirs:
                yield from subdir.walk(
                    recursive=True, topdown=topdown, batch_size=batch_size
                )

        if not topdown:
            yield self, dirs, files

    def iterdir(self, batch_size=1000):
        """
        Iterates over self directory, yields subdirs and subfiles.
        :returns: Iterator[OSSPath]
        """
        for root, dirs, files in self.walk(batch_size=batch_size, recursive=False):
            yield from dirs
            yield from files

    # ------------------------------------------------------------------
    # transfers
    # ------------------------------------------------------------------
    def download(self, encoding=None) -> Optional[io.IOBase]:
        """
        :param encoding: if None, it returns bytes io;
            if an encoding (such as 'utf-8') is specified, it returns text io

        :returns: file-like object which can be read out
        :raises FileNotFoundError: if no object exists at this key
        """
        if not self.is_file():
            raise FileNotFoundError("{!r} is not an existing object.".format(self))

        r = self._client.get_object(Bucket=self.bucket, Key=self.key)
        b = r["Body"]
        if encoding is not None:
            b = codecs.getreader(encoding)(b)

        return b

    def put(self, bytes_or_file) -> bool:
        """
        :param bytes_or_file: bytes or file-like object to be uploaded to OSS
        :returns: whether successfully uploaded
        :raises ValueError: if the bucket or key is empty
        """
        if not self.bucket or not self.key:
            raise ValueError("Invalid path to put object: {!r}".format(self))
        if self.key.endswith("/"):
            raise ValueError('Object key cannot endswith "/": {}'.format(self.key))

        r = self._client.put_object(
            Body=bytes_or_file, Bucket=self.bucket, Key=self.key,
        )
        return r["ResponseMetadata"]["HTTPStatusCode"] == 200

    def delete(self) -> bool:
        """
        :returns: whether this object is deleted (True if it never existed)
        """
        if not self.is_file():
            return True
        r = self._client.delete_object(Bucket=self.bucket, Key=self.key)

        return r["ResponseMetadata"]["HTTPStatusCode"] == 204

    def rmtree(self, batch_size=1000) -> List[str]:
        """
        Delete every object under this prefix.

        :returns: list of deleted objects as ``s3://`` URLs.  For backward
            compatibility ``True`` is returned when nothing exists here.
        :raises ValueError: if this path is a plain object
        """
        if not self.is_dir():
            if self.is_file():
                raise ValueError("{!r} is not a directory".format(self))
            return True

        batch_size = self._clip_batch_size(batch_size)
        prefix = self._dir_prefix()

        ret = []
        while True:
            # No marker is needed: each pass removes the keys it has listed.
            lr = self._client.list_objects(
                Bucket=self.bucket, Delimiter="", Prefix=prefix, MaxKeys=batch_size,
            )

            targets = [{"Key": i["Key"]} for i in lr.get("Contents", [])]
            if not targets:
                # An empty Delete request is rejected by S3 (e.g. empty bucket).
                break

            dr = self._client.delete_objects(
                Bucket=self.bucket, Delete={"Objects": targets},
            )

            deleted = dr.get("Deleted", [])
            for i in deleted:
                ret.append("s3://{}/{}".format(self.bucket, i["Key"]))

            if not lr["IsTruncated"]:
                break
            if not deleted:
                # Nothing was removed, so the next listing would be identical.
                break

            print(
                "More than {} objects are found under {}, you should avoid putting too many small objects!".format(
                    batch_size, self
                )
            )

        return ret
pipeline=None, + class_names=None, + test_mode=False, + sample=False, + nsweeps=1, + load_interval=1, + **kwargs, + ): + self.load_interval = load_interval + self.sample = sample + self.nsweeps = nsweeps + print("Using {} sweeps".format(nsweeps)) + super(WaymoDataset, self).__init__( + root_path, info_path, pipeline, test_mode=test_mode, class_names=class_names + ) + + self._info_path = info_path + self._class_names = class_names + self._num_point_features = WaymoDataset.NumPointFeatures if nsweeps == 1 else WaymoDataset.NumPointFeatures+1 + + def reset(self): + assert False + + def load_infos(self, info_path): + + with open(self._info_path, "rb") as f: + _waymo_infos_all = pickle.load(f) + + self._waymo_infos = _waymo_infos_all[::self.load_interval] + + print("Using {} Frames".format(len(self._waymo_infos))) + + def __len__(self): + + if not hasattr(self, "_waymo_infos"): + self.load_infos(self._info_path) + + return len(self._waymo_infos) + + def get_sensor_data(self, idx): + info = self._waymo_infos[idx] + + res = { + "lidar": { + "type": "lidar", + "points": None, + "annotations": None, + "nsweeps": self.nsweeps, + }, + "metadata": { + "image_prefix": self._root_path, + "num_point_features": self._num_point_features, + "token": info["token"], + }, + "calib": None, + "cam": {}, + "mode": "val" if self.test_mode else "train", + "type": "WaymoDataset", + } + + data, _ = self.pipeline(res, info) + + return data + + def __getitem__(self, idx): + return self.get_sensor_data(idx) + + def evaluation(self, detections, output_dir=None, testset=False): + from .waymo_common import _create_pd_detection, reorganize_info + + infos = self._waymo_infos + infos = reorganize_info(infos) + + _create_pd_detection(detections, infos, output_dir) + + print("use waymo devkit tool for evaluation") + + return None, None + diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/datasets/waymo/waymo_common.py b/cv/3d_detection/centerpoint/pytorch/det3d/datasets/waymo/waymo_common.py new 
class UUIDGeneration():
    """Hands out one random hex UUID per seed and remembers it.

    Asking again with the same seed returns the same string for the lifetime
    of the instance.
    """

    def __init__(self):
        # seed -> uuid4().hex string, filled lazily by get_uuid()
        self.mapping = {}

    def get_uuid(self, seed):
        """Return the UUID bound to ``seed``, creating it on first request."""
        try:
            return self.mapping[seed]
        except KeyError:
            fresh = uuid.uuid4().hex
            self.mapping[seed] = fresh
            return fresh
def _create_gt_detection(infos, tracking=True, result_path=None):
    """Creates a gt prediction object file for local evaluation.

    Every annotated box that has at least one lidar point and whose label is
    not 'UNKNOWN' is written, with score 1.0, into ``gt_preds.bin`` inside
    ``result_path``.

    :param infos: sequence of info dicts; each needs an ``'anno_path'`` entry
        pointing at a pickled annotation file.
    :param tracking: accepted for call compatibility; not used here.
    :param result_path: output directory.  When omitted, the module-level
        ``args.result_path`` set by the script entry point is used.
    :raises ValueError: if no ``result_path`` is given and the module was not
        run as a script (previously this surfaced as a NameError only after
        all frames had been processed).
    """
    from waymo_open_dataset import label_pb2
    from waymo_open_dataset.protos import metrics_pb2

    if result_path is None:
        # Legacy path: the CLI stores its parsed options in a global `args`.
        cli_args = globals().get("args")
        if cli_args is None:
            raise ValueError(
                "result_path must be given when not running as a script"
            )
        result_path = cli_args.result_path

    objects = metrics_pb2.Objects()

    for info in tqdm(infos):
        obj = get_obj(info['anno_path'])
        annos = obj['objects']
        num_points_in_gt = np.array([ann['num_points'] for ann in annos])
        box3d = np.array([ann['box'] for ann in annos])

        if len(box3d) == 0:
            continue

        names = np.array([TYPE_LIST[ann['label']] for ann in annos])

        # Keep the first six columns plus the last one, used as heading below.
        box3d = box3d[:, [0, 1, 2, 3, 4, 5, -1]]

        for i in range(box3d.shape[0]):
            if num_points_in_gt[i] == 0:
                continue
            if names[i] == 'UNKNOWN':
                continue

            det = box3d[i]
            score = 1.0
            label = names[i]

            o = metrics_pb2.Object()
            o.context_name = obj['scene_name']
            o.frame_timestamp_micros = int(obj['frame_name'].split("_")[-1])

            # Populating box and score.
            box = label_pb2.Label.Box()
            box.center_x = det[0]
            box.center_y = det[1]
            box.center_z = det[2]
            box.length = det[3]
            box.width = det[4]
            box.height = det[5]
            box.heading = det[-1]
            o.object.box.CopyFrom(box)
            o.score = score
            # Use correct type.
            o.object.type = CAT_NAME_TO_ID[label]
            o.object.num_lidar_points_in_box = num_points_in_gt[i]
            o.object.id = annos[i]['name']

            objects.objects.append(o)

    # Write objects to a file; `with` closes the handle even if writing fails.
    with open(os.path.join(result_path, 'gt_preds.bin'), 'wb') as f:
        f.write(objects.SerializeToString())
def sort_frame(frames):
    """Return ``frames`` ordered by sequence id, then by frame id.

    Names are expected to look like ``seq_<seq>_frame_<frame>.pkl``: the
    second and fourth ``_``-separated fields are parsed as integers, after
    dropping the last four characters (the ``.pkl`` extension) from the
    latter.

    Sorting uses the ``(seq, frame)`` pair directly.  The former combined key
    ``seq * 1000 + frame`` mis-ordered or collided as soon as a frame id
    reached 1000.
    """
    def _ids(name):
        fields = name.split("_")
        return int(fields[1]), int(fields[3][:-4])

    return sorted(frames, key=_ids)
def reorganize_info(infos):
    """Index a sequence of info dicts by their ``'token'`` value.

    When several entries share a token, the last one wins.
    """
    return {entry['token']: entry for entry in infos}
def convert(idx):
    """Decode TFRecord file number ``idx`` and pickle each of its frames.

    For every record in ``fnames[idx]`` the lidar data and the annotations
    are decoded with ``waymo_decoder`` and written to
    ``seq_<idx>_frame_<frame_id>.pkl`` under ``LIDAR_PATH`` and ``ANNO_PATH``
    respectively.
    """
    record_file = fnames[idx]
    records = tf.data.TFRecordDataset(record_file, compression_type='')
    for frame_id, data in enumerate(records):
        frame = dataset_pb2.Frame()
        frame.ParseFromString(bytearray(data.numpy()))
        decoded_frame = waymo_decoder.decode_frame(frame, frame_id)
        decoded_annos = waymo_decoder.decode_annos(frame, frame_id)

        # Both outputs share one file name and differ only in directory.
        out_name = 'seq_{}_frame_{}.pkl'.format(idx, frame_id)
        outputs = ((LIDAR_PATH, decoded_frame), (ANNO_PATH, decoded_annos))
        for out_dir, payload in outputs:
            with open(os.path.join(out_dir, out_name), 'wb') as f:
                pickle.dump(payload, f)
def decode_frame(frame, frame_id):
    """Turn a Waymo ``Frame`` proto into a plain dict of lidar data.

    The returned dict holds the scene name, a frame name of the form
    ``<scene>_<location>_<time_of_day>_<timestamp_micros>``, the given
    ``frame_id`` and the output of ``extract_points`` under ``'lidars'``.
    """
    context = frame.context
    lidars = extract_points(frame.lasers,
                            context.laser_calibrations,
                            frame.pose)

    stats = context.stats
    frame_name = '{scene_name}_{location}_{time_of_day}_{timestamp}'.format(
        scene_name=context.name,
        location=stats.location,
        time_of_day=stats.time_of_day,
        timestamp=frame.timestamp_micros)

    return {
        'scene_name': context.name,
        'frame_name': frame_name,
        'frame_id': frame_id,
        'lidars': lidars,
    }
calibration matrices, frame matrices).""" + + veh_to_global = np.array(frame.pose.transform) + + ref_pose = np.reshape(np.array(frame.pose.transform), [4, 4]) + global_from_ref_rotation = ref_pose[:3, :3] + objects = extract_objects(frame.laser_labels, global_from_ref_rotation) + + frame_name = '{scene_name}_{location}_{time_of_day}_{timestamp}'.format( + scene_name=frame.context.name, + location=frame.context.stats.location, + time_of_day=frame.context.stats.time_of_day, + timestamp=frame.timestamp_micros) + + annos = { + 'scene_name': frame.context.name, + 'frame_name': frame_name, + 'frame_id': frame_id, + 'veh_to_global': veh_to_global, + 'objects': objects, + } + + return annos + + +def extract_points_from_range_image(laser, calibration, frame_pose): + """Decode points from lidar.""" + if laser.name != calibration.name: + raise ValueError('Laser and calibration do not match') + if laser.name == dataset_pb2.LaserName.TOP: + frame_pose = tf.convert_to_tensor( + np.reshape(np.array(frame_pose.transform), [4, 4])) + range_image_top_pose = dataset_pb2.MatrixFloat.FromString( + zlib.decompress(laser.ri_return1.range_image_pose_compressed)) + # [H, W, 6] + range_image_top_pose_tensor = tf.reshape( + tf.convert_to_tensor(range_image_top_pose.data), + range_image_top_pose.shape.dims) + # [H, W, 3, 3] + range_image_top_pose_tensor_rotation = transform_utils.get_rotation_matrix( + range_image_top_pose_tensor[..., 0], + range_image_top_pose_tensor[..., 1], range_image_top_pose_tensor[..., + 2]) + range_image_top_pose_tensor_translation = range_image_top_pose_tensor[..., + 3:] + range_image_top_pose_tensor = transform_utils.get_transform( + range_image_top_pose_tensor_rotation, + range_image_top_pose_tensor_translation) + frame_pose = tf.expand_dims(frame_pose, axis=0) + pixel_pose = tf.expand_dims(range_image_top_pose_tensor, axis=0) + else: + pixel_pose = None + frame_pose = None + first_return = zlib.decompress( + laser.ri_return1.range_image_compressed) + second_return 
= zlib.decompress( + laser.ri_return2.range_image_compressed) + points_list = [] + for range_image_str in [first_return, second_return]: + range_image = dataset_pb2.MatrixFloat.FromString(range_image_str) + if not calibration.beam_inclinations: + beam_inclinations = range_image_utils.compute_inclination( + tf.constant([ + calibration.beam_inclination_min, calibration.beam_inclination_max + ]), + height=range_image.shape.dims[0]) + else: + beam_inclinations = tf.constant(calibration.beam_inclinations) + beam_inclinations = tf.reverse(beam_inclinations, axis=[-1]) + extrinsic = np.reshape(np.array(calibration.extrinsic.transform), [4, 4]) + range_image_tensor = tf.reshape( + tf.convert_to_tensor(range_image.data), range_image.shape.dims) + range_image_mask = range_image_tensor[..., 0] > 0 + range_image_cartesian = ( + range_image_utils.extract_point_cloud_from_range_image( + tf.expand_dims(range_image_tensor[..., 0], axis=0), + tf.expand_dims(extrinsic, axis=0), + tf.expand_dims(tf.convert_to_tensor(beam_inclinations), axis=0), + pixel_pose=pixel_pose, + frame_pose=frame_pose)) + range_image_cartesian = tf.squeeze(range_image_cartesian, axis=0) + points_tensor = tf.gather_nd( + tf.concat([range_image_cartesian, range_image_tensor[..., 1:4]], + axis=-1), + tf.where(range_image_mask)) + points_list.append(points_tensor.numpy()) + return points_list + + +def extract_points(lasers, laser_calibrations, frame_pose): + """Extract point clouds.""" + sort_lambda = lambda x: x.name + lasers_with_calibration = zip( + sorted(lasers, key=sort_lambda), + sorted(laser_calibrations, key=sort_lambda)) + points_xyz = [] + points_feature = [] + points_nlz = [] + for laser, calibration in lasers_with_calibration: + points_list = extract_points_from_range_image(laser, calibration, + frame_pose) + points = np.concatenate(points_list, axis=0) + points_xyz.extend(points[..., :3].astype(np.float32)) + points_feature.extend(points[..., 3:5].astype(np.float32)) + points_nlz.extend(points[..., 
5].astype(np.float32)) + return { + 'points_xyz': np.asarray(points_xyz), + 'points_feature': np.asarray(points_feature), + } + +def global_vel_to_ref(vel, global_from_ref_rotation): + # inverse means ref_from_global, rotation_matrix for normalization + vel = [vel[0], vel[1], 0] + ref = np.dot(Quaternion(matrix=global_from_ref_rotation).inverse.rotation_matrix, vel) + ref = [ref[0], ref[1], 0.0] + + return ref + +def extract_objects(laser_labels, global_from_ref_rotation): + """Extract objects.""" + objects = [] + for object_id, label in enumerate(laser_labels): + category_label = label.type + box = label.box + + speed = [label.metadata.speed_x, label.metadata.speed_y] + accel = [label.metadata.accel_x, label.metadata.accel_y] + num_lidar_points_in_box = label.num_lidar_points_in_box + # Difficulty level is 0 if labeler did not say this was LEVEL_2. + # Set difficulty level of "999" for boxes with no points in box. + if num_lidar_points_in_box <= 0: + combined_difficulty_level = 999 + if label.detection_difficulty_level == 0: + # Use points in box to compute difficulty level. 
+ if num_lidar_points_in_box >= 5: + combined_difficulty_level = 1 + else: + combined_difficulty_level = 2 + else: + combined_difficulty_level = label.detection_difficulty_level + + ref_velocity = global_vel_to_ref(speed, global_from_ref_rotation) + + objects.append({ + 'id': object_id, + 'name': label.id, + 'label': category_label, + 'box': np.array([box.center_x, box.center_y, box.center_z, + box.length, box.width, box.height, ref_velocity[0], + ref_velocity[1], box.heading], dtype=np.float32), + 'num_points': + num_lidar_points_in_box, + 'detection_difficulty_level': + label.detection_difficulty_level, + 'combined_difficulty_level': + combined_difficulty_level, + 'global_speed': + np.array(speed, dtype=np.float32), + 'global_accel': + np.array(accel, dtype=np.float32), + }) + return objects diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/models/__init__.py b/cv/3d_detection/centerpoint/pytorch/det3d/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d24d50235c52154c714f956bac0e41eaeb51b2e3 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/models/__init__.py @@ -0,0 +1,43 @@ +import importlib +spconv_spec = importlib.util.find_spec("spconv") +found = spconv_spec is not None +if found: + from .backbones import * # noqa: F401,F403 +else: + print("No spconv, sparse convolution disabled!") +from .bbox_heads import * # noqa: F401,F403 +from .builder import ( + build_backbone, + build_detector, + build_head, + build_loss, + build_neck, + build_roi_head +) +from .detectors import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .readers import * +from .registry import ( + BACKBONES, + DETECTORS, + HEADS, + LOSSES, + NECKS, + READERS, +) +from .second_stage import * +from .roi_heads import * + +__all__ = [ + "READERS", + "BACKBONES", + "NECKS", + "HEADS", + "LOSSES", + "DETECTORS", + "build_backbone", + "build_neck", + "build_head", + "build_loss", + "build_detector", +] diff --git 
def replace_feature(out, new_features):
    """Return ``out`` carrying ``new_features`` as its feature tensor.

    Objects that provide ``replace_feature`` (spconv 2.x behaviour) are
    asked to build the result themselves; otherwise ``out.features`` is
    overwritten in place and ``out`` is returned.
    """
    if hasattr(out, "replace_feature"):
        # spconv 2.x behaviour
        return out.replace_feature(new_features)
    out.features = new_features
    return out


def conv3x3(in_planes, out_planes, stride=1, indice_key=None, bias=True):
    """3x3 submanifold convolution with padding"""
    return spconv.SubMConv3d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=bias,
        indice_key=indice_key,
    )


def conv1x1(in_planes, out_planes, stride=1, indice_key=None, bias=True):
    """1x1 submanifold convolution"""
    # NOTE(review): padding=1 with kernel_size=1 looks unintended; kept as in
    # the original because changing it could alter behaviour - confirm.
    return spconv.SubMConv3d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        padding=1,
        bias=bias,
        indice_key=indice_key,
    )


class SparseBasicBlock(spconv.SparseModule):
    """Residual block made of two 3x3 submanifold convolutions.

    Args:
        inplanes: Number of input channels.
        planes: Number of output channels.
        stride: Stride passed to the first convolution.
        norm_cfg: Norm layer config; defaults to BN1d(eps=1e-3, momentum=0.01).
        downsample: Optional module applied to the input to produce the
            residual branch.
        indice_key: spconv indice key shared by both convolutions.
    """

    expansion = 1

    def __init__(
        self,
        inplanes,
        planes,
        stride=1,
        norm_cfg=None,
        downsample=None,
        indice_key=None,
    ):
        super(SparseBasicBlock, self).__init__()

        if norm_cfg is None:
            norm_cfg = dict(type="BN1d", eps=1e-3, momentum=0.01)

        # The original computed ``norm_cfg is not None`` here, i.e. after the
        # default above was applied, so the convolutions always had a bias.
        # Spelled out explicitly to keep the parameter set unchanged.
        bias = True

        self.conv1 = conv3x3(inplanes, planes, stride, indice_key=indice_key, bias=bias)
        self.bn1 = build_norm_layer(norm_cfg, planes)[1]
        self.relu = nn.ReLU()
        self.conv2 = conv3x3(planes, planes, indice_key=indice_key, bias=bias)
        self.bn2 = build_norm_layer(norm_cfg, planes)[1]
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Apply conv-bn-relu-conv-bn, add the residual, then relu."""
        identity = x

        out = self.conv1(x)
        out = replace_feature(out, self.bn1(out.features))
        out = replace_feature(out, self.relu(out.features))

        out = self.conv2(out)
        out = replace_feature(out, self.bn2(out.features))

        if self.downsample is not None:
            identity = self.downsample(x)

        out = replace_feature(out, out.features + identity.features)
        out = replace_feature(out, self.relu(out.features))

        return out


def _down_stage(in_channels, out_channels, padding, norm_cfg, indice_key):
    """Strided sparse conv + norm + relu followed by two residual blocks.

    The children are added in the same order as the hand-written stages
    they replace, so parameter names are unchanged.
    """
    return spconv.SparseSequential(
        SparseConv3d(in_channels, out_channels, 3, 2, padding=padding, bias=False),
        build_norm_layer(norm_cfg, out_channels)[1],
        nn.ReLU(inplace=True),
        SparseBasicBlock(out_channels, out_channels, norm_cfg=norm_cfg, indice_key=indice_key),
        SparseBasicBlock(out_channels, out_channels, norm_cfg=norm_cfg, indice_key=indice_key),
    )


@BACKBONES.register_module
class SpMiddleResNetFHD(nn.Module):
    """Sparse 3D residual backbone that outputs a dense BEV feature map.

    Args:
        num_input_features: Channel count of the input voxel features.
        norm_cfg: Norm layer config; defaults to BN1d(eps=1e-3, momentum=0.01).
        name: Stored as ``self.name``.
    """

    def __init__(
        self, num_input_features=128, norm_cfg=None, name="SpMiddleResNetFHD", **kwargs
    ):
        super(SpMiddleResNetFHD, self).__init__()
        self.name = name

        self.dcn = None
        self.zero_init_residual = False

        if norm_cfg is None:
            norm_cfg = dict(type="BN1d", eps=1e-3, momentum=0.01)

        # input: # [1600, 1200, 41]
        self.conv_input = spconv.SparseSequential(
            SubMConv3d(num_input_features, 16, 3, bias=False, indice_key="res0"),
            build_norm_layer(norm_cfg, 16)[1],
            nn.ReLU(inplace=True)
        )

        self.conv1 = spconv.SparseSequential(
            SparseBasicBlock(16, 16, norm_cfg=norm_cfg, indice_key="res0"),
            SparseBasicBlock(16, 16, norm_cfg=norm_cfg, indice_key="res0"),
        )

        # [1600, 1200, 41] -> [800, 600, 21]
        self.conv2 = _down_stage(16, 32, 1, norm_cfg, "res1")
        # [800, 600, 21] -> [400, 300, 11]
        self.conv3 = _down_stage(32, 64, 1, norm_cfg, "res2")
        # [400, 300, 11] -> [200, 150, 5]
        self.conv4 = _down_stage(64, 128, [0, 1, 1], norm_cfg, "res3")

        self.extra_conv = spconv.SparseSequential(
            SparseConv3d(
                128, 128, (3, 1, 1), (2, 1, 1), bias=False
            ),  # [200, 150, 5] -> [200, 150, 2]
            build_norm_layer(norm_cfg, 128)[1],
            nn.ReLU(),
        )

    def forward(self, voxel_features, coors, batch_size, input_shape):
        """Run the backbone.

        Returns:
            Tuple ``(ret, multi_scale_voxel_features)``: ``ret`` is the dense
            output with the channel and depth axes merged (N, C * D, H, W);
            the dict holds the sparse outputs of ``conv1`` .. ``conv4``.
        """
        # input: # [41, 1600, 1408]
        # The reversed input shape gets one extra cell on its first axis.
        sparse_shape = np.array(input_shape[::-1]) + [1, 0, 0]

        coors = coors.int()
        ret = spconv.SparseConvTensor(voxel_features, coors, sparse_shape, batch_size)

        x = self.conv_input(ret)

        x_conv1 = self.conv1(x)
        x_conv2 = self.conv2(x_conv1)
        x_conv3 = self.conv3(x_conv2)
        x_conv4 = self.conv4(x_conv3)

        ret = self.extra_conv(x_conv4)

        ret = ret.dense()

        N, C, D, H, W = ret.shape
        ret = ret.view(N, C * D, H, W)

        multi_scale_voxel_features = {
            'conv1': x_conv1,
            'conv2': x_conv2,
            'conv3': x_conv3,
            'conv4': x_conv4,
        }

        return ret, multi_scale_voxel_features
class FeatureAdaption(nn.Module):
    """Feature Adaption Module.

    Feature Adaption Module is implemented based on DCN v1.
    It uses anchor shape prediction rather than feature map to
    predict offsets of deformable conv layer.

    Args:
        in_channels (int): Number of channels in the input feature map.
        out_channels (int): Number of channels in the output feature map.
        kernel_size (int): Deformable conv kernel size.
        deformable_groups (int): Deformable conv group size.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 deformable_groups=4):
        super(FeatureAdaption, self).__init__()
        offset_channels = kernel_size * kernel_size * 2
        self.conv_offset = nn.Conv2d(
            in_channels, deformable_groups * offset_channels, 1, bias=True)
        self.conv_adaption = DeformConv(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            padding=(kernel_size - 1) // 2,
            deformable_groups=deformable_groups)
        self.relu = nn.ReLU(inplace=True)
        self.init_offset()

    def init_offset(self):
        """Zero the offset conv weights (the bias is left untouched)."""
        self.conv_offset.weight.data.zero_()

    def forward(self, x,):
        offset = self.conv_offset(x)
        x = self.relu(self.conv_adaption(x, offset))
        return x


class SepHead(nn.Module):
    """One small conv stack per entry of ``heads``.

    Args:
        in_channels: Channel count of the shared input feature map.
        heads: Mapping ``head name -> (output channels, number of convs)``.
        head_conv: Channel count of the intermediate convolutions.
        final_kernel: Kernel size used by every convolution.
        bn: Whether to add BatchNorm2d after the intermediate convolutions.
        init_bias: Bias fill value for heads whose name contains ``'hm'``.
    """

    def __init__(
        self,
        in_channels,
        heads,
        head_conv=64,
        final_kernel=1,
        bn=False,
        init_bias=-2.19,
        **kwargs,
    ):
        super(SepHead, self).__init__(**kwargs)

        self.heads = heads
        for head in self.heads:
            classes, num_conv = self.heads[head]

            # NOTE(review): every intermediate conv takes ``in_channels`` and
            # the final conv takes ``head_conv``, so this only lines up for
            # num_conv == 2 (or in_channels == head_conv) - confirm.
            fc = Sequential()
            for _ in range(num_conv - 1):
                fc.add(nn.Conv2d(in_channels, head_conv,
                                 kernel_size=final_kernel, stride=1,
                                 padding=final_kernel // 2, bias=True))
                if bn:
                    fc.add(nn.BatchNorm2d(head_conv))
                fc.add(nn.ReLU())

            fc.add(nn.Conv2d(head_conv, classes,
                             kernel_size=final_kernel, stride=1,
                             padding=final_kernel // 2, bias=True))

            if 'hm' in head:
                fc[-1].bias.data.fill_(init_bias)
            else:
                for m in fc.modules():
                    if isinstance(m, nn.Conv2d):
                        kaiming_init(m)

            setattr(self, head, fc)

    def forward(self, x):
        """Return ``{head name: head output}`` for the shared input ``x``."""
        return {head: getattr(self, head)(x) for head in self.heads}


class DCNSepHead(nn.Module):
    """SepHead variant with deformable-conv feature adaption.

    The heatmap is predicted by a dedicated ``cls_head``; all heads listed
    in ``heads`` are predicted by a :class:`SepHead` on separately adapted
    features.
    """

    def __init__(
        self,
        in_channels,
        num_cls,
        heads,
        head_conv=64,
        final_kernel=1,
        bn=False,
        init_bias=-2.19,
        **kwargs,
    ):
        super(DCNSepHead, self).__init__(**kwargs)

        # feature adaptation with dcn
        # use separate features for classification / regression
        self.feature_adapt_cls = FeatureAdaption(
            in_channels,
            in_channels,
            kernel_size=3,
            deformable_groups=4)

        self.feature_adapt_reg = FeatureAdaption(
            in_channels,
            in_channels,
            kernel_size=3,
            deformable_groups=4)

        # heatmap prediction head
        self.cls_head = Sequential(
            nn.Conv2d(in_channels, head_conv,
                      kernel_size=3, padding=1, bias=True),
            # Must follow head_conv; a fixed 64 breaks any other width.
            nn.BatchNorm2d(head_conv),
            nn.ReLU(inplace=True),
            nn.Conv2d(head_conv, num_cls,
                      kernel_size=3, stride=1,
                      padding=1, bias=True)
        )
        self.cls_head[-1].bias.data.fill_(init_bias)

        # other regression target
        self.task_head = SepHead(in_channels, heads, head_conv=head_conv, bn=bn, final_kernel=final_kernel)

    def forward(self, x):
        center_feat = self.feature_adapt_cls(x)
        reg_feat = self.feature_adapt_reg(x)

        cls_score = self.cls_head(center_feat)
        ret = self.task_head(reg_feat)
        ret['hm'] = cls_score

        return ret


@HEADS.register_module
class CenterHead(nn.Module):
    """Center-based detection head with one sub-head per task.

    Args:
        in_channels: Input channels of the shared convolution.
        tasks: List of dicts, each with a ``class_names`` list.
        dataset: Dataset name; ``loss`` supports 'waymo' and 'nuscenes'.
        weight: Weight of the localisation loss relative to the heatmap loss.
        code_weights: Per-element weights applied to the box regression loss.
        common_heads: Regression heads shared by all tasks
            (``name -> (channels, num_conv)``).
        logger: Optional logger; defaults to ``logging.getLogger("CenterHead")``.
        init_bias: Initial bias of the heatmap output convolution.
        share_conv_channel: Output channels of the shared convolution.
        num_hm_conv: Number of convolutions in the heatmap head.
        dcn_head: Use :class:`DCNSepHead` instead of :class:`SepHead`.
    """

    def __init__(
        self,
        in_channels=None,
        tasks=None,
        dataset='nuscenes',
        weight=0.25,
        code_weights=None,
        common_heads=None,
        logger=None,
        init_bias=-2.19,
        share_conv_channel=64,
        num_hm_conv=2,
        dcn_head=False,
    ):
        super(CenterHead, self).__init__()

        # None stands for the former mutable defaults, so instances no
        # longer share a single list/dict object.
        if in_channels is None:
            in_channels = [128, ]
        if tasks is None:
            tasks = []
        if code_weights is None:
            code_weights = []
        if common_heads is None:
            common_heads = dict()

        num_classes = [len(t["class_names"]) for t in tasks]
        self.class_names = [t["class_names"] for t in tasks]
        self.code_weights = code_weights
        self.weight = weight  # weight between hm loss and loc loss
        self.dataset = dataset

        self.in_channels = in_channels
        self.num_classes = num_classes

        self.crit = FastFocalLoss()
        self.crit_reg = RegLoss()

        self.box_n_dim = 9 if 'vel' in common_heads else 7
        self.use_direction_classifier = False

        if not logger:
            logger = logging.getLogger("CenterHead")
        self.logger = logger

        logger.info(
            f"num_classes: {num_classes}"
        )

        # a shared convolution
        self.shared_conv = nn.Sequential(
            nn.Conv2d(in_channels, share_conv_channel,
                      kernel_size=3, padding=1, bias=True),
            nn.BatchNorm2d(share_conv_channel),
            nn.ReLU(inplace=True)
        )

        self.tasks = nn.ModuleList()
        print("Use HM Bias: ", init_bias)

        if dcn_head:
            print("Use Deformable Convolution in the CenterHead!")

        for num_cls in num_classes:
            heads = copy.deepcopy(common_heads)
            if not dcn_head:
                heads.update(dict(hm=(num_cls, num_hm_conv)))
                self.tasks.append(
                    SepHead(share_conv_channel, heads, bn=True, init_bias=init_bias, final_kernel=3)
                )
            else:
                self.tasks.append(
                    DCNSepHead(share_conv_channel, num_cls, heads, bn=True, init_bias=init_bias, final_kernel=3)
                )

        logger.info("Finish CenterHead Initialization")

    def forward(self, x, *kwargs):
        """Return ``(per-task prediction dicts, shared feature map)``.

        Extra positional arguments are accepted and ignored.
        """
        x = self.shared_conv(x)
        ret_dicts = [task(x) for task in self.tasks]
        return ret_dicts, x

    def _sigmoid(self, x):
        """In-place sigmoid of ``x`` followed by a clamp to [1e-4, 1 - 1e-4]."""
        y = torch.clamp(x.sigmoid_(), min=1e-4, max=1-1e-4)
        return y

    def loss(self, example, preds_dicts, test_cfg, **kwargs):
        """Compute heatmap and box regression losses for every task.

        Returns:
            ``defaultdict(list)`` mapping each of ``loss``, ``hm_loss``,
            ``loc_loss``, ``loc_loss_elem`` and ``num_positive`` to a list
            with one entry per task.

        Raises:
            NotImplementedError: If ``self.dataset`` is neither 'waymo' nor
                'nuscenes'.
        """
        rets = []
        for task_id, preds_dict in enumerate(preds_dicts):
            # heatmap focal loss
            preds_dict['hm'] = self._sigmoid(preds_dict['hm'])

            hm_loss = self.crit(preds_dict['hm'], example['hm'][task_id], example['ind'][task_id], example['mask'][task_id], example['cat'][task_id])

            target_box = example['anno_box'][task_id]
            # reconstruct the anno_box from multiple reg heads
            if self.dataset in ['waymo', 'nuscenes']:
                if 'vel' in preds_dict:
                    preds_dict['anno_box'] = torch.cat((preds_dict['reg'], preds_dict['height'], preds_dict['dim'],
                                                        preds_dict['vel'], preds_dict['rot']), dim=1)
                else:
                    preds_dict['anno_box'] = torch.cat((preds_dict['reg'], preds_dict['height'], preds_dict['dim'],
                                                        preds_dict['rot']), dim=1)
                    target_box = target_box[..., [0, 1, 2, 3, 4, 5, -2, -1]]  # remove vel target
            else:
                raise NotImplementedError()

            # Regression loss for dimension, offset, height, rotation
            box_loss = self.crit_reg(preds_dict['anno_box'], example['mask'][task_id], example['ind'][task_id], target_box)

            loc_loss = (box_loss*box_loss.new_tensor(self.code_weights)).sum()

            loss = hm_loss + self.weight*loc_loss

            rets.append({
                'loss': loss,
                'hm_loss': hm_loss.detach().cpu(),
                'loc_loss': loc_loss,
                'loc_loss_elem': box_loss.detach().cpu(),
                'num_positive': example['mask'][task_id].float().sum(),
            })

        # convert batch-key to key-batch
        rets_merged = defaultdict(list)
        for ret in rets:
            for k, v in ret.items():
                rets_merged[k].append(v)

        return rets_merged

    @torch.no_grad()
    def predict(self, example, preds_dicts, test_cfg, **kwargs):
        """decode, nms, then return the detection result. Additionally support double flip testing

        Returns:
            One dict per sample with ``box3d_lidar``, ``scores``,
            ``label_preds`` (offset so that labels are unique across tasks)
            and ``metadata``.

        Raises:
            NotImplementedError: If ``test_cfg`` enables ``per_class_nms``.
        """
        if test_cfg.get('per_class_nms', False):
            # This branch used to be a silent no-op that later failed with an
            # IndexError on the empty result list.
            raise NotImplementedError(
                "per_class_nms is not supported by CenterHead.predict")

        rets = []
        metas = []

        double_flip = test_cfg.get('double_flip', False)

        post_center_range = test_cfg.post_center_limit_range
        if len(post_center_range) > 0:
            post_center_range = torch.tensor(
                post_center_range,
                dtype=preds_dicts[0]['hm'].dtype,
                device=preds_dicts[0]['hm'].device,
            )

        for task_id, preds_dict in enumerate(preds_dicts):
            # convert N C H W to N H W C
            for key, val in preds_dict.items():
                preds_dict[key] = val.permute(0, 2, 3, 1).contiguous()

            batch_size = preds_dict['hm'].shape[0]

            if double_flip:
                assert batch_size % 4 == 0, batch_size
                batch_size = batch_size // 4
                for k in preds_dict.keys():
                    # transform the prediction map back to their original coordinate before flipping
                    # the flipped predictions are ordered in a group of 4. The first one is the original pointcloud
                    # the second one is X flip pointcloud(y=-y), the third one is Y flip pointcloud(x=-x), and the last one is
                    # X and Y flip pointcloud(x=-x, y=-y).
                    # Also please note that pytorch's flip function is defined on higher dimensional space, so dims=[2] means that
                    # it is flipping along the axis with H length(which is normally the Y axis), however in our traditional word, it is flipping along
                    # the X axis. The below flip follows pytorch's definition yflip(y=-y) xflip(x=-x)
                    _, H, W, C = preds_dict[k].shape
                    preds_dict[k] = preds_dict[k].reshape(int(batch_size), 4, H, W, C)
                    preds_dict[k][:, 1] = torch.flip(preds_dict[k][:, 1], dims=[1])
                    preds_dict[k][:, 2] = torch.flip(preds_dict[k][:, 2], dims=[2])
                    preds_dict[k][:, 3] = torch.flip(preds_dict[k][:, 3], dims=[1, 2])

            if "metadata" not in example or len(example["metadata"]) == 0:
                meta_list = [None] * batch_size
            else:
                meta_list = example["metadata"]
                if double_flip:
                    meta_list = meta_list[:4*int(batch_size):4]

            batch_hm = torch.sigmoid(preds_dict['hm'])

            batch_dim = torch.exp(preds_dict['dim'])

            batch_rots = preds_dict['rot'][..., 0:1]
            batch_rotc = preds_dict['rot'][..., 1:2]
            batch_reg = preds_dict['reg']
            batch_hei = preds_dict['height']

            if double_flip:
                batch_hm = batch_hm.mean(dim=1)
                batch_hei = batch_hei.mean(dim=1)
                batch_dim = batch_dim.mean(dim=1)

                # y = -y reg_y = 1-reg_y
                batch_reg[:, 1, ..., 1] = 1 - batch_reg[:, 1, ..., 1]
                batch_reg[:, 2, ..., 0] = 1 - batch_reg[:, 2, ..., 0]

                batch_reg[:, 3, ..., 0] = 1 - batch_reg[:, 3, ..., 0]
                batch_reg[:, 3, ..., 1] = 1 - batch_reg[:, 3, ..., 1]
                batch_reg = batch_reg.mean(dim=1)

                # first yflip
                # y = -y theta = pi -theta
                # sin(pi-theta) = sin(theta) cos(pi-theta) = -cos(theta)
                # batch_rots[:, 1] the same
                batch_rotc[:, 1] *= -1

                # then xflip x = -x theta = 2pi - theta
                # sin(2pi - theta) = -sin(theta) cos(2pi - theta) = cos(theta)
                # batch_rots[:, 2] the same
                batch_rots[:, 2] *= -1

                # double flip
                batch_rots[:, 3] *= -1
                batch_rotc[:, 3] *= -1

                batch_rotc = batch_rotc.mean(dim=1)
                batch_rots = batch_rots.mean(dim=1)

            batch_rot = torch.atan2(batch_rots, batch_rotc)

            batch, H, W, num_cls = batch_hm.size()

            batch_reg = batch_reg.reshape(batch, H*W, 2)
            batch_hei = batch_hei.reshape(batch, H*W, 1)

            batch_rot = batch_rot.reshape(batch, H*W, 1)
            batch_dim = batch_dim.reshape(batch, H*W, 3)
            batch_hm = batch_hm.reshape(batch, H*W, num_cls)

            ys, xs = torch.meshgrid([torch.arange(0, H), torch.arange(0, W)])
            ys = ys.view(1, H, W).repeat(batch, 1, 1).to(batch_hm)
            xs = xs.view(1, H, W).repeat(batch, 1, 1).to(batch_hm)

            xs = xs.view(batch, -1, 1) + batch_reg[:, :, 0:1]
            ys = ys.view(batch, -1, 1) + batch_reg[:, :, 1:2]

            # Feature-map cells -> coordinates offset by pc_range.
            xs = xs * test_cfg.out_size_factor * test_cfg.voxel_size[0] + test_cfg.pc_range[0]
            ys = ys * test_cfg.out_size_factor * test_cfg.voxel_size[1] + test_cfg.pc_range[1]

            if 'vel' in preds_dict:
                batch_vel = preds_dict['vel']

                if double_flip:
                    # flip vy
                    batch_vel[:, 1, ..., 1] *= -1
                    # flip vx
                    batch_vel[:, 2, ..., 0] *= -1

                    batch_vel[:, 3] *= -1

                    batch_vel = batch_vel.mean(dim=1)

                batch_vel = batch_vel.reshape(batch, H*W, 2)
                batch_box_preds = torch.cat([xs, ys, batch_hei, batch_dim, batch_vel, batch_rot], dim=2)
            else:
                batch_box_preds = torch.cat([xs, ys, batch_hei, batch_dim, batch_rot], dim=2)

            metas.append(meta_list)

            rets.append(self.post_processing(batch_box_preds, batch_hm, test_cfg, post_center_range, task_id))

        # Merge branches results
        num_samples = len(rets[0])

        ret_list = []
        for i in range(num_samples):
            ret = {}
            for k in rets[0][i].keys():
                if k in ["box3d_lidar", "scores"]:
                    ret[k] = torch.cat([task_ret[i][k] for task_ret in rets])
                elif k in ["label_preds"]:
                    # Shift each task's labels by the class count of the
                    # preceding tasks so labels are unique across tasks.
                    flag = 0
                    for j, num_class in enumerate(self.num_classes):
                        rets[j][i][k] += flag
                        flag += num_class
                    ret[k] = torch.cat([task_ret[i][k] for task_ret in rets])

            ret['metadata'] = metas[0][i]
            ret_list.append(ret)

        return ret_list

    @torch.no_grad()
    def post_processing(self, batch_box_preds, batch_hm, test_cfg, post_center_range, task_id):
        """Filter by score and centre range, then run NMS per sample.

        Returns:
            One dict per sample with ``box3d_lidar``, ``scores`` and
            ``label_preds`` of the boxes kept by NMS.
        """
        batch_size = len(batch_hm)

        prediction_dicts = []
        for i in range(batch_size):
            box_preds = batch_box_preds[i]
            hm_preds = batch_hm[i]

            scores, labels = torch.max(hm_preds, dim=-1)

            score_mask = scores > test_cfg.score_threshold
            distance_mask = (box_preds[..., :3] >= post_center_range[:3]).all(1) \
                & (box_preds[..., :3] <= post_center_range[3:]).all(1)

            mask = distance_mask & score_mask

            box_preds = box_preds[mask]
            scores = scores[mask]
            labels = labels[mask]

            # Drop the velocity columns (if any) for NMS.
            boxes_for_nms = box_preds[:, [0, 1, 2, 3, 4, 5, -1]]

            if test_cfg.get('circular_nms', False):
                centers = boxes_for_nms[:, [0, 1]]
                boxes = torch.cat([centers, scores.view(-1, 1)], dim=1)
                selected = _circle_nms(boxes, min_radius=test_cfg.min_radius[task_id], post_max_size=test_cfg.nms.nms_post_max_size)
            else:
                selected = box_torch_ops.rotate_nms_pcdet(boxes_for_nms.float(), scores.float(),
                                                          thresh=test_cfg.nms.nms_iou_threshold,
                                                          pre_maxsize=test_cfg.nms.nms_pre_max_size,
                                                          post_max_size=test_cfg.nms.nms_post_max_size)

            prediction_dicts.append({
                'box3d_lidar': box_preds[selected],
                'scores': scores[selected],
                'label_preds': labels[selected],
            })

        return prediction_dicts


import numpy as np


def _circle_nms(boxes, min_radius, post_max_size=83):
    """
    NMS according to center distance

    Returns the kept indices (at most ``post_max_size``) as a long tensor
    on the device of ``boxes``.
    """
    keep = np.array(circle_nms(boxes.cpu().numpy(), thresh=min_radius))[:post_max_size]

    keep = torch.from_numpy(keep).long().to(boxes.device)

    return keep
def build(cfg, registry, default_args=None):
    """Instantiate ``cfg`` from ``registry``.

    A single config is passed straight to ``build_from_cfg``. A list of
    configs is built element by element and wrapped in ``nn.Sequential``.
    """
    if not isinstance(cfg, list):
        return build_from_cfg(cfg, registry, default_args)

    built = [build_from_cfg(item, registry, default_args) for item in cfg]
    return nn.Sequential(*built)


def build_second_stage_module(cfg):
    """Build a module registered in ``SECOND_STAGE``."""
    return build(cfg, SECOND_STAGE)


def build_roi_head(cfg):
    """Build a module registered in ``ROI_HEAD``."""
    return build(cfg, ROI_HEAD)


def build_reader(cfg):
    """Build a module registered in ``READERS``."""
    return build(cfg, READERS)


def build_backbone(cfg):
    """Build a module registered in ``BACKBONES``."""
    return build(cfg, BACKBONES)


def build_neck(cfg):
    """Build a module registered in ``NECKS``."""
    return build(cfg, NECKS)


def build_head(cfg):
    """Build a module registered in ``HEADS``."""
    return build(cfg, HEADS)


def build_loss(cfg):
    """Build a module registered in ``LOSSES``."""
    return build(cfg, LOSSES)


def build_detector(cfg, train_cfg=None, test_cfg=None):
    """Build a detector, forwarding ``train_cfg``/``test_cfg`` as defaults."""
    extra_args = dict(train_cfg=train_cfg, test_cfg=test_cfg)
    return build(cfg, DETECTORS, extra_args)
class BaseDetector(nn.Module):
    """Base class for detectors"""

    # NOTE(review): this is the Python 2 spelling; under Python 3 it is just
    # a class attribute, so the @abstractmethod markers below are not
    # enforced and the class can be instantiated. Kept as is - presumably
    # subclasses rely on that; confirm before switching to a real metaclass.
    __metaclass__ = ABCMeta

    def __init__(self):
        super().__init__()
        self.fp16_enabled = False

    @property
    def with_reader(self):
        """True when a non-None ``reader`` attribute is set."""
        # Whether input data need to be processed by Input Feature Extractor
        return getattr(self, "reader", None) is not None

    @property
    def with_neck(self):
        """True when a non-None ``neck`` attribute is set."""
        return getattr(self, "neck", None) is not None

    @property
    def with_shared_head(self):
        """True when a non-None ``shared_head`` attribute is set."""
        return getattr(self, "shared_head", None) is not None

    @property
    def with_bbox(self):
        """True when a non-None ``bbox_head`` attribute is set."""
        return getattr(self, "bbox_head", None) is not None

    @property
    def with_mask(self):
        """True when a non-None ``mask_head`` attribute is set."""
        return getattr(self, "mask_head", None) is not None

    @abstractmethod
    def extract_feat(self, imgs):
        """Extract features from one input; the base version does nothing."""
        pass

    def extract_feats(self, imgs):
        """Lazily yield ``extract_feat(img)`` for each element of a list."""
        assert isinstance(imgs, list)
        for single in imgs:
            yield self.extract_feat(single)

    @abstractmethod
    def forward_train(self, imgs, **kwargs):
        pass

    @abstractmethod
    def simple_test(self, img, **kwargs):
        pass

    @abstractmethod
    def aug_test(self, imgs, **kwargs):
        pass

    def init_weights(self, pretrained=None):
        """Log the checkpoint path; the base class loads nothing itself."""
        if pretrained is None:
            return
        root_logger = logging.getLogger()
        root_logger.info("load model from: {}".format(pretrained))

    def forward_test(self, imgs, **kwargs):
        pass

    def forward(self, example, return_loss=True, **kwargs):
        pass
b/cv/3d_detection/centerpoint/pytorch/det3d/models/detectors/point_pillars.py @@ -0,0 +1,90 @@ +from ..registry import DETECTORS +from .single_stage import SingleStageDetector +from copy import deepcopy + +@DETECTORS.register_module +class PointPillars(SingleStageDetector): + def __init__( + self, + reader, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None, + ): + super(PointPillars, self).__init__( + reader, backbone, neck, bbox_head, train_cfg, test_cfg, pretrained + ) + + def extract_feat(self, data): + input_features = self.reader( + data["features"], data["num_voxels"], data["coors"] + ) + x = self.backbone( + input_features, data["coors"], data["batch_size"], data["input_shape"] + ) + if self.with_neck: + x = self.neck(x) + return x + + def forward(self, example, return_loss=True, **kwargs): + voxels = example["voxels"] + coordinates = example["coordinates"] + num_points_in_voxel = example["num_points"] + num_voxels = example["num_voxels"] + + batch_size = len(num_voxels) + + data = dict( + features=voxels, + num_voxels=num_points_in_voxel, + coors=coordinates, + batch_size=batch_size, + input_shape=example["shape"][0], + ) + + x = self.extract_feat(data) + preds, _ = self.bbox_head(x) + + if return_loss: + return self.bbox_head.loss(example, preds, self.test_cfg) + else: + return self.bbox_head.predict(example, preds, self.test_cfg) + + def forward_two_stage(self, example, return_loss=True, **kwargs): + voxels = example["voxels"] + coordinates = example["coordinates"] + num_points_in_voxel = example["num_points"] + num_voxels = example["num_voxels"] + + batch_size = len(num_voxels) + + data = dict( + features=voxels, + num_voxels=num_points_in_voxel, + coors=coordinates, + batch_size=batch_size, + input_shape=example["shape"][0], + ) + + x = self.extract_feat(data) + bev_feature = x + preds, _ = self.bbox_head(x) + + # manual deepcopy ... 
@DETECTORS.register_module
class SingleStageDetector(BaseDetector):
    """Detector composed of a reader, a backbone, an optional neck and a head.

    Every sub-module is created from its config through ``builder``.
    ``forward`` and ``predict`` are placeholders here and are expected to be
    overridden by concrete detectors.
    """

    def __init__(
        self,
        reader,
        backbone,
        neck=None,
        bbox_head=None,
        train_cfg=None,
        test_cfg=None,
        pretrained=None,
    ):
        """Build the sub-modules and optionally load pretrained weights.

        :param reader: config for the input feature reader.
        :param backbone: config for the backbone.
        :param neck: optional config for the neck; no ``neck`` attribute is
            created when it is None.
        :param bbox_head: config for the detection head.
        :param train_cfg: stored as ``self.train_cfg``.
        :param test_cfg: stored as ``self.test_cfg``.
        :param pretrained: checkpoint path handed to ``init_weights``.
        """
        super(SingleStageDetector, self).__init__()
        self.reader = builder.build_reader(reader)
        self.backbone = builder.build_backbone(backbone)
        if neck is not None:
            self.neck = builder.build_neck(neck)
        self.bbox_head = builder.build_head(bbox_head)
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        self.init_weights(pretrained=pretrained)

    def init_weights(self, pretrained=None):
        """Best-effort load of ``pretrained`` with ``strict=False``.

        A failed load is reported on stdout and otherwise ignored, so the
        module keeps its current weights. Only ``Exception`` subclasses are
        swallowed: ``KeyboardInterrupt`` and ``SystemExit`` propagate.
        """
        if pretrained is None:
            return
        try:
            load_checkpoint(self, pretrained, strict=False)
        except Exception as err:
            print("no pretrained model at {}".format(pretrained))
            # Surface the actual cause; the message above is printed for any
            # failure, not only for a missing file.
            print("checkpoint loading failed with: {!r}".format(err))
        else:
            print("init weight from {}".format(pretrained))

    def extract_feat(self, data):
        """Run reader -> backbone -> (neck, if present) and return the result."""
        input_features = self.reader(data)
        x = self.backbone(input_features)
        if self.with_neck:
            x = self.neck(x)
        return x

    def aug_test(self, example, rescale=False):
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError

    def forward(self, example, return_loss=True, **kwargs):
        """Placeholder; returns None. Overridden by concrete detectors."""
        pass

    def predict(self, example, preds_dicts):
        """Placeholder; returns None."""
        pass

    def freeze(self):
        """Disable gradients on all parameters and freeze batch norm.

        Batch-norm handling is delegated to
        ``FrozenBatchNorm2d.convert_frozen_batchnorm``. Returns ``self``.
        """
        for p in self.parameters():
            p.requires_grad = False
        FrozenBatchNorm2d.convert_frozen_batchnorm(self)
        return self
+ for module in second_stage_modules: + self.second_stage.append(builder.build_second_stage_module(module)) + + self.roi_head = builder.build_roi_head(roi_head) + + self.num_point = num_point + self.use_final_feature = use_final_feature + + def combine_loss(self, one_stage_loss, roi_loss, tb_dict): + one_stage_loss['loss'][0] += (roi_loss) + + for i in range(len(one_stage_loss['loss'])): + one_stage_loss['roi_reg_loss'].append(tb_dict['rcnn_loss_reg']) + one_stage_loss['roi_cls_loss'].append(tb_dict['rcnn_loss_cls']) + + return one_stage_loss + + def get_box_center(self, boxes): + # box [List] + centers = [] + for box in boxes: + if self.num_point == 1 or len(box['box3d_lidar']) == 0: + centers.append(box['box3d_lidar'][:, :3]) + + elif self.num_point == 5: + center2d = box['box3d_lidar'][:, :2] + height = box['box3d_lidar'][:, 2:3] + dim2d = box['box3d_lidar'][:, 3:5] + rotation_y = box['box3d_lidar'][:, -1] + + corners = box_torch_ops.center_to_corner_box2d(center2d, dim2d, rotation_y) + + front_middle = torch.cat([(corners[:, 0] + corners[:, 1])/2, height], dim=-1) + back_middle = torch.cat([(corners[:, 2] + corners[:, 3])/2, height], dim=-1) + left_middle = torch.cat([(corners[:, 0] + corners[:, 3])/2, height], dim=-1) + right_middle = torch.cat([(corners[:, 1] + corners[:, 2])/2, height], dim=-1) + + points = torch.cat([box['box3d_lidar'][:, :3], front_middle, back_middle, left_middle, \ + right_middle], dim=0) + + centers.append(points) + else: + raise NotImplementedError() + + return centers + + def reorder_first_stage_pred_and_feature(self, first_pred, example, features): + batch_size = len(first_pred) + box_length = first_pred[0]['box3d_lidar'].shape[1] + feature_vector_length = sum([feat[0].shape[-1] for feat in features]) + + rois = first_pred[0]['box3d_lidar'].new_zeros((batch_size, + self.NMS_POST_MAXSIZE, box_length + )) + roi_scores = first_pred[0]['scores'].new_zeros((batch_size, + self.NMS_POST_MAXSIZE + )) + roi_labels = 
first_pred[0]['label_preds'].new_zeros((batch_size, + self.NMS_POST_MAXSIZE), dtype=torch.long + ) + roi_features = features[0][0].new_zeros((batch_size, + self.NMS_POST_MAXSIZE, feature_vector_length + )) + + for i in range(batch_size): + num_obj = features[0][i].shape[0] + # basically move rotation to position 6, so now the box is 7 + C . C is 2 for nuscenes to + # include velocity target + + box_preds = first_pred[i]['box3d_lidar'] + + if self.roi_head.code_size == 9: + # x, y, z, w, l, h, rotation_y, velocity_x, velocity_y + box_preds = box_preds[:, [0, 1, 2, 3, 4, 5, 8, 6, 7]] + + rois[i, :num_obj] = box_preds + roi_labels[i, :num_obj] = first_pred[i]['label_preds'] + 1 + roi_scores[i, :num_obj] = first_pred[i]['scores'] + roi_features[i, :num_obj] = torch.cat([feat[i] for feat in features], dim=-1) + + example['rois'] = rois + example['roi_labels'] = roi_labels + example['roi_scores'] = roi_scores + example['roi_features'] = roi_features + + example['has_class_labels']= True + + return example + + def post_process(self, batch_dict): + batch_size = batch_dict['batch_size'] + pred_dicts = [] + + for index in range(batch_size): + box_preds = batch_dict['batch_box_preds'][index] + cls_preds = batch_dict['batch_cls_preds'][index] # this is the predicted iou + label_preds = batch_dict['roi_labels'][index] + + if box_preds.shape[-1] == 9: + # move rotation to the end (the create submission file will take elements from 0:6 and -1) + box_preds = box_preds[:, [0, 1, 2, 3, 4, 5, 7, 8, 6]] + + scores = torch.sqrt(torch.sigmoid(cls_preds).reshape(-1) * batch_dict['roi_scores'][index].reshape(-1)) + mask = (label_preds != 0).reshape(-1) + + box_preds = box_preds[mask, :] + scores = scores[mask] + labels = label_preds[mask]-1 + + # currently don't need nms + pred_dict = { + 'box3d_lidar': box_preds, + 'scores': scores, + 'label_preds': labels, + "metadata": batch_dict["metadata"][index] + } + + pred_dicts.append(pred_dict) + + return pred_dicts + + + def forward(self, 
example, return_loss=True, **kwargs): + out = self.single_det.forward_two_stage(example, + return_loss, **kwargs) + + if len(out) == 5: + one_stage_pred, bev_feature, voxel_feature, final_feature, one_stage_loss = out + example['voxel_feature'] = voxel_feature + elif len(out) == 3: + one_stage_pred, bev_feature, one_stage_loss = out + else: + raise NotImplementedError + + # N C H W -> N H W C + if self.use_final_feature: + example['bev_feature'] = final_feature.permute(0, 2, 3, 1).contiguous() + else: + example['bev_feature'] = bev_feature.permute(0, 2, 3, 1).contiguous() + + centers_vehicle_frame = self.get_box_center(one_stage_pred) + + if self.roi_head.code_size == 7 and return_loss is True: + # drop velocity + example['gt_boxes_and_cls'] = example['gt_boxes_and_cls'][:, :, [0, 1, 2, 3, 4, 5, 6, -1]] + + features = [] + + for module in self.second_stage: + feature = module.forward(example, centers_vehicle_frame, self.num_point) + features.append(feature) + # feature is two level list + # first level is number of two stage information streams + # second level is batch + + example = self.reorder_first_stage_pred_and_feature(first_pred=one_stage_pred, example=example, features=features) + + # final classification / regression + batch_dict = self.roi_head(example, training=return_loss) + + if return_loss: + roi_loss, tb_dict = self.roi_head.get_loss() + + return self.combine_loss(one_stage_loss, roi_loss, tb_dict) + else: + return self.post_process(batch_dict) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/models/detectors/voxelnet.py b/cv/3d_detection/centerpoint/pytorch/det3d/models/detectors/voxelnet.py new file mode 100644 index 0000000000000000000000000000000000000000..5a8218fb7cbeecebca74c1a5240f00cc8c9b3296 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/models/detectors/voxelnet.py @@ -0,0 +1,83 @@ +from ..registry import DETECTORS +from .single_stage import SingleStageDetector +from det3d.torchie.trainer import load_checkpoint +import 
@DETECTORS.register_module
class VoxelNet(SingleStageDetector):
    """Single-stage detector whose backbone also returns a voxel feature.

    Accepts either pre-voxelized input (a ``'voxels'`` entry is present) or
    raw ``'points'`` that the reader turns into voxels itself.
    """

    def __init__(
        self,
        reader,
        backbone,
        neck,
        bbox_head,
        train_cfg=None,
        test_cfg=None,
        pretrained=None,
    ):
        super(VoxelNet, self).__init__(
            reader, backbone, neck, bbox_head, train_cfg, test_cfg, pretrained
        )

    def extract_feat(self, data):
        """Return ``(x, voxel_feature)`` from reader, backbone and optional neck."""
        if 'voxels' in data:
            # Pre-voxelized input: the reader is given the voxels and
            # data["num_points"].
            features = data['voxels']
            num_points = data["num_points"]
            coors = data["coordinates"]
            batch_size = len(data['points'])
            input_shape = data["shape"][0]
            input_features = self.reader(features, num_points)
        else:
            # Raw points: the reader returns voxels, coordinates and shape.
            voxels, coors, input_shape = self.reader(data['points'])
            batch_size = len(data['points'])
            input_features = voxels

        x, voxel_feature = self.backbone(
            input_features, coors, batch_size, input_shape
        )

        if self.with_neck:
            x = self.neck(x)

        return x, voxel_feature

    def forward(self, example, return_loss=True, **kwargs):
        """Return the head's loss when ``return_loss`` is truthy, else its predictions."""
        x, _ = self.extract_feat(example)
        preds, _ = self.bbox_head(x)

        head_fn = self.bbox_head.loss if return_loss else self.bbox_head.predict
        return head_fn(example, preds, self.test_cfg)

    def forward_two_stage(self, example, return_loss=True, **kwargs):
        """First-stage pass for a two-stage detector.

        Returns ``(boxes, bev_feature, voxel_feature, final_feat, loss)``,
        where ``loss`` is None when ``return_loss`` is falsy.
        """
        x, voxel_feature = self.extract_feat(example)
        bev_feature = x
        preds, final_feat = self.bbox_head(x)

        if not return_loss:
            boxes = self.bbox_head.predict(example, preds, self.test_cfg)
            return boxes, bev_feature, voxel_feature, final_feat, None

        # Box decoding works on detached copies; the loss below still
        # receives the original (attached) predictions.
        detached_preds = [
            {key: value.detach() for key, value in pred.items()} for pred in preds
        ]
        boxes = self.bbox_head.predict(example, detached_preds, self.test_cfg)
        loss = self.bbox_head.loss(example, preds, self.test_cfg)
        return boxes, bev_feature, voxel_feature, final_feat, loss
+ ''' + def __init__(self): + super(FastFocalLoss, self).__init__() + + def forward(self, out, target, ind, mask, cat): + ''' + Arguments: + out, target: B x C x H x W + ind, mask: B x M + cat (category id for peaks): B x M + ''' + mask = mask.float() + gt = torch.pow(1 - target, 4) + neg_loss = torch.log(1 - out) * torch.pow(out, 2) * gt + neg_loss = neg_loss.sum() + + pos_pred_pix = _transpose_and_gather_feat(out, ind) # B x M x C + pos_pred = pos_pred_pix.gather(2, cat.unsqueeze(2)) # B x M + num_pos = mask.sum() + pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, 2) * \ + mask.unsqueeze(2) + pos_loss = pos_loss.sum() + if num_pos == 0: + return - neg_loss + return - (pos_loss + neg_loss) / num_pos diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/models/necks/__init__.py b/cv/3d_detection/centerpoint/pytorch/det3d/models/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a1db7e4d2754ea7f9720c1b13d8f17a8f5fe304 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/models/necks/__init__.py @@ -0,0 +1,3 @@ +from .rpn import RPN + +__all__ = ["RPN"] diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/models/necks/rpn.py b/cv/3d_detection/centerpoint/pytorch/det3d/models/necks/rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..8d489399d959deb3a5e93c42f902bfc49dc4a6c4 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/models/necks/rpn.py @@ -0,0 +1,160 @@ +import time +import numpy as np +import math + +import torch + +from torch import nn +from torch.nn import functional as F +from torchvision.models import resnet +from torch.nn.modules.batchnorm import _BatchNorm + +from det3d.torchie.cnn import constant_init, kaiming_init, xavier_init +from det3d.torchie.trainer import load_checkpoint +from det3d.models.utils import Empty, GroupNorm, Sequential +from det3d.models.utils import change_default_args + +from .. 
@NECKS.register_module
class RPN(nn.Module):
    """Neck built from strided conv stages with per-stage up/down-sampling.

    ``blocks`` are applied sequentially. From stage ``_upsample_start_idx``
    onwards, each block output also goes through its ``deblock``, and the
    deblock outputs are concatenated along dim 1.
    """

    def __init__(
        self,
        layer_nums,
        ds_layer_strides,
        ds_num_filters,
        us_layer_strides,
        us_num_filters,
        num_input_features,
        norm_cfg=None,
        name="rpn",
        logger=None,
        **kwargs
    ):
        """
        :param layer_nums: number of extra 3x3 conv layers in each stage.
        :param ds_layer_strides: stride of the first conv of each stage.
        :param ds_num_filters: output channels of each stage.
        :param us_layer_strides: stride of each deblock; values > 1 use a
            transposed conv, other values a strided conv with stride
            ``round(1 / value)``.
        :param us_num_filters: output channels of each deblock.
        :param num_input_features: input channels of the first stage.
        :param norm_cfg: norm layer config; defaults to BN with eps=1e-3,
            momentum=0.01.
        :param name: accepted for config compatibility; unused here.
        :param logger: optional logger for the completion message.
        """
        super(RPN, self).__init__()
        self._layer_strides = ds_layer_strides
        self._num_filters = ds_num_filters
        self._layer_nums = layer_nums
        self._upsample_strides = us_layer_strides
        self._num_upsample_filters = us_num_filters
        self._num_input_features = num_input_features

        if norm_cfg is None:
            norm_cfg = dict(type="BN", eps=1e-3, momentum=0.01)
        self._norm_cfg = norm_cfg

        assert len(self._layer_strides) == len(self._layer_nums)
        assert len(self._num_filters) == len(self._layer_nums)
        assert len(self._num_upsample_filters) == len(self._upsample_strides)

        self._upsample_start_idx = len(self._layer_nums) - len(self._upsample_strides)

        # Every deblock must yield the same ratio of upsample stride to the
        # accumulated downsample stride, otherwise the concatenation in
        # forward() would mix different spatial sizes.
        must_equal_list = [
            up_stride
            / np.prod(self._layer_strides[: i + self._upsample_start_idx + 1])
            for i, up_stride in enumerate(self._upsample_strides)
        ]
        for val in must_equal_list:
            assert val == must_equal_list[0]

        in_filters = [self._num_input_features, *self._num_filters[:-1]]
        blocks = []
        deblocks = []

        for i, layer_num in enumerate(self._layer_nums):
            block, num_out_filters = self._make_layer(
                in_filters[i],
                self._num_filters[i],
                layer_num,
                stride=self._layer_strides[i],
            )
            blocks.append(block)

            up_idx = i - self._upsample_start_idx
            if up_idx < 0:
                # Stage has no deblock.
                continue

            stride = self._upsample_strides[up_idx]
            up_filters = self._num_upsample_filters[up_idx]
            if stride > 1:
                resample = nn.ConvTranspose2d(
                    num_out_filters, up_filters, stride, stride=stride, bias=False
                )
            else:
                # Fractional (or unit) stride: use a strided conv instead.
                stride = np.round(1 / stride).astype(np.int64)
                resample = nn.Conv2d(
                    num_out_filters, up_filters, stride, stride=stride, bias=False
                )
            deblocks.append(
                Sequential(
                    resample,
                    build_norm_layer(self._norm_cfg, up_filters)[1],
                    nn.ReLU(),
                )
            )
        self.blocks = nn.ModuleList(blocks)
        self.deblocks = nn.ModuleList(deblocks)

        # ``logger`` defaults to None; calling ``.info`` on it unconditionally
        # raised AttributeError whenever no logger was supplied.
        if logger is not None:
            logger.info("Finish RPN Initialization")

    @property
    def downsample_factor(self):
        """Product of the stage strides, divided by the last upsample stride if any."""
        factor = np.prod(self._layer_strides)
        if len(self._upsample_strides) > 0:
            factor /= self._upsample_strides[-1]
        return factor

    def _make_layer(self, inplanes, planes, num_blocks, stride=1):
        """Build one stage and return ``(block, planes)``.

        The stage is zero-pad + strided 3x3 conv + norm + ReLU, followed by
        ``num_blocks`` repetitions of 3x3 conv + norm + ReLU.
        """
        block = Sequential(
            nn.ZeroPad2d(1),
            nn.Conv2d(inplanes, planes, 3, stride=stride, bias=False),
            build_norm_layer(self._norm_cfg, planes)[1],
            nn.ReLU(),
        )

        for _ in range(num_blocks):
            block.add(nn.Conv2d(planes, planes, 3, padding=1, bias=False))
            block.add(build_norm_layer(self._norm_cfg, planes)[1])
            block.add(nn.ReLU())

        return block, planes

    # default init_weights for conv(msra) and norm in ConvModule
    def init_weights(self):
        """Apply uniform Xavier initialisation to every ``nn.Conv2d`` submodule."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                xavier_init(m, distribution="uniform")

    def forward(self, x):
        """Run all blocks and concatenate the deblock outputs along dim 1.

        When there are no deblocks, the output of the last block is returned.
        """
        ups = []
        for i, block in enumerate(self.blocks):
            x = block(x)
            up_idx = i - self._upsample_start_idx
            if up_idx >= 0:
                ups.append(self.deblocks[up_idx](x))
        if ups:
            x = torch.cat(ups, dim=1)

        return x
b/cv/3d_detection/centerpoint/pytorch/det3d/models/readers/__init__.py @@ -0,0 +1,9 @@ +from .pillar_encoder import PillarFeatureNet, PointPillarsScatter +from .voxel_encoder import VoxelFeatureExtractorV3 +from .dynamic_voxel_encoder import DynamicVoxelEncoder + +__all__ = [ + "VoxelFeatureExtractorV3", + "PillarFeatureNet", + "PointPillarsScatter", +] diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/models/readers/dynamic_voxel_encoder.py b/cv/3d_detection/centerpoint/pytorch/det3d/models/readers/dynamic_voxel_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..5cfc476d97534af83ee55e386104bfeed871fd14 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/models/readers/dynamic_voxel_encoder.py @@ -0,0 +1,103 @@ +from det3d.core.utils.scatter import scatter_mean +from torch.nn import functional as F +from ..registry import READERS +from torch import nn +import numpy as np +import torch + +def voxelization(points, pc_range, voxel_size): + keep = (points[:, 0] >= pc_range[0]) & (points[:, 0] <= pc_range[3]) & \ + (points[:, 1] >= pc_range[1]) & (points[:, 1] <= pc_range[4]) & \ + (points[:, 2] >= pc_range[2]) & (points[:, 2] <= pc_range[5]) + points = points[keep, :] + coords = ((points[:, [2, 1, 0]] - pc_range[[2, 1, 0]]) / voxel_size[[2, 1, 0]]).to(torch.int64) + unique_coords, inverse_indices = coords.unique(return_inverse=True, dim=0) + + voxels = scatter_mean(points, inverse_indices, dim=0) + return voxels, unique_coords + +def voxelization_virtual(points, pc_range, voxel_size): + # current one is hard coded for nuScenes + # TODO: fix those magic number + keep = (points[:, 0] >= pc_range[0]) & (points[:, 0] <= pc_range[3]) & \ + (points[:, 1] >= pc_range[1]) & (points[:, 1] <= pc_range[4]) & \ + (points[:, 2] >= pc_range[2]) & (points[:, 2] <= pc_range[5]) + points = points[keep, :] + + real_points_mask = points[:, -2] == 1 + painted_points_mask = points[:, -2] == 0 + virtual_points_mask = points[:, -2] == -1 + + # 
remove zero padding for real points + real_points = points[real_points_mask][:, [0, 1, 2, 3, -1]] + painted_point = points[painted_points_mask] + virtual_point = points[virtual_points_mask] + + padded_points = torch.zeros(len(points), 22, device=points.device, dtype=points.dtype) + + # real points will occupy channels 0 to 4 and -1 + padded_points[:len(real_points), :5] = real_points + padded_points[:len(real_points), -1] = 1 + + # painted points will occupy channels 5 to 21 + padded_points[len(real_points):len(real_points)+len(painted_point), 5:19] = painted_point[:, :-2] + padded_points[len(real_points):len(real_points)+len(painted_point), 19] = painted_point[:, -1] + padded_points[len(real_points):len(real_points)+len(painted_point), 20] = 1 + padded_points[len(real_points):len(real_points)+len(painted_point), 21] = 0 + + # virtual points will occupy channels 5 to 21 + padded_points[len(real_points)+len(painted_point):, 5:19] = virtual_point[:, :-2] + padded_points[len(real_points)+len(painted_point):, 19] = virtual_point[:, -1] + padded_points[len(real_points)+len(painted_point):, 20] = 0 + padded_points[len(real_points)+len(painted_point):, 21] = 0 + + points_xyz = torch.cat([real_points[:, :3], painted_point[:, :3], virtual_point[:, :3]], dim=0) + + coords = ((points_xyz[:, [2, 1, 0]] - pc_range[[2, 1, 0]]) / voxel_size[[2, 1, 0]]).to(torch.int64) + unique_coords, inverse_indices = coords.unique(return_inverse=True, dim=0) + + voxels = scatter_mean(padded_points, inverse_indices, dim=0) + + indicator = voxels[:, -1] + mix_mask = (indicator > 0) * (indicator < 1) + # remove index + voxels = voxels[:, :-1] + + voxels[mix_mask, :5] = voxels[mix_mask, :5] / indicator[mix_mask].unsqueeze(-1) + voxels[mix_mask, 5:] = voxels[mix_mask, 5:] / (1-indicator[mix_mask].unsqueeze(-1)) + return voxels, unique_coords + +@READERS.register_module +class DynamicVoxelEncoder(nn.Module): + def __init__( + self, pc_range, voxel_size, virtual=False + ): + super(DynamicVoxelEncoder, 
class PFNLayer(nn.Module):
    def __init__(self, in_channels, out_channels, norm_cfg=None, last_layer=False):
        """
        Pillar Feature Net Layer.
        The Pillar Feature Net could be composed of a series of these layers, but the PointPillars paper results only
        used a single PFNLayer. This layer performs a similar role as second.pytorch.voxelnet.VFELayer.
        :param in_channels: Number of input channels.
        :param out_channels: Number of output channels. Halved internally when this is not the last layer,
            because forward() then concatenates two tensors of that width.
        :param norm_cfg: Norm layer config; defaults to BN1d with eps=1e-3, momentum=0.01.
        :param last_layer: If last_layer, there is no concatenation of features.
        """

        super().__init__()
        self.name = "PFNLayer"
        self.last_vfe = last_layer
        if not self.last_vfe:
            out_channels = out_channels // 2
        self.units = out_channels

        if norm_cfg is None:
            norm_cfg = dict(type="BN1d", eps=1e-3, momentum=0.01)
        self.norm_cfg = norm_cfg

        self.linear = nn.Linear(in_channels, self.units, bias=False)
        self.norm = build_norm_layer(self.norm_cfg, self.units)[1]

    def forward(self, inputs):
        """Apply linear + norm + ReLU, then max-pool over dim 1.

        Returns the pooled tensor (dim 1 kept with size 1) when this is the
        last layer. Otherwise returns the per-point features concatenated on
        dim 2 with the pooled features repeated ``inputs.shape[1]`` times.
        """
        x = self.linear(inputs)

        # The norm runs with cuDNN disabled (presumably a workaround for the
        # target hardware -- TODO confirm). The caller's global setting is
        # restored afterwards, also on error, instead of being forced to True.
        cudnn_was_enabled = torch.backends.cudnn.enabled
        torch.backends.cudnn.enabled = False
        try:
            x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous()
        finally:
            torch.backends.cudnn.enabled = cudnn_was_enabled
        x = F.relu(x)

        x_max = torch.max(x, dim=1, keepdim=True)[0]

        if self.last_vfe:
            return x_max

        x_repeat = x_max.repeat(1, inputs.shape[1], 1)
        return torch.cat([x, x_repeat], dim=2)
+ """ + + super().__init__() + self.name = "PillarFeatureNet" + assert len(num_filters) > 0 + + self.num_input = num_input_features + num_input_features += 5 + if with_distance: + num_input_features += 1 + self._with_distance = with_distance + + # Create PillarFeatureNet layers + num_filters = [num_input_features] + list(num_filters) + pfn_layers = [] + for i in range(len(num_filters) - 1): + in_filters = num_filters[i] + out_filters = num_filters[i + 1] + if i < len(num_filters) - 2: + last_layer = False + else: + last_layer = True + pfn_layers.append( + PFNLayer( + in_filters, out_filters, norm_cfg=norm_cfg, last_layer=last_layer + ) + ) + self.pfn_layers = nn.ModuleList(pfn_layers) + + self.virtual = virtual + + # Need pillar (voxel) size and x/y offset in order to calculate pillar offset + self.vx = voxel_size[0] + self.vy = voxel_size[1] + self.x_offset = self.vx / 2 + pc_range[0] + self.y_offset = self.vy / 2 + pc_range[1] + + def forward(self, features, num_voxels, coors): + device = features.device + + if self.virtual: + virtual_point_mask = features[..., -2] == -1 + virtual_points = features[virtual_point_mask] + virtual_points[..., -2] = 1 + features[..., -2] = 0 + features[virtual_point_mask] = virtual_points + + dtype = features.dtype + # Find distance of x, y, and z from cluster center + # features = features[:, :, :self.num_input] + points_mean = features[:, :, :3].sum(dim=1, keepdim=True) / num_voxels.type_as( + features + ).view(-1, 1, 1) + f_cluster = features[:, :, :3] - points_mean + + # Find distance of x, y, and z from pillar center + # f_center = features[:, :, :2] + f_center = torch.zeros_like(features[:, :, :2]) + f_center[:, :, 0] = features[:, :, 0] - ( + coors[:, 3].to(dtype).unsqueeze(1) * self.vx + self.x_offset + ) + f_center[:, :, 1] = features[:, :, 1] - ( + coors[:, 2].to(dtype).unsqueeze(1) * self.vy + self.y_offset + ) + + # Combine together feature decorations + features_ls = [features, f_cluster, f_center] + if 
@BACKBONES.register_module
class PointPillarsScatter(nn.Module):
    def __init__(
        self, num_input_features=64, norm_cfg=None, name="PointPillarsScatter", **kwargs
    ):
        """
        Point Pillar's Scatter.
        Scatters per-pillar feature vectors onto a dense, zero-initialised canvas per sample. This replaces
        SECOND's second.pytorch.voxelnet.SparseMiddleExtractor.
        :param num_input_features: Number of feature channels per pillar (canvas channel count).
        :param norm_cfg: Accepted for config compatibility; unused here.
        :param name: Accepted for config compatibility; ``self.name`` is always "PointPillarsScatter".
        """

        super().__init__()
        self.name = "PointPillarsScatter"
        self.nchannels = num_input_features

    def forward(self, voxel_features, coords, batch_size, input_shape):
        """Return a (batch_size, nchannels, ny, nx) tensor of scattered features.

        ``nx`` and ``ny`` are taken from ``input_shape[0]`` and
        ``input_shape[1]`` and stored on the module. Column 0 of ``coords``
        selects the sample. The flat canvas position is
        ``coords[:, 2] * nx + coords[:, 3]``.
        """
        self.nx = input_shape[0]
        self.ny = input_shape[1]
        flat_size = self.nx * self.ny

        canvases = []
        for sample_idx in range(batch_size):
            # Rows of coords / voxel_features that belong to this sample.
            in_sample = coords[:, 0] == sample_idx
            sample_coords = coords[in_sample, :]

            flat_idx = sample_coords[:, 2] * self.nx + sample_coords[:, 3]
            flat_idx = flat_idx.type(torch.long)

            # Fresh zero canvas; positions without a pillar stay zero.
            canvas = torch.zeros(
                self.nchannels,
                flat_size,
                dtype=voxel_features.dtype,
                device=voxel_features.device,
            )
            canvas[:, flat_idx] = voxel_features[in_sample, :].t()
            canvases.append(canvas)

        # (batch, nchannels, ny * nx) -> (batch, nchannels, ny, nx)
        stacked = torch.stack(canvases, 0)
        return stacked.view(batch_size, self.nchannels, self.ny, self.nx)
of file diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/models/roi_heads/__init__.py b/cv/3d_detection/centerpoint/pytorch/det3d/models/roi_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..31856f05f9df704fb16aa2a3f48e021cd58fd12b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/models/roi_heads/__init__.py @@ -0,0 +1,7 @@ +from .roi_head_template import RoIHeadTemplate +from .roi_head import RoIHead + +__all__ = [ + 'RoIHeadTemplate', + 'RoIHead' +] diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/models/roi_heads/roi_head.py b/cv/3d_detection/centerpoint/pytorch/det3d/models/roi_heads/roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..a6e87ee81483891006d19e295a1cb7916ec6faf0 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/models/roi_heads/roi_head.py @@ -0,0 +1,111 @@ +# ------------------------------------------------------------------------------ +# Portions of this code are from +# OpenPCDet (https://github.com/open-mmlab/OpenPCDet) +# Licensed under the Apache License. 
@ROI_HEAD.register_module
class RoIHead(RoIHeadTemplate):
    """RoI refinement head made of 1x1 ``Conv1d`` layers.

    A shared stack described by ``model_cfg.SHARED_FC`` feeds two branches
    built with ``make_fc_layers``: a classification branch with ``num_class``
    outputs (``model_cfg.CLS_FC``) and a box-regression branch with
    ``code_size`` outputs (``model_cfg.REG_FC``).
    """

    def __init__(self, input_channels, model_cfg, num_class=1, code_size=7, add_box_param=False, test_cfg=None):
        """Build the shared layers and the two prediction branches.

        Args:
            input_channels: Channel count expected by the first shared conv.
                NOTE(review): when ``add_box_param`` is set, ``forward``
                appends the RoI box and score to the features, so this value
                presumably has to include those extra channels - confirm
                against the config.
            model_cfg: Config read for ``SHARED_FC``, ``CLS_FC``, ``REG_FC``
                and ``DP_RATIO`` (plus whatever the base class reads).
            num_class: Number of outputs of the classification branch.
            code_size: Number of outputs of the regression branch.
            add_box_param: Concatenate RoI boxes and scores to the RoI
                features in ``forward``.
            test_cfg: Stored on the instance; not used inside this class.
        """
        super().__init__(num_class=num_class, model_cfg=model_cfg)
        self.model_cfg = model_cfg
        self.test_cfg = test_cfg
        self.code_size = code_size
        self.add_box_param = add_box_param

        shared_fc = self.model_cfg.SHARED_FC
        last_index = len(shared_fc) - 1
        pre_channel = input_channels

        shared_fc_list = []
        for k, out_channel in enumerate(shared_fc):
            shared_fc_list.extend([
                nn.Conv1d(pre_channel, out_channel, kernel_size=1, bias=False),
                nn.BatchNorm1d(out_channel),
                nn.ReLU()
            ])
            pre_channel = out_channel

            # Dropout goes between shared layers only, never after the last one.
            if k != last_index and self.model_cfg.DP_RATIO > 0:
                shared_fc_list.append(nn.Dropout(self.model_cfg.DP_RATIO))

        self.shared_fc_layer = nn.Sequential(*shared_fc_list)

        self.cls_layers = self.make_fc_layers(
            input_channels=pre_channel, output_channels=self.num_class, fc_list=self.model_cfg.CLS_FC
        )
        self.reg_layers = self.make_fc_layers(
            input_channels=pre_channel,
            output_channels=code_size,
            fc_list=self.model_cfg.REG_FC
        )
        self.init_weights(weight_init='xavier')

    def init_weights(self, weight_init='xavier'):
        """Initialise every ``Conv1d`` / ``Conv2d`` weight and zero its bias.

        Args:
            weight_init: ``'kaiming'``, ``'xavier'`` or ``'normal'``.

        Raises:
            NotImplementedError: For any other ``weight_init`` value.
        """
        if weight_init == 'kaiming':
            init_func = nn.init.kaiming_normal_
        elif weight_init == 'xavier':
            init_func = nn.init.xavier_normal_
        elif weight_init == 'normal':
            init_func = nn.init.normal_
        else:
            raise NotImplementedError

        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Conv1d)):
                if weight_init == 'normal':
                    init_func(m.weight, mean=0, std=0.001)
                else:
                    init_func(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

        # The final regression conv is always re-drawn from a narrow normal,
        # whatever scheme was applied above.
        nn.init.normal_(self.reg_layers[-1].weight, mean=0, std=0.001)

    def forward(self, batch_dict, training=True):
        """Run the head on the RoIs stored in ``batch_dict``.

        Args:
            batch_dict: Dict holding at least ``'rois'``, ``'roi_features'``
                and ``'roi_scores'``. It is updated in place.
            training: Selects the code path. This is the argument, not the
                module's own ``self.training`` flag.

        Returns:
            The same ``batch_dict``. With ``training`` false it gains
            ``'batch_cls_preds'``, ``'batch_box_preds'`` and
            ``'cls_preds_normalized'``. With ``training`` true the raw
            predictions are stored in ``self.forward_ret_dict`` for the loss.
        """
        batch_dict['batch_size'] = len(batch_dict['rois'])
        if training:
            # Replace the proposals with the sampled subset and its targets.
            targets_dict = self.assign_targets(batch_dict)
            for key in ('rois', 'roi_labels', 'roi_features', 'roi_scores'):
                batch_dict[key] = targets_dict[key]

        if self.add_box_param:
            batch_dict['roi_features'] = torch.cat(
                [batch_dict['roi_features'], batch_dict['rois'], batch_dict['roi_scores'].unsqueeze(-1)],
                dim=-1
            )

        roi_features = batch_dict['roi_features']
        pooled_features = roi_features.reshape(-1, 1, roi_features.shape[-1]).contiguous()  # (BxN, 1, C)

        batch_size_rcnn = pooled_features.shape[0]
        pooled_features = pooled_features.permute(0, 2, 1).contiguous()  # (BxN, C, 1)

        shared_features = self.shared_fc_layer(pooled_features.view(batch_size_rcnn, -1, 1))
        rcnn_cls = self.cls_layers(shared_features).transpose(1, 2).contiguous().squeeze(dim=1)  # (B, 1 or 2)
        rcnn_reg = self.reg_layers(shared_features).transpose(1, 2).contiguous().squeeze(dim=1)  # (B, C)

        if not training:
            batch_cls_preds, batch_box_preds = self.generate_predicted_boxes(
                batch_size=batch_dict['batch_size'], rois=batch_dict['rois'], cls_preds=rcnn_cls, box_preds=rcnn_reg
            )
            batch_dict['batch_cls_preds'] = batch_cls_preds
            batch_dict['batch_box_preds'] = batch_box_preds
            batch_dict['cls_preds_normalized'] = False
        else:
            targets_dict['rcnn_cls'] = rcnn_cls
            targets_dict['rcnn_reg'] = rcnn_reg

            self.forward_ret_dict = targets_dict

        return batch_dict
def limit_period(val, offset=0.5, period=np.pi):
    """Wrap ``val`` into ``[-offset * period, (1 - offset) * period)``."""
    return val - torch.floor(val / period + offset) * period


class RoIHeadTemplate(nn.Module):
    """Base class for RoI heads: layer factory, target assignment, losses and box decoding.

    ``model_cfg`` is read for ``TARGET_CONFIG``, ``DP_RATIO`` and
    ``LOSS_CONFIG`` (``CLS_LOSS``, ``REG_LOSS``, ``LOSS_WEIGHTS``).
    """

    def __init__(self, num_class, model_cfg):
        super().__init__()
        self.model_cfg = model_cfg
        self.num_class = num_class
        self.proposal_target_layer = ProposalTargetLayer(roi_sampler_cfg=self.model_cfg.TARGET_CONFIG)

        # Filled by the subclass's forward pass and consumed by ``get_loss``.
        self.forward_ret_dict = None

    def make_fc_layers(self, input_channels, output_channels, fc_list):
        """Build a ``Conv1d``/``BatchNorm1d``/``ReLU`` stack ending in a biased 1x1 conv.

        Args:
            input_channels: Channels of the incoming feature.
            output_channels: Channels produced by the final conv.
            fc_list: Hidden widths, one per intermediate layer.

        Returns:
            ``nn.Sequential`` holding the layers.
        """
        fc_layers = []
        pre_channel = input_channels
        for k, out_channel in enumerate(fc_list):
            fc_layers.extend([
                nn.Conv1d(pre_channel, out_channel, kernel_size=1, bias=False),
                nn.BatchNorm1d(out_channel),
                nn.ReLU()
            ])
            pre_channel = out_channel
            # ``>= 0`` is kept on purpose: it inserts a Dropout even when the
            # ratio is 0, and dropping it would shift the indices of every
            # later module in the Sequential (and its state_dict keys).
            if self.model_cfg.DP_RATIO >= 0 and k == 0:
                fc_layers.append(nn.Dropout(self.model_cfg.DP_RATIO))
        fc_layers.append(nn.Conv1d(pre_channel, output_channels, kernel_size=1, bias=True))
        return nn.Sequential(*fc_layers)

    def assign_targets(self, batch_dict):
        """Sample RoIs and express their matched ground truth relative to each RoI.

        Args:
            batch_dict: Must contain ``'batch_size'`` plus everything the
                proposal target layer reads.

        Returns:
            The target dict from the proposal target layer, with
            ``'gt_of_rois'`` rewritten as RoI-relative residuals and the
            untouched copy kept under ``'gt_of_rois_src'``.
        """
        batch_size = batch_dict['batch_size']
        with torch.no_grad():
            targets_dict = self.proposal_target_layer.forward(batch_dict)

        rois = targets_dict['rois']  # (B, N, 7 + C)
        gt_of_rois = targets_dict['gt_of_rois']  # (B, N, 7 + C + 1)
        targets_dict['gt_of_rois_src'] = gt_of_rois.clone().detach()

        roi_ry = limit_period(rois[:, :, 6], offset=0.5, period=np.pi * 2)

        # Residuals of centre/size (first six values) and heading w.r.t. the RoI.
        gt_of_rois[:, :, :6] = gt_of_rois[:, :, :6] - rois[:, :, :6]
        gt_of_rois[:, :, 6] = gt_of_rois[:, :, 6] - roi_ry

        # NOTE(review): presumably rotates the xyz part by -roi_ry about z;
        # the helper is defined elsewhere - confirm.
        gt_of_rois = box_torch_ops.rotate_points_along_z(
            points=gt_of_rois.view(-1, 1, gt_of_rois.shape[-1]), angle=-roi_ry.view(-1)
        ).view(batch_size, -1, gt_of_rois.shape[-1])

        if rois.shape[-1] == 9:
            # Only the residual of the two extra RoI values is taken here. The
            # code that rotated this residual into the RoI frame was disabled
            # in the original and is not restored.
            gt_of_rois[:, :, 7:-1] = gt_of_rois[:, :, 7:-1] - rois[:, :, 7:]

        # flip orientation if rois have opposite orientation
        heading_label = gt_of_rois[:, :, 6] % (2 * np.pi)  # 0 ~ 2pi
        opposite_flag = (heading_label > np.pi * 0.5) & (heading_label < np.pi * 1.5)
        heading_label[opposite_flag] = (heading_label[opposite_flag] + np.pi) % (2 * np.pi)  # (0 ~ pi/2, 3pi/2 ~ 2pi)
        flag = heading_label > np.pi
        heading_label[flag] = heading_label[flag] - np.pi * 2  # (-pi/2, pi/2)
        heading_label = torch.clamp(heading_label, min=-np.pi / 2, max=np.pi / 2)

        gt_of_rois[:, :, 6] = heading_label

        targets_dict['gt_of_rois'] = gt_of_rois
        return targets_dict

    def get_box_reg_layer_loss(self, forward_ret_dict):
        """Weighted L1 regression loss averaged over foreground RoIs.

        Args:
            forward_ret_dict: Needs ``'rcnn_reg'``, ``'reg_valid_mask'`` and
                ``'gt_of_rois'``.

        Returns:
            ``(loss, tb_dict)`` where ``tb_dict['rcnn_loss_reg']`` is the
            detached loss.

        Raises:
            NotImplementedError: If ``LOSS_CONFIG.REG_LOSS`` is not ``'L1'``.
        """
        loss_cfgs = self.model_cfg.LOSS_CONFIG
        code_size = forward_ret_dict['rcnn_reg'].shape[-1]
        reg_valid_mask = forward_ret_dict['reg_valid_mask'].view(-1)
        gt_boxes3d_ct = forward_ret_dict['gt_of_rois'][..., 0:code_size]
        rcnn_reg = forward_ret_dict['rcnn_reg']  # (rcnn_batch_size, C)
        rcnn_batch_size = gt_boxes3d_ct.view(-1, code_size).shape[0]

        fg_mask = (reg_valid_mask > 0)
        fg_sum = fg_mask.long().sum().item()

        tb_dict = {}

        if loss_cfgs.REG_LOSS == 'L1':
            reg_targets = gt_boxes3d_ct.view(rcnn_batch_size, -1)
            rcnn_loss_reg = F.l1_loss(
                rcnn_reg.view(rcnn_batch_size, -1),
                reg_targets,
                reduction='none'
            )  # (rcnn_batch_size, code_size)

            rcnn_loss_reg = rcnn_loss_reg * rcnn_loss_reg.new_tensor(
                loss_cfgs.LOSS_WEIGHTS['code_weights'])

            # Sum over foreground rows only; ``max(fg_sum, 1)`` avoids 0/0
            # when a batch has no foreground RoI.
            rcnn_loss_reg = (rcnn_loss_reg.view(rcnn_batch_size, -1) * fg_mask.unsqueeze(dim=-1).float()).sum() / max(fg_sum, 1)
            rcnn_loss_reg = rcnn_loss_reg * loss_cfgs.LOSS_WEIGHTS['rcnn_reg_weight']
            tb_dict['rcnn_loss_reg'] = rcnn_loss_reg.detach()
        else:
            raise NotImplementedError

        return rcnn_loss_reg, tb_dict

    def get_box_cls_layer_loss(self, forward_ret_dict):
        """Classification loss averaged over RoIs whose label is not negative.

        Args:
            forward_ret_dict: Needs ``'rcnn_cls'`` (raw logits) and
                ``'rcnn_cls_labels'``; negative labels are ignored.

        Returns:
            ``(loss, tb_dict)`` where ``tb_dict['rcnn_loss_cls']`` is the
            detached loss.

        Raises:
            NotImplementedError: If ``LOSS_CONFIG.CLS_LOSS`` is neither
                ``'BinaryCrossEntropy'`` nor ``'CrossEntropy'``.
        """
        loss_cfgs = self.model_cfg.LOSS_CONFIG
        rcnn_cls = forward_ret_dict['rcnn_cls']
        rcnn_cls_labels = forward_ret_dict['rcnn_cls_labels'].view(-1)
        if loss_cfgs.CLS_LOSS == 'BinaryCrossEntropy':
            rcnn_cls_flat = rcnn_cls.view(-1)
            # Work on logits directly: sigmoid followed by BCE saturates in
            # float32 for large |logit| and yields a clamped loss with zero
            # gradient.
            batch_loss_cls = F.binary_cross_entropy_with_logits(
                rcnn_cls_flat, rcnn_cls_labels.float(), reduction='none'
            )
            cls_valid_mask = (rcnn_cls_labels >= 0).float()
            rcnn_loss_cls = (batch_loss_cls * cls_valid_mask).sum() / torch.clamp(cls_valid_mask.sum(), min=1.0)
        elif loss_cfgs.CLS_LOSS == 'CrossEntropy':
            batch_loss_cls = F.cross_entropy(rcnn_cls, rcnn_cls_labels, reduction='none', ignore_index=-1)
            cls_valid_mask = (rcnn_cls_labels >= 0).float()
            rcnn_loss_cls = (batch_loss_cls * cls_valid_mask).sum() / torch.clamp(cls_valid_mask.sum(), min=1.0)
        else:
            raise NotImplementedError

        rcnn_loss_cls = rcnn_loss_cls * loss_cfgs.LOSS_WEIGHTS['rcnn_cls_weight']
        tb_dict = {'rcnn_loss_cls': rcnn_loss_cls.detach()}
        return rcnn_loss_cls, tb_dict

    def get_loss(self, tb_dict=None):
        """Sum the classification and regression losses of the last forward pass.

        Args:
            tb_dict: Optional dict to extend with the logged scalars.

        Returns:
            ``(rcnn_loss, tb_dict)``; ``tb_dict['rcnn_loss']`` is a float.
        """
        tb_dict = {} if tb_dict is None else tb_dict
        rcnn_loss = 0
        rcnn_loss_cls, cls_tb_dict = self.get_box_cls_layer_loss(self.forward_ret_dict)
        rcnn_loss += rcnn_loss_cls
        tb_dict.update(cls_tb_dict)

        rcnn_loss_reg, reg_tb_dict = self.get_box_reg_layer_loss(self.forward_ret_dict)
        rcnn_loss += rcnn_loss_reg
        tb_dict.update(reg_tb_dict)
        tb_dict['rcnn_loss'] = rcnn_loss.item()
        return rcnn_loss, tb_dict

    def generate_predicted_boxes(self, batch_size, rois, cls_preds, box_preds):
        """Decode RoI-relative box predictions back into the frame of ``rois``.

        Args:
            batch_size: Number of samples in the batch.
            rois: (B, N, 7)
            cls_preds: (BN, num_class)
            box_preds: (BN, code_size)

        Returns:
            ``(batch_cls_preds, batch_box_preds)`` reshaped to
            ``(B, N, num_class or 1)`` and ``(B, N, code_size)``.
        """
        code_size = box_preds.shape[-1]
        # batch_cls_preds: (B, N, num_class or 1)
        batch_cls_preds = cls_preds.view(batch_size, -1, cls_preds.shape[-1])
        batch_box_preds = box_preds.view(batch_size, -1, code_size)

        roi_ry = rois[:, :, 6].view(-1)
        roi_xyz = rois[:, :, 0:3].view(-1, 3)

        # Add the RoI's own values with its centre zeroed, rotate by the RoI
        # heading, then shift by the RoI centre.
        local_rois = rois.clone().detach()
        local_rois[:, :, 0:3] = 0

        batch_box_preds = (batch_box_preds + local_rois).view(-1, code_size)
        batch_box_preds = box_torch_ops.rotate_points_along_z(
            batch_box_preds.unsqueeze(dim=1), roi_ry
        ).squeeze(dim=1)

        batch_box_preds[:, 0:3] += roi_xyz
        batch_box_preds = batch_box_preds.view(batch_size, -1, code_size)

        return batch_cls_preds, batch_box_preds
class ProposalTargetLayer(nn.Module):
    """Sample a fixed number of RoIs per sample and build second-stage targets.

    ``roi_sampler_cfg`` is read by attribute for ``ROI_PER_IMAGE``,
    ``FG_RATIO``, ``REG_FG_THRESH``, ``CLS_FG_THRESH``, ``CLS_BG_THRESH``,
    ``CLS_BG_THRESH_LO``, ``HARD_BG_RATIO`` and ``CLS_SCORE_TYPE``, and
    through ``.get`` for the optional ``SAMPLE_ROI_BY_EACH_CLASS`` flag.
    """

    def __init__(self, roi_sampler_cfg):
        super().__init__()
        self.roi_sampler_cfg = roi_sampler_cfg

    def forward(self, batch_dict):
        """Sample RoIs and derive regression masks and classification labels.

        Args:
            batch_dict:
                batch_size:
                rois: (B, num_rois, 7 + C)
                roi_scores: (B, num_rois)
                gt_boxes_and_cls: (B, N, 7 + C + 1)
                roi_labels: (B, num_rois)
                roi_features: (B, num_rois, F)
        Returns:
            targets_dict:
                rois: (B, M, 7 + C)
                gt_of_rois: (B, M, 7 + C + 1)
                gt_iou_of_rois: (B, M)
                roi_scores: (B, M)
                roi_labels: (B, M)
                roi_features: (B, M, F)
                reg_valid_mask: (B, M)
                rcnn_cls_labels: (B, M)
        Raises:
            NotImplementedError: If ``CLS_SCORE_TYPE`` is neither ``'cls'``
                nor ``'roi_iou'``.
        """
        batch_rois, batch_gt_of_rois, batch_roi_ious, batch_roi_scores, batch_roi_labels, \
            batch_roi_features = self.sample_rois_for_rcnn(batch_dict=batch_dict)

        # regression valid mask
        reg_valid_mask = (batch_roi_ious > self.roi_sampler_cfg.REG_FG_THRESH).long()

        # classification label
        if self.roi_sampler_cfg.CLS_SCORE_TYPE == 'cls':
            # Hard labels: 1 above the fg threshold, 0 below the bg threshold,
            # -1 (ignored by the loss) in between.
            batch_cls_labels = (batch_roi_ious > self.roi_sampler_cfg.CLS_FG_THRESH).long()
            ignore_mask = (batch_roi_ious > self.roi_sampler_cfg.CLS_BG_THRESH) & \
                          (batch_roi_ious < self.roi_sampler_cfg.CLS_FG_THRESH)
            batch_cls_labels[ignore_mask > 0] = -1
        elif self.roi_sampler_cfg.CLS_SCORE_TYPE == 'roi_iou':
            # Soft labels: IoU rescaled linearly between the two thresholds.
            iou_bg_thresh = self.roi_sampler_cfg.CLS_BG_THRESH
            iou_fg_thresh = self.roi_sampler_cfg.CLS_FG_THRESH
            fg_mask = batch_roi_ious > iou_fg_thresh
            bg_mask = batch_roi_ious < iou_bg_thresh
            interval_mask = (fg_mask == 0) & (bg_mask == 0)

            batch_cls_labels = (fg_mask > 0).float()
            batch_cls_labels[interval_mask] = \
                (batch_roi_ious[interval_mask] - iou_bg_thresh) / (iou_fg_thresh - iou_bg_thresh)
        else:
            raise NotImplementedError

        targets_dict = {'rois': batch_rois, 'gt_of_rois': batch_gt_of_rois, 'gt_iou_of_rois': batch_roi_ious,
                        'roi_scores': batch_roi_scores, 'roi_labels': batch_roi_labels,
                        'roi_features': batch_roi_features, 'reg_valid_mask': reg_valid_mask,
                        'rcnn_cls_labels': batch_cls_labels}

        return targets_dict

    def sample_rois_for_rcnn(self, batch_dict):
        """Match RoIs to ground truth by 3D IoU and sample ``ROI_PER_IMAGE`` of them.

        Args:
            batch_dict:
                batch_size:
                rois: (B, num_rois, 7 + C)
                roi_scores: (B, num_rois)
                gt_boxes_and_cls: (B, N, 7 + C + 1)
                roi_labels: (B, num_rois)
                roi_features: (B, num_rois, F)
        Returns:
            Tuple ``(rois, gt_of_rois, roi_ious, roi_scores, roi_labels,
            roi_features)``, each with ``ROI_PER_IMAGE`` entries per sample.
        """
        batch_size = batch_dict['batch_size']
        rois = batch_dict['rois']
        roi_scores = batch_dict['roi_scores']
        roi_labels = batch_dict['roi_labels']
        gt_boxes = batch_dict['gt_boxes_and_cls']
        roi_features = batch_dict['roi_features']

        rois_per_image = self.roi_sampler_cfg.ROI_PER_IMAGE
        code_size = rois.shape[-1]
        batch_rois = rois.new_zeros(batch_size, rois_per_image, code_size)
        batch_gt_of_rois = rois.new_zeros(batch_size, rois_per_image, code_size + 1)
        batch_roi_ious = rois.new_zeros(batch_size, rois_per_image)
        batch_roi_scores = rois.new_zeros(batch_size, rois_per_image)
        batch_roi_labels = rois.new_zeros((batch_size, rois_per_image), dtype=torch.long)
        batch_roi_features = roi_features.new_zeros(batch_size, rois_per_image, roi_features.shape[-1])

        for index in range(batch_size):
            cur_roi, cur_gt, cur_roi_labels, cur_roi_scores, cur_roi_features = \
                rois[index], gt_boxes[index], roi_labels[index], roi_scores[index], \
                roi_features[index]

            # Drop trailing all-zero (padding) ground-truth rows, but always
            # keep at least one row so the IoU matrix is never empty.
            k = len(cur_gt) - 1
            while k > 0 and cur_gt[k].sum() == 0:
                k -= 1
            cur_gt = cur_gt[:k + 1]
            cur_gt = cur_gt.new_zeros((1, cur_gt.shape[1])) if len(cur_gt) == 0 else cur_gt

            if self.roi_sampler_cfg.get('SAMPLE_ROI_BY_EACH_CLASS', False):
                max_overlaps, gt_assignment = self.get_max_iou_with_same_class(
                    rois=cur_roi[:, :7], roi_labels=cur_roi_labels,
                    gt_boxes=cur_gt[:, 0:7], gt_labels=cur_gt[:, -1].long()
                )
            else:
                # Pass only the first seven box values, as the class-aware
                # branch above does, so both paths feed the IoU helper the
                # same layout.
                iou3d = boxes_iou3d_gpu(cur_roi[:, 0:7], cur_gt[:, 0:7])  # (M, N)
                max_overlaps, gt_assignment = torch.max(iou3d, dim=1)

            sampled_inds = self.subsample_rois(max_overlaps=max_overlaps)

            batch_rois[index] = cur_roi[sampled_inds]
            batch_roi_labels[index] = cur_roi_labels[sampled_inds]
            batch_roi_ious[index] = max_overlaps[sampled_inds]
            batch_roi_scores[index] = cur_roi_scores[sampled_inds]
            batch_gt_of_rois[index] = cur_gt[gt_assignment[sampled_inds]]
            batch_roi_features[index] = cur_roi_features[sampled_inds]

        return batch_rois, batch_gt_of_rois, batch_roi_ious, batch_roi_scores, batch_roi_labels, batch_roi_features

    def subsample_rois(self, max_overlaps):
        """Pick ``ROI_PER_IMAGE`` RoI indices split into foreground and background.

        Args:
            max_overlaps: 1-D tensor with the best IoU of every RoI.

        Returns:
            1-D long tensor of ``ROI_PER_IMAGE`` indices into
            ``max_overlaps``, foreground first. Indices may repeat.

        Raises:
            NotImplementedError: If no RoI falls in either group.
        """
        # sample fg, easy_bg, hard_bg
        fg_rois_per_image = int(np.round(self.roi_sampler_cfg.FG_RATIO * self.roi_sampler_cfg.ROI_PER_IMAGE))
        fg_thresh = min(self.roi_sampler_cfg.REG_FG_THRESH, self.roi_sampler_cfg.CLS_FG_THRESH)

        fg_inds = (max_overlaps >= fg_thresh).nonzero().view(-1)
        easy_bg_inds = (max_overlaps < self.roi_sampler_cfg.CLS_BG_THRESH_LO).nonzero().view(-1)
        hard_bg_inds = ((max_overlaps < self.roi_sampler_cfg.REG_FG_THRESH) &
                        (max_overlaps >= self.roi_sampler_cfg.CLS_BG_THRESH_LO)).nonzero().view(-1)

        fg_num_rois = fg_inds.numel()
        bg_num_rois = hard_bg_inds.numel() + easy_bg_inds.numel()

        if fg_num_rois > 0 and bg_num_rois > 0:
            # sampling fg
            fg_rois_per_this_image = min(fg_rois_per_image, fg_num_rois)

            rand_num = torch.from_numpy(np.random.permutation(fg_num_rois)).type_as(max_overlaps).long()
            fg_inds = fg_inds[rand_num[:fg_rois_per_this_image]]

            # sampling bg
            bg_rois_per_this_image = self.roi_sampler_cfg.ROI_PER_IMAGE - fg_rois_per_this_image
            bg_inds = self.sample_bg_inds(
                hard_bg_inds, easy_bg_inds, bg_rois_per_this_image, self.roi_sampler_cfg.HARD_BG_RATIO
            )

        elif fg_num_rois > 0 and bg_num_rois == 0:
            # sampling fg (with replacement, to fill the whole quota)
            rand_num = np.floor(np.random.rand(self.roi_sampler_cfg.ROI_PER_IMAGE) * fg_num_rois)
            rand_num = torch.from_numpy(rand_num).type_as(max_overlaps).long()
            fg_inds = fg_inds[rand_num]
            # Empty tensor with fg_inds' dtype and device; a plain list here
            # makes the torch.cat below raise.
            bg_inds = fg_inds[:0]

        elif bg_num_rois > 0 and fg_num_rois == 0:
            # sampling bg
            bg_rois_per_this_image = self.roi_sampler_cfg.ROI_PER_IMAGE
            bg_inds = self.sample_bg_inds(
                hard_bg_inds, easy_bg_inds, bg_rois_per_this_image, self.roi_sampler_cfg.HARD_BG_RATIO
            )
        else:
            # min()/max() raise on an empty tensor; skip that diagnostic so the
            # caller still gets the documented NotImplementedError.
            if max_overlaps.numel() > 0:
                print('maxoverlaps:(min=%f, max=%f)' % (max_overlaps.min().item(), max_overlaps.max().item()))
            print('ERROR: FG=%d, BG=%d' % (fg_num_rois, bg_num_rois))
            raise NotImplementedError

        sampled_inds = torch.cat((fg_inds, bg_inds), dim=0)
        return sampled_inds

    @staticmethod
    def sample_bg_inds(hard_bg_inds, easy_bg_inds, bg_rois_per_this_image, hard_bg_ratio):
        """Draw background indices, with replacement, from the hard and easy pools.

        Args:
            hard_bg_inds: 1-D index tensor of hard background RoIs.
            easy_bg_inds: 1-D index tensor of easy background RoIs.
            bg_rois_per_this_image: Number of indices to return.
            hard_bg_ratio: Target share of hard samples when both pools are
                non-empty (capped by the hard pool size).

        Returns:
            1-D tensor with ``bg_rois_per_this_image`` indices.

        Raises:
            NotImplementedError: If both pools are empty.
        """
        num_hard = hard_bg_inds.numel()
        num_easy = easy_bg_inds.numel()

        if num_hard > 0 and num_easy > 0:
            hard_bg_rois_num = min(int(bg_rois_per_this_image * hard_bg_ratio), len(hard_bg_inds))
            easy_bg_rois_num = bg_rois_per_this_image - hard_bg_rois_num

            # sampling hard bg
            rand_idx = torch.randint(low=0, high=num_hard, size=(hard_bg_rois_num,)).long()
            hard_bg_inds = hard_bg_inds[rand_idx]

            # sampling easy bg
            rand_idx = torch.randint(low=0, high=num_easy, size=(easy_bg_rois_num,)).long()
            easy_bg_inds = easy_bg_inds[rand_idx]

            bg_inds = torch.cat([hard_bg_inds, easy_bg_inds], dim=0)
        elif num_hard > 0 and num_easy == 0:
            # sampling hard bg
            rand_idx = torch.randint(low=0, high=num_hard, size=(bg_rois_per_this_image,)).long()
            bg_inds = hard_bg_inds[rand_idx]
        elif num_hard == 0 and num_easy > 0:
            # sampling easy bg
            rand_idx = torch.randint(low=0, high=num_easy, size=(bg_rois_per_this_image,)).long()
            bg_inds = easy_bg_inds[rand_idx]
        else:
            raise NotImplementedError

        return bg_inds

    @staticmethod
    def get_max_iou_with_same_class(rois, roi_labels, gt_boxes, gt_labels):
        """Best IoU of each RoI against ground-truth boxes of its own class.

        Args:
            rois: (N, 7)
            roi_labels: (N)
            gt_boxes: (M, 7)
            gt_labels: (M)

        Returns:
            ``(max_overlaps, gt_assignment)``, both of length N. RoIs whose
            class has no ground truth keep IoU 0 and assignment 0.
        """
        max_overlaps = rois.new_zeros(rois.shape[0])
        gt_assignment = roi_labels.new_zeros(roi_labels.shape[0])

        for k in range(gt_labels.min().item(), gt_labels.max().item() + 1):
            roi_mask = (roi_labels == k)
            gt_mask = (gt_labels == k)
            if roi_mask.sum() > 0 and gt_mask.sum() > 0:
                cur_roi = rois[roi_mask]
                cur_gt = gt_boxes[gt_mask]
                # Maps positions in the per-class subset back to rows of gt_boxes.
                original_gt_assignment = gt_mask.nonzero().view(-1)

                iou3d = boxes_iou3d_gpu(cur_roi, cur_gt)  # (M, N)
                cur_max_overlaps, cur_gt_assignment = torch.max(iou3d, dim=1)
                max_overlaps[roi_mask] = cur_max_overlaps
                gt_assignment[roi_mask] = original_gt_assignment[cur_gt_assignment]

        return max_overlaps, gt_assignment
+ self.voxel_size = voxel_size + self.out_stride = out_stride + + def absl_to_relative(self, absolute): + a1 = (absolute[..., 0] - self.pc_start[0]) / self.voxel_size[0] / self.out_stride + a2 = (absolute[..., 1] - self.pc_start[1]) / self.voxel_size[1] / self.out_stride + + return a1, a2 + + def forward(self, example, batch_centers, num_point): + batch_size = len(example['bev_feature']) + ret_maps = [] + + for batch_idx in range(batch_size): + xs, ys = self.absl_to_relative(batch_centers[batch_idx]) + + # N x C + feature_map = bilinear_interpolate_torch(example['bev_feature'][batch_idx], + xs, ys) + + if num_point > 1: + section_size = len(feature_map) // num_point + feature_map = torch.cat([feature_map[i*section_size: (i+1)*section_size] for i in range(num_point)], dim=1) + + ret_maps.append(feature_map) + + return ret_maps \ No newline at end of file diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/models/utils/__init__.py b/cv/3d_detection/centerpoint/pytorch/det3d/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e436f92d91a82a85b2fb477dc32b3da52b7c6be5 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/models/utils/__init__.py @@ -0,0 +1,45 @@ +from .conv_module import ConvModule, build_conv_layer +from .conv_ws import ConvWS2d, conv_ws_2d +from .misc import ( + Empty, + GroupNorm, + Sequential, + change_default_args, + get_kw_to_default_map, + get_paddings_indicator, + get_pos_to_kw_map, + get_printer, + register_hook, +) +from .norm import build_norm_layer +from .scale import Scale +from .weight_init import ( + bias_init_with_prob, + kaiming_init, + normal_init, + uniform_init, + xavier_init, +) + +__all__ = [ + "conv_ws_2d", + "ConvWS2d", + "build_conv_layer", + "ConvModule", + "build_norm_layer", + "xavier_init", + "normal_init", + "uniform_init", + "kaiming_init", + "bias_init_with_prob", + "Scale", + "Sequential", + "GroupNorm", + "Empty", + "get_pos_to_kw_map", + "get_kw_to_default_map", + 
"change_default_args", + "get_printer", + "register_hook", + "get_paddings_indicator", +] diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/models/utils/conv_module.py b/cv/3d_detection/centerpoint/pytorch/det3d/models/utils/conv_module.py new file mode 100644 index 0000000000000000000000000000000000000000..ae659a55e70e4a305f3150ced6c1b15754c798c9 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/models/utils/conv_module.py @@ -0,0 +1,165 @@ +import warnings + +import torch.nn as nn +from det3d.torchie.cnn import constant_init, kaiming_init + +from .conv_ws import ConvWS2d +from .norm import build_norm_layer + +conv_cfg = { + "Conv": nn.Conv2d, + "ConvWS": ConvWS2d, + # TODO: octave conv +} + + +def build_conv_layer(cfg, *args, **kwargs): + """ Build convolution layer + Args: + cfg (None or dict): cfg should contain: + type (str): identify conv layer type. + layer args: args needed to instantiate a conv layer. + Returns: + layer (nn.Module): created conv layer + """ + if cfg is None: + cfg_ = dict(type="Conv") + else: + assert isinstance(cfg, dict) and "type" in cfg + cfg_ = cfg.copy() + + layer_type = cfg_.pop("type") + if layer_type not in conv_cfg: + raise KeyError("Unrecognized norm type {}".format(layer_type)) + else: + conv_layer = conv_cfg[layer_type] + + layer = conv_layer(*args, **kwargs, **cfg_) + + return layer + + +class ConvModule(nn.Module): + """A conv block that contains conv/norm/activation layers. + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int or tuple[int]): Same as nn.Conv2d. + stride (int or tuple[int]): Same as nn.Conv2d. + padding (int or tuple[int]): Same as nn.Conv2d. + dilation (int or tuple[int]): Same as nn.Conv2d. + groups (int): Same as nn.Conv2d. + bias (bool or str): If specified as `auto`, it will be decided by the + norm_cfg. Bias will be set as True if norm_cfg is None, otherwise + False. + conv_cfg (dict): Config dict for convolution layer. 
+ norm_cfg (dict): Config dict for normalization layer. + activation (str or None): Activation type, "ReLU" by default. + inplace (bool): Whether to use inplace mode for activation. + order (tuple[str]): The order of conv/norm/activation layers. It is a + sequence of "conv", "norm" and "act". Examples are + ("conv", "norm", "act") and ("act", "conv", "norm"). + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias="auto", + conv_cfg=None, + norm_cfg=None, + activation="relu", + inplace=True, + order=("conv", "norm", "act"), + ): + super(ConvModule, self).__init__() + assert conv_cfg is None or isinstance(conv_cfg, dict) + assert norm_cfg is None or isinstance(norm_cfg, dict) + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.activation = activation + self.inplace = inplace + self.order = order + assert isinstance(self.order, tuple) and len(self.order) == 3 + assert set(order) == set(["conv", "norm", "act"]) + + self.with_norm = norm_cfg is not None + self.with_activatation = activation is not None + # if the conv layer is before a norm layer, bias is unnecessary. 
+ if bias == "auto": + bias = False if self.with_norm else True + self.with_bias = bias + + if self.with_norm and self.with_bias: + warnings.warn("ConvModule has norm and bias at the same time") + + # build convolution layer + self.conv = build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + # export the attributes of self.conv to a higher level for convenience + self.in_channels = self.conv.in_channels + self.out_channels = self.conv.out_channels + self.kernel_size = self.conv.kernel_size + self.stride = self.conv.stride + self.padding = self.conv.padding + self.dilation = self.conv.dilation + self.transposed = self.conv.transposed + self.output_padding = self.conv.output_padding + self.groups = self.conv.groups + + # build normalization layers + if self.with_norm: + # norm layer is after conv layer + if order.index("norm") > order.index("conv"): + norm_channels = out_channels + else: + norm_channels = in_channels + self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) + self.add_module(self.norm_name, norm) + + # build activation layer + if self.with_activatation: + # TODO: introduce `act_cfg` and supports more activation layers + if self.activation not in ["relu"]: + raise ValueError( + "{} is currently not supported.".format(self.activation) + ) + if self.activation == "relu": + self.activate = nn.ReLU(inplace=inplace) + + # Use msra init by default + self.init_weights() + + @property + def norm(self): + return getattr(self, self.norm_name) + + def init_weights(self): + nonlinearity = "relu" if self.activation is None else self.activation + kaiming_init(self.conv, nonlinearity=nonlinearity) + if self.with_norm: + constant_init(self.norm, 1, bias=0) + + def forward(self, x, activate=True, norm=True): + for layer in self.order: + if layer == "conv": + x = self.conv(x) + elif layer == "norm" and norm and self.with_norm: + x = self.norm(x) + 
elif layer == "act" and activate and self.with_activatation: + x = self.activate(x) + return x diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/models/utils/conv_ws.py b/cv/3d_detection/centerpoint/pytorch/det3d/models/utils/conv_ws.py new file mode 100644 index 0000000000000000000000000000000000000000..d7abd92dc744b820a4996a0db3db7bebde3e1665 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/models/utils/conv_ws.py @@ -0,0 +1,51 @@ +import torch.nn as nn +import torch.nn.functional as F + + +def conv_ws_2d( + input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1, eps=1e-5 +): + c_in = weight.size(0) + weight_flat = weight.view(c_in, -1) + mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) + std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1) + weight = (weight - mean) / (std + eps) + return F.conv2d(input, weight, bias, stride, padding, dilation, groups) + + +class ConvWS2d(nn.Conv2d): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + eps=1e-5, + ): + super(ConvWS2d, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + self.eps = eps + + def forward(self, x): + return conv_ws_2d( + x, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.groups, + self.eps, + ) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/models/utils/finetune_utils.py b/cv/3d_detection/centerpoint/pytorch/det3d/models/utils/finetune_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..77ee0d4f3374aae5b4ad9ed22f10ee7a712685b7 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/models/utils/finetune_utils.py @@ -0,0 +1,111 @@ +import torch +import torch.distributed as dist +from torch import nn +from torch.autograd.function import Function +from torch.nn import functional as F +import 
class FrozenBatchNorm2d(nn.Module):
    """
    BatchNorm where the batch statistics and the affine parameters are fixed.

    It contains non-trainable buffers called
    "weight" and "bias", "running_mean", "running_var",
    initialized to perform identity transformation.

    The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
    which are computed from the original four parameters of BN.
    The affine transform `x * weight + bias` will perform the equivalent
    computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
    When loading a backbone model from Caffe2, "running_mean" and "running_var"
    will be left unchanged as identity transformation.

    Other pre-trained backbone models may contain all 4 parameters.

    The no-grad forward path is implemented by `F.batch_norm(..., training=False)`;
    the grad path applies the same affine transform manually. Both paths
    normalize over dim 1 and accept any input rank >= 2, which matters because
    `convert_frozen_batchnorm` also converts `BatchNorm1d` layers.
    """

    # Bumped whenever the meaning of the stored buffers changes; consulted by
    # `_load_from_state_dict` to upgrade older checkpoints.
    _version = 3

    def __init__(self, num_features, eps=1e-5):
        """
        Args:
            num_features (int): number of channels (size of dim 1 of the input).
            eps (float): value added to the variance for numerical stability.
        """
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.register_buffer("weight", torch.ones(num_features))
        self.register_buffer("bias", torch.zeros(num_features))
        self.register_buffer("running_mean", torch.zeros(num_features))
        # `- eps` so that (running_var + eps) == 1, i.e. an exact identity.
        self.register_buffer("running_var", torch.ones(num_features) - eps)

    def forward(self, x):
        """Normalize `x` with the frozen statistics and affine parameters."""
        if x.requires_grad:
            # When gradients are needed, F.batch_norm will use extra memory
            # because its backward op computes gradients for weight/bias as well.
            scale = self.weight * (self.running_var + self.eps).rsqrt()
            bias = self.bias - self.running_mean * scale
            # Broadcast along the channel axis for whatever rank `x` has.
            # A fixed (1, -1, 1, 1) shape is only right for 4-D inputs and
            # silently produces a wrongly shaped result for (N, C) / (N, C, L).
            channel_shape = [1, -1] + [1] * (x.dim() - 2)
            return x * scale.reshape(channel_shape) + bias.reshape(channel_shape)

        # When gradients are not needed, F.batch_norm is a single fused op
        # and provide more optimization opportunities.
        return F.batch_norm(
            x,
            self.running_mean,
            self.running_var,
            self.weight,
            self.bias,
            training=False,
            eps=self.eps,
        )

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        """Upgrade older checkpoints in place, then defer to `nn.Module`."""
        version = local_metadata.get("version", None)

        if version is None or version < 2:
            # No running_mean/var in early versions
            # This will silent the warnings
            if prefix + "running_mean" not in state_dict:
                state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
            if prefix + "running_var" not in state_dict:
                state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)

        if version is not None and version < 3:
            logger = logging.getLogger(__name__)
            logger.info("FrozenBatchNorm {} is upgraded to version 3.".format(prefix.rstrip(".")))
            # In version < 3, running_var are used without +eps.
            state_dict[prefix + "running_var"] -= self.eps

        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

    def __repr__(self):
        return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)

    @classmethod
    def convert_frozen_batchnorm(cls, module):
        """
        Convert BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.

        Args:
            module (torch.nn.Module):

        Returns:
            If module is BatchNorm/SyncBatchNorm, returns a new module.
            Otherwise, in-place convert module and return it.

        Similar to convert_sync_batchnorm in
        https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
        """
        bn_module = nn.modules.batchnorm
        bn_module = (bn_module.BatchNorm1d, bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
        res = module
        if isinstance(module, bn_module):
            res = cls(module.num_features)
            if module.affine:
                res.weight.data = module.weight.data.clone().detach()
                res.bias.data = module.bias.data.clone().detach()
            # Running statistics share storage with the source module.
            res.running_mean.data = module.running_mean.data
            res.running_var.data = module.running_var.data
            res.eps = module.eps
        else:
            # Recurse; only re-register children that were actually replaced.
            for name, child in module.named_children():
                new_child = cls.convert_frozen_batchnorm(child)
                if new_child is not child:
                    res.add_module(name, new_child)
        return res
class GroupNorm(torch.nn.GroupNorm):
    """``torch.nn.GroupNorm`` with the channel count as the first argument.

    The parent class takes ``(num_groups, num_channels, ...)``; this wrapper
    takes ``(num_channels, num_groups, ...)`` and forwards the values to the
    parent in the order it expects.
    """

    def __init__(self, num_channels, num_groups, eps=1e-5, affine=True):
        # Parent order is (num_groups, num_channels, eps, affine).
        super(GroupNorm, self).__init__(num_groups, num_channels, eps, affine)
def change_default_args(**kwargs):
    """Return a class decorator that overrides constructor defaults.

    The decorated class is subclassed; the subclass' ``__init__`` injects each
    ``key=value`` from ``kwargs`` unless the caller already supplied that
    argument, either by keyword or positionally.

    Overrides that are not positional-or-keyword parameters of the wrapped
    ``__init__`` (keyword-only parameters, or names accepted through
    ``**kwargs``) can never be supplied positionally, so they are applied
    whenever the caller did not pass them by keyword.

    Example::

        Conv2d = change_default_args(bias=False)(nn.Conv2d)
    """

    def layer_wrapper(layer_class):
        # Index of every positional-or-keyword parameter of the wrapped
        # ``__init__``. ``self`` occupies index 0, so a parameter at index
        # ``p`` is covered by positional arguments iff ``p <= len(args)``.
        # Computed once per wrapped class instead of on every instantiation.
        signature = inspect.signature(layer_class.__init__)
        positional_names = (
            name
            for name, info in signature.parameters.items()
            if info.kind is info.POSITIONAL_OR_KEYWORD
        )
        kw_to_pos = {name: pos for pos, name in enumerate(positional_names)}

        class DefaultArgLayer(layer_class):
            def __init__(self, *args, **kw):
                for key, val in kwargs.items():
                    if key in kw:
                        continue  # caller passed it by keyword
                    pos = kw_to_pos.get(key)
                    if pos is None or pos > len(args):
                        kw[key] = val
                super().__init__(*args, **kw)

        return DefaultArgLayer

    return layer_wrapper
class AllReduce(Function):
    """Autograd function that sums a tensor over every process in the group.

    Both passes call ``torch.distributed`` collectives, so a process group
    must be initialized before this function is applied.
    """

    @staticmethod
    def forward(ctx, input):
        world_size = dist.get_world_size()
        gathered = [torch.zeros_like(input) for _ in range(world_size)]
        # Use allgather instead of allreduce since I don't trust in-place operations ..
        dist.all_gather(gathered, input, async_op=False)
        return torch.stack(gathered, dim=0).sum(dim=0)

    @staticmethod
    def backward(ctx, grad_output):
        # The incoming gradient is reduced in place and then handed back.
        dist.all_reduce(grad_output, async_op=False)
        return grad_output
def build_norm_layer(cfg, num_features, postfix=""):
    """Build a normalization layer from a config dict.

    Args:
        cfg (dict): must contain ``type`` (a key of ``norm_cfg``). May contain
            ``requires_grad`` (bool, default True), which is assigned to every
            parameter of the created layer. All remaining entries are passed
            to the layer constructor; ``eps`` defaults to 1e-5.
        num_features (int): number of channels of the input.
        postfix (int, str): appended to the norm abbreviation to form the
            returned layer name.

    Returns:
        name (str): abbreviation + postfix
        layer (nn.Module): created norm layer

    Raises:
        KeyError: if ``cfg["type"]`` is not registered in ``norm_cfg``.
        NotImplementedError: if the registered module is ``None``.
    """
    assert isinstance(cfg, dict) and "type" in cfg
    options = cfg.copy()  # never mutate the caller's config

    layer_type = options.pop("type")
    if layer_type not in norm_cfg:
        raise KeyError("Unrecognized norm type {}".format(layer_type))
    abbr, norm_layer = norm_cfg[layer_type]
    if norm_layer is None:
        raise NotImplementedError

    assert isinstance(postfix, (int, str))
    name = abbr + str(postfix)

    trainable = options.pop("requires_grad", True)
    options.setdefault("eps", 1e-5)

    if layer_type == "GN":
        # GroupNorm takes the channel count by keyword and needs num_groups.
        assert "num_groups" in options
        layer = norm_layer(num_channels=num_features, **options)
    else:
        layer = norm_layer(num_features, **options)

    for param in layer.parameters():
        param.requires_grad = trainable

    return name, layer
def kaiming_init(
    module, mode="fan_out", nonlinearity="relu", bias=0, distribution="normal"
):
    """Initialize ``module.weight`` with Kaiming init and fill its bias.

    Args:
        module: layer with a ``weight`` tensor and, optionally, a ``bias``.
        mode (str): forwarded to ``nn.init.kaiming_*_``.
        nonlinearity (str): forwarded to ``nn.init.kaiming_*_``.
        bias (float): constant written into ``module.bias`` when it exists.
        distribution (str): ``"uniform"`` or ``"normal"``.

    Layers created with ``bias=False`` expose ``bias`` as ``None``; those are
    skipped rather than passed to ``nn.init.constant_`` (which would fail).
    """
    assert distribution in ["uniform", "normal"]
    if distribution == "uniform":
        init_fn = nn.init.kaiming_uniform_
    else:
        init_fn = nn.init.kaiming_normal_
    init_fn(module.weight, mode=mode, nonlinearity=nonlinearity)

    # hasattr() alone is not enough: the attribute exists but is None when
    # the layer was built without a bias.
    if getattr(module, "bias", None) is not None:
        nn.init.constant_(module.bias, bias)
class DeformConvFunction(Function):
    """Autograd wrapper around the ``deform_conv_cuda`` extension kernels.

    Only CUDA tensors are supported; CPU inputs raise ``NotImplementedError``.
    """

    @staticmethod
    def forward(ctx,
                input,
                offset,
                weight,
                stride=1,
                padding=0,
                dilation=1,
                groups=1,
                deformable_groups=1,
                im2col_step=64):
        """Compute the deformable convolution of ``input``.

        Raises:
            ValueError: if ``input`` is not 4-D or the output would be empty.
            NotImplementedError: if ``input`` is not a CUDA tensor.
        """
        if input is not None and input.dim() != 4:
            raise ValueError(
                'Expected 4D tensor as input, got {}D tensor instead.'.format(
                    input.dim()))
        ctx.stride = _pair(stride)
        ctx.padding = _pair(padding)
        ctx.dilation = _pair(dilation)
        ctx.groups = groups
        ctx.deformable_groups = deformable_groups
        ctx.im2col_step = im2col_step

        ctx.save_for_backward(input, offset, weight)

        output = input.new_empty(
            DeformConvFunction._output_size(input, weight, ctx.padding,
                                            ctx.dilation, ctx.stride))

        ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones

        if not input.is_cuda:
            raise NotImplementedError

        # The kernel processes the batch in chunks of this many images.
        cur_im2col_step = min(ctx.im2col_step, input.shape[0])
        assert (input.shape[0] %
                cur_im2col_step) == 0, 'im2col step must divide batchsize'
        # The extension takes (width, height) ordered arguments, hence the
        # [1] before [0] indexing below.
        deform_conv_cuda.deform_conv_forward_cuda(
            input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1],
            weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0],
            ctx.padding[1], ctx.padding[0], ctx.dilation[1],
            ctx.dilation[0], ctx.groups, ctx.deformable_groups,
            cur_im2col_step)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Return gradients for ``input``, ``offset`` and ``weight``.

        One entry is returned per ``forward`` argument (nine in total); the
        six non-tensor arguments get ``None``.
        """
        input, offset, weight = ctx.saved_tensors

        grad_input = grad_offset = grad_weight = None

        if not grad_output.is_cuda:
            raise NotImplementedError

        cur_im2col_step = min(ctx.im2col_step, input.shape[0])
        assert (input.shape[0] %
                cur_im2col_step) == 0, 'im2col step must divide batchsize'

        if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
            # One kernel call produces both the input and offset gradients.
            grad_input = torch.zeros_like(input)
            grad_offset = torch.zeros_like(offset)
            deform_conv_cuda.deform_conv_backward_input_cuda(
                input, offset, grad_output, grad_input,
                grad_offset, weight, ctx.bufs_[0], weight.size(3),
                weight.size(2), ctx.stride[1], ctx.stride[0],
                ctx.padding[1], ctx.padding[0], ctx.dilation[1],
                ctx.dilation[0], ctx.groups, ctx.deformable_groups,
                cur_im2col_step)

        if ctx.needs_input_grad[2]:
            grad_weight = torch.zeros_like(weight)
            deform_conv_cuda.deform_conv_backward_parameters_cuda(
                input, offset, grad_output,
                grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3),
                weight.size(2), ctx.stride[1], ctx.stride[0],
                ctx.padding[1], ctx.padding[0], ctx.dilation[1],
                ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1,
                cur_im2col_step)

        # forward() takes 9 arguments. Returning only 8 values makes autograd
        # fail with "incorrect number of gradients" whenever im2col_step is
        # passed explicitly; surplus trailing Nones are accepted when it is
        # left at its default.
        return (grad_input, grad_offset, grad_weight, None, None, None, None,
                None, None)

    @staticmethod
    def _output_size(input, weight, padding, dilation, stride):
        """Return the output shape ``(N, C_out, *spatial)`` as a tuple.

        Raises:
            ValueError: if any dimension of the output would be <= 0.
        """
        channels = weight.size(0)
        output_size = (input.size(0), channels)
        for d in range(input.dim() - 2):
            in_size = input.size(d + 2)
            pad = padding[d]
            # Effective extent of the dilated kernel along this axis.
            kernel = dilation[d] * (weight.size(d + 2) - 1) + 1
            stride_ = stride[d]
            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
        if not all(s > 0 for s in output_size):
            raise ValueError(
                'convolution input is too small (output would be {})'.format(
                    'x'.join(map(str, output_size))))
        return output_size
ctx.deformable_groups = deformable_groups + ctx.with_bias = bias is not None + if not ctx.with_bias: + bias = input.new_empty(1) # fake tensor + if not input.is_cuda: + raise NotImplementedError + if weight.requires_grad or mask.requires_grad or offset.requires_grad \ + or input.requires_grad: + ctx.save_for_backward(input, offset, mask, weight, bias) + output = input.new_empty( + ModulatedDeformConvFunction._infer_shape(ctx, input, weight)) + ctx._bufs = [input.new_empty(0), input.new_empty(0)] + deform_conv_cuda.modulated_deform_conv_cuda_forward( + input, weight, bias, ctx._bufs[0], offset, mask, output, + ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride, + ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, + ctx.groups, ctx.deformable_groups, ctx.with_bias) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + if not grad_output.is_cuda: + raise NotImplementedError + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + grad_mask = torch.zeros_like(mask) + grad_weight = torch.zeros_like(weight) + grad_bias = torch.zeros_like(bias) + deform_conv_cuda.modulated_deform_conv_cuda_backward( + input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1], + grad_input, grad_weight, grad_bias, grad_offset, grad_mask, + grad_output, weight.shape[2], weight.shape[3], ctx.stride, + ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, + ctx.groups, ctx.deformable_groups, ctx.with_bias) + if not ctx.with_bias: + grad_bias = None + + return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, + None, None, None, None, None) + + @staticmethod + def _infer_shape(ctx, input, weight): + n = input.size(0) + channels_out = weight.size(0) + height, width = input.shape[2:4] + kernel_h, kernel_w = weight.shape[2:4] + height_out = (height + 2 * ctx.padding - + (ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1 + 
width_out = (width + 2 * ctx.padding - + (ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1 + return n, channels_out, height_out, width_out + + +deform_conv = DeformConvFunction.apply +modulated_deform_conv = ModulatedDeformConvFunction.apply + + +class DeformConv(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=False): + super(DeformConv, self).__init__() + + assert not bias + assert in_channels % groups == 0, \ + 'in_channels {} cannot be divisible by groups {}'.format( + in_channels, groups) + assert out_channels % groups == 0, \ + 'out_channels {} cannot be divisible by groups {}'.format( + out_channels, groups) + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deformable_groups = deformable_groups + # enable compatibility with nn.Conv2d + self.transposed = False + self.output_padding = _single(0) + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // self.groups, + *self.kernel_size)) + + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
class DeformConvPack(DeformConv):
    """A Deformable Conv Encapsulation that acts as normal Conv layers.

    The sampling offsets are predicted from the input by an internal
    ``conv_offset`` layer, so ``forward`` only takes ``x``.

    Args:
        in_channels (int): Same as nn.Conv2d.
        out_channels (int): Same as nn.Conv2d.
        kernel_size (int or tuple[int]): Same as nn.Conv2d.
        stride (int or tuple[int]): Same as nn.Conv2d.
        padding (int or tuple[int]): Same as nn.Conv2d.
        dilation (int or tuple[int]): Same as nn.Conv2d.
        groups (int): Same as nn.Conv2d.
        deformable_groups (int): Number of offset groups.
        bias (bool): Forwarded to ``DeformConv``.
    """

    # Checkpoint format version; see `_load_from_state_dict`.
    _version = 2

    def __init__(self, *args, **kwargs):
        super(DeformConvPack, self).__init__(*args, **kwargs)

        # Two offsets (one per spatial axis) for every kernel tap and
        # deformable group.
        self.conv_offset = nn.Conv2d(
            self.in_channels,
            self.deformable_groups * 2 * self.kernel_size[0] *
            self.kernel_size[1],
            kernel_size=self.kernel_size,
            stride=_pair(self.stride),
            padding=_pair(self.padding),
            # The offset map must have the same spatial size as the deform
            # conv output; that only holds in general when this conv uses
            # the same dilation. It does not change the weight shape, so
            # existing checkpoints still load.
            dilation=_pair(self.dilation),
            bias=True)
        self.init_offset()

    def init_offset(self):
        """Zero the offset predictor so it initially predicts zero offsets."""
        self.conv_offset.weight.data.zero_()
        self.conv_offset.bias.data.zero_()

    def forward(self, x):
        offset = self.conv_offset(x)
        return deform_conv(x, offset, self.weight, self.stride, self.padding,
                           self.dilation, self.groups, self.deformable_groups)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        """Rename legacy ``*_offset`` keys, then defer to ``nn.Module``."""
        version = local_metadata.get('version', None)

        if version is None or version < 2:
            # the key is different in early versions
            # In version < 2, DeformConvPack loads previous benchmark models.
            if (prefix + 'conv_offset.weight' not in state_dict
                    and prefix[:-1] + '_offset.weight' in state_dict):
                state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(
                    prefix[:-1] + '_offset.weight')
            if (prefix + 'conv_offset.bias' not in state_dict
                    and prefix[:-1] + '_offset.bias' in state_dict):
                state_dict[prefix +
                           'conv_offset.bias'] = state_dict.pop(prefix[:-1] +
                                                                '_offset.bias')

        # NOTE(review): the condition fires for checkpoints that are already
        # version 2, which does not match the message wording; kept as is.
        if version is not None and version > 1:
            # `print_log` is not defined in this module, so calling it raised
            # NameError for every version-2 checkpoint. Use the standard
            # logging module instead (imported locally because the module
            # does not import it at file level).
            import logging
            logging.getLogger(__name__).info(
                'DeformConvPack {} is upgraded to version 2.'.format(
                    prefix.rstrip('.')))

        super()._load_from_state_dict(state_dict, prefix, local_metadata,
                                      strict, missing_keys, unexpected_keys,
                                      error_msgs)
class ModulatedDeformConvPack(ModulatedDeformConv):
    """A ModulatedDeformable Conv Encapsulation that acts as normal Conv layers.

    The sampling offsets and the modulation mask are predicted from the input
    by an internal ``conv_offset`` layer, so ``forward`` only takes ``x``.

    Args:
        in_channels (int): Same as nn.Conv2d.
        out_channels (int): Same as nn.Conv2d.
        kernel_size (int or tuple[int]): Same as nn.Conv2d.
        stride (int or tuple[int]): Same as nn.Conv2d.
        padding (int or tuple[int]): Same as nn.Conv2d.
        dilation (int or tuple[int]): Same as nn.Conv2d.
        groups (int): Same as nn.Conv2d.
        deformable_groups (int): Number of offset groups.
        bias (bool): Whether the convolution has a learnable bias.
    """

    # Checkpoint format version; see `_load_from_state_dict`.
    _version = 2

    def __init__(self, *args, **kwargs):
        super(ModulatedDeformConvPack, self).__init__(*args, **kwargs)

        # Three channels per kernel tap and deformable group: two offsets
        # plus one mask value.
        self.conv_offset = nn.Conv2d(
            self.in_channels,
            self.deformable_groups * 3 * self.kernel_size[0] *
            self.kernel_size[1],
            kernel_size=self.kernel_size,
            stride=_pair(self.stride),
            padding=_pair(self.padding),
            # Keep the predicted offset/mask maps the same spatial size as
            # the deform conv output when dilation != 1. It does not change
            # the weight shape, so existing checkpoints still load.
            dilation=_pair(self.dilation),
            bias=True)
        self.init_offset()

    def init_offset(self):
        """Zero the predictor so it initially outputs zeros everywhere."""
        self.conv_offset.weight.data.zero_()
        self.conv_offset.bias.data.zero_()

    def forward(self, x):
        out = self.conv_offset(x)
        # First two thirds of the channels are offsets, the last third is
        # the mask, squashed to (0, 1).
        o1, o2, mask = torch.chunk(out, 3, dim=1)
        offset = torch.cat((o1, o2), dim=1)
        mask = torch.sigmoid(mask)
        return modulated_deform_conv(x, offset, mask, self.weight, self.bias,
                                     self.stride, self.padding, self.dilation,
                                     self.groups, self.deformable_groups)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        """Rename legacy ``*_offset`` keys, then defer to ``nn.Module``."""
        version = local_metadata.get('version', None)

        if version is None or version < 2:
            # the key is different in early versions
            # In version < 2, ModulatedDeformConvPack
            # loads previous benchmark models.
            if (prefix + 'conv_offset.weight' not in state_dict
                    and prefix[:-1] + '_offset.weight' in state_dict):
                state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(
                    prefix[:-1] + '_offset.weight')
            if (prefix + 'conv_offset.bias' not in state_dict
                    and prefix[:-1] + '_offset.bias' in state_dict):
                state_dict[prefix +
                           'conv_offset.bias'] = state_dict.pop(prefix[:-1] +
                                                                '_offset.bias')

        # NOTE(review): the condition fires for checkpoints that are already
        # version 2, which does not match the message wording; kept as is.
        if version is not None and version > 1:
            # `print_log` is not defined in this module, so calling it raised
            # NameError for every version-2 checkpoint. Use the standard
            # logging module instead (imported locally because the module
            # does not import it at file level).
            import logging
            logging.getLogger(__name__).info(
                'ModulatedDeformConvPack {} is upgraded to version 2.'.format(
                    prefix.rstrip('.')))

        super()._load_from_state_dict(state_dict, prefix, local_metadata,
                                      strict, missing_keys, unexpected_keys,
                                      error_msgs)
https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c + +#include +#include + +#include +#include + +void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset, + const int channels, const int height, const int width, + const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor data_col); + +void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset, + const int channels, const int height, const int width, + const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor grad_im); + +void deformable_col2im_coord( + const at::Tensor data_col, const at::Tensor data_im, + const at::Tensor data_offset, const int channels, const int height, + const int width, const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, at::Tensor grad_offset); + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, const at::Tensor data_offset, + const at::Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + at::Tensor data_col); + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, const at::Tensor data_offset, + const at::Tensor data_mask, const int batch_size, const int channels, + const int 
height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + at::Tensor grad_im); + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, const at::Tensor data_im, + const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, at::Tensor grad_offset, + at::Tensor grad_mask); + +void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, + at::Tensor weight, int kH, int kW, int dH, int dW, int padH, + int padW, int dilationH, int dilationW, int group, + int deformable_group) { + TORCH_CHECK(weight.ndimension() == 4, + "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " + "but got: %s", + weight.ndimension()); + + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + TORCH_CHECK(kW > 0 && kH > 0, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, + kW); + + TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW), + "kernel size should be consistent with weight, ", + "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, + kW, weight.size(2), weight.size(3)); + + TORCH_CHECK(dW > 0 && dH > 0, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + TORCH_CHECK( + dilationW > 0 && dilationH > 0, + "dilation should be greater than 0, but got dilationH: %d dilationW: %d", + dilationH, dilationW); + + int ndim = input.ndimension(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + 
dimw++; + } + + TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", + ndim); + + long nInputPlane = weight.size(1) * group; + long inputHeight = input.size(dimh); + long inputWidth = input.size(dimw); + long nOutputPlane = weight.size(0); + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + TORCH_CHECK(nInputPlane % deformable_group == 0, + "input channels must divide deformable group size"); + + if (outputWidth < 1 || outputHeight < 1) + AT_ERROR( + "Given input size: (%ld x %ld x %ld). " + "Calculated output size: (%ld x %ld x %ld). Output size is too small", + nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, + outputWidth); + + TORCH_CHECK(input.size(1) == nInputPlane, + "invalid number of input planes, expected: %d, but got: %d", + nInputPlane, input.size(1)); + + TORCH_CHECK((inputHeight >= kH && inputWidth >= kW), + "input image is smaller than kernel"); + + TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth), + "invalid spatial size of offset, expected height: %d width: %d, but " + "got height: %d width: %d", + outputHeight, outputWidth, offset.size(2), offset.size(3)); + + TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), + "invalid number of channels of offset"); + + if (gradOutput != NULL) { + TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane, + "invalid number of gradOutput planes, expected: %d, but got: %d", + nOutputPlane, gradOutput->size(dimf)); + + TORCH_CHECK((gradOutput->size(dimh) == outputHeight && + gradOutput->size(dimw) == outputWidth), + "invalid size of gradOutput, expected height: %d width: %d , but " + "got height: %d width: %d", + outputHeight, outputWidth, gradOutput->size(dimh), + gradOutput->size(dimw)); + } +} + +int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, + at::Tensor offset, at::Tensor output, + 
at::Tensor columns, at::Tensor ones, int kW, + int kH, int dW, int dH, int padW, int padH, + int dilationW, int dilationH, int group, + int deformable_group, int im2col_step) { + // todo: resize columns to include im2col: done + // todo: add im2col_step as input + // todo: add new output buffer and transpose it to output (or directly + // transpose output) todo: possibly change data indexing because of + // parallel_imgs + + shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, group, deformable_group); + at::DeviceGuard guard(input.device()); + + input = input.contiguous(); + offset = offset.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input.unsqueeze_(0); + offset.unsqueeze_(0); + } + + // todo: assert batchsize dividable by im2col_step + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, + outputHeight, outputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < outputHeight * outputWidth) { + ones = at::ones({outputHeight, outputWidth}, input.options()); + } + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + at::Tensor output_buffer = + at::zeros({batchSize / im2col_step, 
nOutputPlane, + im2col_step * outputHeight, outputWidth}, + output.options()); + + output_buffer = output_buffer.view( + {output_buffer.size(0), group, output_buffer.size(1) / group, + output_buffer.size(2), output_buffer.size(3)}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + output_buffer[elt][g] = output_buffer[elt][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output_buffer[elt][g]); + } + } + + output_buffer = output_buffer.view( + {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2), + output_buffer.size(3), output_buffer.size(4)}); + + output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step, outputHeight, outputWidth}); + output_buffer.transpose_(1, 2); + output.copy_(output_buffer); + output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + output = output.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, + at::Tensor gradOutput, at::Tensor gradInput, + at::Tensor gradOffset, at::Tensor weight, + at::Tensor columns, int kW, int kH, int dW, + int dH, int padW, int padH, int dilationW, + int dilationH, int group, + int 
deformable_group, int im2col_step) { + shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, group, deformable_group); + at::DeviceGuard guard(input.device()); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view({1, input.size(0), input.size(1), input.size(2)}); + offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)}); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + // change order of grad output + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, + outputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < 
batchSize / im2col_step; elt++) { + // divide into groups + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), group, gradOutput.size(1) / group, + gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)}); + + for (int g = 0; g < group; g++) { + columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + gradOutput[elt][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), + gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)}); + + deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane, + inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, im2col_step, deformable_group, + gradOffset[elt]); + + deformable_col2im(columns, offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, gradInput[elt]); + } + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + gradOffset = gradOffset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + gradOffset = + 
gradOffset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_parameters_cuda( + at::Tensor input, at::Tensor offset, at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, + int padW, int padH, int dilationW, int dilationH, int group, + int deformable_group, float scale, int im2col_step) { + // todo: transpose and reshape outGrad + // todo: reshape columns + // todo: add im2col_step as input + + shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH, + padW, dilationH, dilationW, group, deformable_group); + at::DeviceGuard guard(input.device()); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view( + at::IntList({1, input.size(0), input.size(1), input.size(2)})); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = gradWeight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + at::Tensor gradOutputBuffer = at::zeros_like(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step, + outputHeight, outputWidth}); + 
gradOutputBuffer.copy_(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step * outputHeight, outputWidth}); + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + // divide into group + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group, + gradOutputBuffer.size(2), gradOutputBuffer.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + gradWeight = + gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3)}); + + for (int g = 0; g < group; g++) { + gradWeight[g] = gradWeight[g] + .flatten(1) + .addmm_(gradOutputBuffer[elt][g].flatten(1), + columns[g].transpose(1, 0), 1.0, scale) + .view_as(gradWeight[g]); + } + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), + gradOutputBuffer.size(1) * gradOutputBuffer.size(2), + gradOutputBuffer.size(3), gradOutputBuffer.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3), + gradWeight.size(4)}); + } + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = 
gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + } + + return 1; +} + +void modulated_deform_conv_cuda_forward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, + int kernel_h, int kernel_w, const int stride_h, const int stride_w, + const int pad_h, const int pad_w, const int dilation_h, + const int dilation_w, const int group, const int deformable_group, + const bool with_bias) { + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + at::DeviceGuard guard(input.device()); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... 
+ ones = at::ones({height_out, width_out}, input.options()); + } + + // resize output + output = output.view({batch, channels_out, height_out, width_out}).zero_(); + // resize temporary columns + columns = + at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out}, + input.options()); + + output = output.view({output.size(0), group, output.size(1) / group, + output.size(2), output.size(3)}); + + for (int b = 0; b < batch; b++) { + modulated_deformable_im2col_cuda( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + // divide into group + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + + for (int g = 0; g < group; g++) { + output[b][g] = output[b][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output[b][g]); + } + + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + } + + output = output.view({output.size(0), output.size(1) * output.size(2), + output.size(3), output.size(4)}); + + if (with_bias) { + output += bias.view({1, bias.size(0), 1, 1}); + } +} + +void modulated_deform_conv_cuda_backward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor columns, + at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, + at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, + int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, + int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, + const bool with_bias) { + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + 
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + at::DeviceGuard guard(input.device()); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... 
+ ones = at::ones({height_out, width_out}, input.options()); + } + + grad_input = grad_input.view({batch, channels, height, width}); + columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out}, + input.options()); + + grad_output = + grad_output.view({grad_output.size(0), group, grad_output.size(1) / group, + grad_output.size(2), grad_output.size(3)}); + + for (int b = 0; b < batch; b++) { + // divide int group + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + grad_output[b][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda( + columns, input[b], offset[b], mask[b], 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, + stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b], + grad_mask[b]); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda( + columns, offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, grad_input[b]); + + // gradient w.r.t. 
weight, dWeight should accumulate across the batch and + // group + modulated_deformable_im2col_cuda( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + grad_weight = grad_weight.view({group, grad_weight.size(0) / group, + grad_weight.size(1), grad_weight.size(2), + grad_weight.size(3)}); + if (with_bias) + grad_bias = grad_bias.view({group, grad_bias.size(0) / group}); + + for (int g = 0; g < group; g++) { + grad_weight[g] = + grad_weight[g] + .flatten(1) + .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)) + .view_as(grad_weight[g]); + if (with_bias) { + grad_bias[g] = + grad_bias[g] + .view({-1, 1}) + .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})) + .view(-1); + } + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1), + grad_weight.size(2), grad_weight.size(3), + grad_weight.size(4)}); + if (with_bias) + grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)}); + } + grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1), + grad_output.size(2), grad_output.size(3), + grad_output.size(4)}); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("deform_conv_forward_cuda", &deform_conv_forward_cuda, + "deform forward (CUDA)"); + m.def("deform_conv_backward_input_cuda", &deform_conv_backward_input_cuda, + "deform_conv_backward_input (CUDA)"); + m.def("deform_conv_backward_parameters_cuda", + &deform_conv_backward_parameters_cuda, + "deform_conv_backward_parameters (CUDA)"); + m.def("modulated_deform_conv_cuda_forward", + &modulated_deform_conv_cuda_forward, + "modulated deform conv forward (CUDA)"); + m.def("modulated_deform_conv_cuda_backward", + 
&modulated_deform_conv_cuda_backward, + "modulated deform conv backward (CUDA)"); +} diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/ops/dcn/src/deform_conv_cuda_kernel.cu b/cv/3d_detection/centerpoint/pytorch/det3d/ops/dcn/src/deform_conv_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..e7a26f2e830846f80272bcd8c5ce0def34593c95 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/ops/dcn/src/deform_conv_cuda_kernel.cu @@ -0,0 +1,867 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
+ * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng + */ + +// modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu + +#include +#include +#include +#include +#include +#include + +using namespace at; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +const int kMaxGridNum = 65535; + +inline int GET_BLOCKS(const int N) +{ + return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS); +} + +template +__device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width, + const int height, const int width, scalar_t h, scalar_t w) +{ + + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, + const int h, const int w, const int height, const int width) +{ + + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int 
argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, + const int height, const int width, const scalar_t *im_data, + const int data_width, const int bp_dir) +{ + + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= 
height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + //const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * 
width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const scalar_t map_h = i * dilation_h + offset_h; + //const scalar_t map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +void deformable_im2col( + const at::Tensor data_im, const at::Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, const int ksize_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, at::Tensor data_col) +{ + // num_axes should be smaller than block size + // todo: check parallel_imgs is correctly passed in + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), "deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *data_col_ = data_col.data(); + + 
deformable_im2col_gpu_kernel<<>>( + num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + channel_per_deformable_group, parallel_imgs, channels, deformable_group, + height_col, width_col, data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in deformable_im2col: %s\n", cudaGetErrorString(err)); + } +} + +template +__global__ void deformable_col2im_gpu_kernel( + const int n, const scalar_t *data_col, const scalar_t *data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const 
scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +void deformable_col2im( + const at::Tensor data_col, const at::Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor grad_im) +{ + + // todo: make sure parallel_imgs is passed in correctly + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *grad_im_ = grad_im.data(); + + deformable_col2im_gpu_kernel<<>>( + num_kernels, data_col_, data_offset_, channels, height, width, 
ksize_h, + ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + parallel_imgs, deformable_group, height_col, width_col, grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in deformable_col2im: %s\n", cudaGetErrorString(err)); + } +} + +template +__global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col, + const scalar_t *data_im, const scalar_t *data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, scalar_t *grad_offset) +{ + CUDA_KERNEL_LOOP(index, n) + { + scalar_t val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * + batch_size * width_col * height_col; + const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * 
batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + const scalar_t weight = get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + +void deformable_col2im_coord( + const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, + const int channels, const int height, const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, at::Tensor grad_offset) +{ + + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs; + int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group; + + 
AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *grad_offset_ = grad_offset.data(); + + deformable_col2im_coord_gpu_kernel<<>>( + num_kernels, data_col_, data_im_, data_offset_, channels, height, width, + ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group, + height_col, width_col, grad_offset_); + })); +} + +template +__device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width, + const int height, const int width, scalar_t h, scalar_t w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int 
argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, + const int height, const int width, const scalar_t *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + 
weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + for (int 
i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + //data_col_ptr += height_col * width_col; + } + } + } +} + +template +__global__ void modulated_deformable_col2im_gpu_kernel(const int n, + const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + 
const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, 
cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, + const scalar_t *data_col, const scalar_t *data_im, + const scalar_t *data_offset, const scalar_t *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_offset, scalar_t *grad_mask) +{ + CUDA_KERNEL_LOOP(index, n) + { + scalar_t val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) 
* height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const scalar_t weight = dmcn_get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, + const 
int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, at::Tensor data_col) +{ + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *data_col_ = data_col.data(); + + modulated_deformable_im2col_gpu_kernel<<>>( + num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, at::Tensor grad_im) +{ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + 
data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *grad_im_ = grad_im.data(); + + modulated_deformable_col2im_gpu_kernel<<>>( + num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + at::Tensor grad_offset, at::Tensor grad_mask) +{ + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *grad_offset_ = grad_offset.data(); + scalar_t *grad_mask_ = grad_mask.data(); + + modulated_deformable_col2im_coord_gpu_kernel<<>>( + num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, 
height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset_, grad_mask_); + })); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + } +} diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/ops/dcn/src/deform_pool_cuda.cpp b/cv/3d_detection/centerpoint/pytorch/det3d/ops/dcn/src/deform_pool_cuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f6f087b884a037a07cb6dc90252d598bbd4178b6 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/ops/dcn/src/deform_pool_cuda.cpp @@ -0,0 +1,90 @@ +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c + +// based on +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu + +#include +#include + +#include +#include + +void DeformablePSROIPoolForward( + const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, + at::Tensor out, at::Tensor top_count, const int batch, const int channels, + const int height, const int width, const int num_bbox, + const int channels_trans, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std); + +void DeformablePSROIPoolBackwardAcc( + const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox, + const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, + at::Tensor trans_grad, const int batch, const int channels, + const int height, const int width, const int num_bbox, + const int channels_trans, const int no_trans, const float spatial_scale, + const int 
output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std); + +void deform_psroi_pooling_cuda_forward( + at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, + at::Tensor top_count, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std) { + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + at::DeviceGuard guard(input.device()); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + + const int num_bbox = bbox.size(0); + if (num_bbox != out.size(0)) + AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", + out.size(0), num_bbox); + + DeformablePSROIPoolForward( + input, bbox, trans, out, top_count, batch, channels, height, width, + num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, + pooled_size, part_size, sample_per_part, trans_std); +} + +void deform_psroi_pooling_cuda_backward( + at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, + at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, + const int no_trans, const float spatial_scale, const int output_dim, + const int group_size, const int pooled_size, const int part_size, + const int sample_per_part, const float trans_std) { + TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + at::DeviceGuard guard(input.device()); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 
2 : trans.size(1); + + const int num_bbox = bbox.size(0); + if (num_bbox != out_grad.size(0)) + AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", + out_grad.size(0), num_bbox); + + DeformablePSROIPoolBackwardAcc( + out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch, + channels, height, width, num_bbox, channels_trans, no_trans, + spatial_scale, output_dim, group_size, pooled_size, part_size, + sample_per_part, trans_std); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("deform_psroi_pooling_cuda_forward", &deform_psroi_pooling_cuda_forward, + "deform psroi pooling forward(CUDA)"); + m.def("deform_psroi_pooling_cuda_backward", + &deform_psroi_pooling_cuda_backward, + "deform psroi pooling backward(CUDA)"); +} diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/ops/dcn/src/deform_pool_cuda_kernel.cu b/cv/3d_detection/centerpoint/pytorch/det3d/ops/dcn/src/deform_pool_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..05b00d4be618353b404540469bf6118902651ca2 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/ops/dcn/src/deform_pool_cuda_kernel.cu @@ -0,0 +1,364 @@ +/*! 
// Block size that GET_BLOCKS divides the work count by.
const int CUDA_NUM_THREADS = 1024;

// Ceiling division of N work items by CUDA_NUM_THREADS: the number of
// CUDA_NUM_THREADS-sized blocks needed to cover N items.
inline int GET_BLOCKS(const int N)
{
  const int padded = N + CUDA_NUM_THREADS - 1;
  return padded / CUDA_NUM_THREADS;
}
CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); + scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); + + scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); + scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); + + int part_h = floor((scalar_t)(ph) / pooled_height * part_size); + int part_w = floor((scalar_t)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + scalar_t trans_y = no_trans ? 
(scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + + scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + scalar_t sum = 0; + int count = 0; + int gw = floor((scalar_t)(pw)*group_size / pooled_width); + int gh = floor((scalar_t)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + scalar_t w = wstart + iw * sub_bin_size_w; + scalar_t h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? 
(scalar_t)(0) : sum / count; + top_count[index] = count; + } +} + +template +__global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, + const scalar_t *top_diff, + const scalar_t *top_count, + const int num_rois, + const scalar_t spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff, + const scalar_t *bottom_data, + const scalar_t *bottom_rois, + const scalar_t *bottom_trans, + const int no_trans, + const scalar_t trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); + scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); + + scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); + scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); + + int part_h = floor((scalar_t)(ph) / pooled_height * part_size); + int part_w = floor((scalar_t)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + + scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + scalar_t diff_val = top_diff[index] / top_count[index]; + const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor((scalar_t)(pw)*group_size / pooled_width); + int gh = floor((scalar_t)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + scalar_t w = wstart + iw * sub_bin_size_w; + scalar_t h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + 
continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + scalar_t dist_x = w - x0, dist_y = h - y0; + scalar_t q00 = (1 - dist_x) * (1 - dist_y); + scalar_t q01 = (1 - dist_x) * dist_y; + scalar_t q10 = dist_x * (1 - dist_y); + scalar_t q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); + + if (no_trans) + { + continue; + } + scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); + } + } + } +} + +void DeformablePSROIPoolForward(const at::Tensor data, + const at::Tensor bbox, + const at::Tensor trans, + at::Tensor out, + at::Tensor top_count, + const int batch, + const int channels, + const int height, + const int width, + const int num_bbox, + 
// Host launcher for the backward pass of deformable PS-ROI pooling.
//
// Computes the flattened work size (one item per pooled output element:
// num_bbox * output_dim * pooled_size * pooled_size), derives the class
// layout of the offset tensor, and dispatches
// DeformablePSROIPoolBackwardAccKernel for the scalar type of `out_grad`.
//
// Outputs:
//   in_grad    - receives the gradient with respect to `data`.
//   trans_grad - receives the gradient with respect to `trans`; not
//                dereferenced when `no_trans` is non-zero (a null pointer is
//                passed to the kernel instead).
//
// A failed launch is only reported on stdout; no exception is raised.
void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad,
                                    const at::Tensor data,
                                    const at::Tensor bbox,
                                    const at::Tensor trans,
                                    const at::Tensor top_count,
                                    at::Tensor in_grad,
                                    at::Tensor trans_grad,
                                    const int batch,
                                    const int channels,
                                    const int height,
                                    const int width,
                                    const int num_bbox,
                                    const int channels_trans,
                                    const int no_trans,
                                    const float spatial_scale,
                                    const int output_dim,
                                    const int group_size,
                                    const int pooled_size,
                                    const int part_size,
                                    const int sample_per_part,
                                    const float trans_std)
{
  const int num_rois = num_bbox;
  const int pooled_height = pooled_size;
  const int pooled_width = pooled_size;
  const int count = num_bbox * output_dim * pooled_height * pooled_width;
  // The offset tensor stores two channels (x, y) per class; without offsets
  // everything is treated as a single class.
  const int num_classes = no_trans ? 1 : channels_trans / 2;
  const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      out_grad.scalar_type(), "deformable_psroi_pool_backward_acc", ([&] {
        const scalar_t *top_diff = out_grad.data<scalar_t>();
        const scalar_t *bottom_data = data.data<scalar_t>();
        const scalar_t *bottom_rois = bbox.data<scalar_t>();
        const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>();
        scalar_t *bottom_data_diff = in_grad.data<scalar_t>();
        scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data<scalar_t>();
        const scalar_t *top_count_data = top_count.data<scalar_t>();

        // NOTE(review): launch configuration reconstructed from the file's
        // GET_BLOCKS / CUDA_NUM_THREADS helpers - confirm against upstream.
        DeformablePSROIPoolBackwardAccKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
            count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width,
            pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff,
            bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part,
            group_size, part_size, num_classes, channels_each_class);
      }));

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    // Report under this function's own name (the message previously named
    // the forward launcher).
    printf("error in DeformablePSROIPoolBackwardAcc: %s\n", cudaGetErrorString(err));
  }
}
def boxes_iou_bev(boxes_a, boxes_b):
    """Compute the pairwise BEV IoU matrix via ``iou3d_nms_cuda.boxes_iou_bev_gpu``.

    Args:
        boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
        boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]

    Returns:
        ans_iou: (N, M) float32 tensor on the device of ``boxes_a``.
    """
    assert boxes_a.shape[1] == boxes_b.shape[1] == 7
    # Allocate next to the inputs instead of on the "current" CUDA device so
    # the extension writes into memory on the same GPU as the boxes.
    ans_iou = torch.zeros(
        (boxes_a.shape[0], boxes_b.shape[0]),
        dtype=torch.float32,
        device=boxes_a.device,
    )

    iou3d_nms_cuda.boxes_iou_bev_gpu(boxes_a.contiguous(), boxes_b.contiguous(), ans_iou)

    return ans_iou


def to_pcdet(boxes):
    """Return a copy of ``boxes`` converted to pcdet's box convention.

    Columns 3 and 4 (dx, dy) are swapped and the heading ``h`` in the last
    column becomes ``-h - pi/2``. The caller's tensor is not modified: the
    list index below produces a new tensor before the in-place heading update.
    """
    boxes = boxes[:, [0, 1, 2, 4, 3, 5, -1]]
    boxes[:, -1] = -boxes[:, -1] - np.pi / 2
    return boxes


def boxes_iou3d_gpu(boxes_a, boxes_b):
    """Compute the pairwise 3D IoU matrix between two sets of boxes.

    Both inputs are first converted with :func:`to_pcdet`; the BEV overlap
    comes from ``iou3d_nms_cuda.boxes_overlap_bev_gpu`` and is multiplied by
    the overlap of the vertical extents ``[z - dz/2, z + dz/2]``.

    Args:
        boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
        boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]

    Returns:
        ans_iou: (N, M)
    """
    assert boxes_a.shape[1] == boxes_b.shape[1] == 7

    # transform back to pcdet's coordinate
    boxes_a = to_pcdet(boxes_a)
    boxes_b = to_pcdet(boxes_b)

    # height overlap: column vector for a, row vector for b, so the
    # element-wise max/min below broadcast to (N, M)
    boxes_a_height_max = (boxes_a[:, 2] + boxes_a[:, 5] / 2).view(-1, 1)
    boxes_a_height_min = (boxes_a[:, 2] - boxes_a[:, 5] / 2).view(-1, 1)
    boxes_b_height_max = (boxes_b[:, 2] + boxes_b[:, 5] / 2).view(1, -1)
    boxes_b_height_min = (boxes_b[:, 2] - boxes_b[:, 5] / 2).view(1, -1)

    # bev overlap, allocated on the inputs' device (see boxes_iou_bev)
    overlaps_bev = torch.zeros(
        (boxes_a.shape[0], boxes_b.shape[0]),
        dtype=torch.float32,
        device=boxes_a.device,
    )  # (N, M)
    iou3d_nms_cuda.boxes_overlap_bev_gpu(boxes_a.contiguous(), boxes_b.contiguous(), overlaps_bev)

    max_of_min = torch.max(boxes_a_height_min, boxes_b_height_min)
    min_of_max = torch.min(boxes_a_height_max, boxes_b_height_max)
    overlaps_h = torch.clamp(min_of_max - max_of_min, min=0)

    # 3d iou
    overlaps_3d = overlaps_bev * overlaps_h

    vol_a = (boxes_a[:, 3] * boxes_a[:, 4] * boxes_a[:, 5]).view(-1, 1)
    vol_b = (boxes_b[:, 3] * boxes_b[:, 4] * boxes_b[:, 5]).view(1, -1)

    # clamp the union so degenerate (zero-volume) boxes do not divide by zero
    iou3d = overlaps_3d / torch.clamp(vol_a + vol_b - overlaps_3d, min=1e-6)

    return iou3d


def nms_gpu(boxes, scores, thresh, pre_maxsize=None, **kwargs):
    """Run ``iou3d_nms_cuda.nms_gpu`` on boxes sorted by descending score.

    :param boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
    :param scores: (N)
    :param thresh: overlap threshold forwarded to the extension
    :param pre_maxsize: if given, only the top ``pre_maxsize`` scoring boxes
        are considered
    :return: (indices into the original ``boxes`` of the kept boxes, None)
    """
    assert boxes.shape[1] == 7
    order = scores.sort(0, descending=True)[1]
    if pre_maxsize is not None:
        order = order[:pre_maxsize]

    boxes = boxes[order].contiguous()
    # Host-side int64 buffer that the extension fills with kept positions.
    keep = torch.empty(boxes.size(0), dtype=torch.long)
    num_out = iou3d_nms_cuda.nms_gpu(boxes, keep, thresh)
    # Move the kept positions to the device `order` lives on (not merely the
    # current CUDA device) before indexing.
    return order[keep[:num_out].to(order.device)].contiguous(), None


def nms_normal_gpu(boxes, scores, thresh, **kwargs):
    """Run ``iou3d_nms_cuda.nms_normal_gpu`` on boxes sorted by descending score.

    :param boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
    :param scores: (N)
    :param thresh: overlap threshold forwarded to the extension
    :return: (indices into the original ``boxes`` of the kept boxes, None)
    """
    assert boxes.shape[1] == 7
    order = scores.sort(0, descending=True)[1]

    boxes = boxes[order].contiguous()

    # Host-side int64 buffer that the extension fills with kept positions.
    keep = torch.empty(boxes.size(0), dtype=torch.long)
    num_out = iou3d_nms_cuda.nms_normal_gpu(boxes, keep, thresh)
    return order[keep[:num_out].to(order.device)].contiguous(), None
['-g', '-I /usr/local/cuda/include'], + 'nvcc': ['-O2']}) + ], + cmdclass={'build_ext': BuildExtension}) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_cpu.cpp b/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d864ca587d11c92f89a3df151402c45139d8766c --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_cpu.cpp @@ -0,0 +1,252 @@ +/* +3D Rotated IoU Calculation (CPU) +Written by Shaoshuai Shi +All Rights Reserved 2020. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include "iou3d_cpu.h" + +#define CHECK_CUDA(x) do { \ + if (!x.type().is_cuda()) { \ + fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \ + exit(-1); \ + } \ +} while (0) +#define CHECK_CONTIGUOUS(x) do { \ + if (!x.is_contiguous()) { \ + fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \ + exit(-1); \ + } \ +} while (0) +#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x) + +inline float min(float a, float b){ + return a > b ? b : a; +} + +inline float max(float a, float b){ + return a > b ? 
a : b; +} + +const float EPS = 1e-8; +struct Point { + float x, y; + __device__ Point() {} + __device__ Point(float _x, float _y){ + x = _x, y = _y; + } + + __device__ void set(float _x, float _y){ + x = _x; y = _y; + } + + __device__ Point operator +(const Point &b)const{ + return Point(x + b.x, y + b.y); + } + + __device__ Point operator -(const Point &b)const{ + return Point(x - b.x, y - b.y); + } +}; + +inline float cross(const Point &a, const Point &b){ + return a.x * b.y - a.y * b.x; +} + +inline float cross(const Point &p1, const Point &p2, const Point &p0){ + return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); +} + +inline int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2){ + int ret = min(p1.x,p2.x) <= max(q1.x,q2.x) && + min(q1.x,q2.x) <= max(p1.x,p2.x) && + min(p1.y,p2.y) <= max(q1.y,q2.y) && + min(q1.y,q2.y) <= max(p1.y,p2.y); + return ret; +} + +inline int check_in_box2d(const float *box, const Point &p){ + //params: (7) [x, y, z, dx, dy, dz, heading] + const float MARGIN = 1e-2; + + float center_x = box[0], center_y = box[1]; + float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]); // rotate the point in the opposite direction of box + float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); + float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; + + return (fabs(rot_x) < box[3] / 2 + MARGIN && fabs(rot_y) < box[4] / 2 + MARGIN); +} + +inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans){ + // fast exclusion + if (check_rect_cross(p0, p1, q0, q1) == 0) return 0; + + // check cross standing + float s1 = cross(q0, p1, p0); + float s2 = cross(p1, q1, p0); + float s3 = cross(p0, q1, q0); + float s4 = cross(q1, p1, q0); + + if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0; + + // calculate intersection of two lines + float s5 = cross(q1, p1, p0); + if(fabs(s5 - s1) > EPS){ + ans.x = (s5 * q0.x - s1 * q1.x) / (s5 
- s1); + ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1); + + } + else{ + float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y; + float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y; + float D = a0 * b1 - a1 * b0; + + ans.x = (b0 * c1 - b1 * c0) / D; + ans.y = (a1 * c0 - a0 * c1) / D; + } + + return 1; +} + +inline void rotate_around_center(const Point ¢er, const float angle_cos, const float angle_sin, Point &p){ + float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x; + float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; + p.set(new_x, new_y); +} + +inline int point_cmp(const Point &a, const Point &b, const Point ¢er){ + return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x); +} + +inline float box_overlap(const float *box_a, const float *box_b){ + // params: box_a (7) [x, y, z, dx, dy, dz, heading] + // params: box_b (7) [x, y, z, dx, dy, dz, heading] + +// float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3], a_angle = box_a[4]; +// float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3], b_angle = box_b[4]; + float a_angle = box_a[6], b_angle = box_b[6]; + float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2; + float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half; + float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half; + float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half; + float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half; + + Point center_a(box_a[0], box_a[1]); + Point center_b(box_b[0], box_b[1]); + + Point box_a_corners[5]; + box_a_corners[0].set(a_x1, a_y1); + box_a_corners[1].set(a_x2, a_y1); + box_a_corners[2].set(a_x2, a_y2); + box_a_corners[3].set(a_x1, a_y2); + + Point box_b_corners[5]; + box_b_corners[0].set(b_x1, b_y1); + box_b_corners[1].set(b_x2, b_y1); + 
box_b_corners[2].set(b_x2, b_y2); + box_b_corners[3].set(b_x1, b_y2); + + // get oriented corners + float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); + float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); + + for (int k = 0; k < 4; k++){ + rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); + rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); + } + + box_a_corners[4] = box_a_corners[0]; + box_b_corners[4] = box_b_corners[0]; + + // get intersection of lines + Point cross_points[16]; + Point poly_center; + int cnt = 0, flag = 0; + + poly_center.set(0, 0); + for (int i = 0; i < 4; i++){ + for (int j = 0; j < 4; j++){ + flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]); + if (flag){ + poly_center = poly_center + cross_points[cnt]; + cnt++; + } + } + } + + // check corners + for (int k = 0; k < 4; k++){ + if (check_in_box2d(box_a, box_b_corners[k])){ + poly_center = poly_center + box_b_corners[k]; + cross_points[cnt] = box_b_corners[k]; + cnt++; + } + if (check_in_box2d(box_b, box_a_corners[k])){ + poly_center = poly_center + box_a_corners[k]; + cross_points[cnt] = box_a_corners[k]; + cnt++; + } + } + + poly_center.x /= cnt; + poly_center.y /= cnt; + + // sort the points of polygon + Point temp; + for (int j = 0; j < cnt - 1; j++){ + for (int i = 0; i < cnt - j - 1; i++){ + if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)){ + temp = cross_points[i]; + cross_points[i] = cross_points[i + 1]; + cross_points[i + 1] = temp; + } + } + } + + // get the overlap areas + float area = 0; + for (int k = 0; k < cnt - 1; k++){ + area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]); + } + + return fabs(area) / 2.0; +} + +inline float iou_bev(const float *box_a, const float *box_b){ + // params: box_a (7) [x, y, z, dx, dy, dz, heading] + // params: box_b (7) [x, y, z, dx, dy, dz, heading] + float 
sa = box_a[3] * box_a[4]; + float sb = box_b[3] * box_b[4]; + float s_overlap = box_overlap(box_a, box_b); + return s_overlap / fmaxf(sa + sb - s_overlap, EPS); +} + + +int boxes_iou_bev_cpu(at::Tensor boxes_a_tensor, at::Tensor boxes_b_tensor, at::Tensor ans_iou_tensor){ + // params boxes_a_tensor: (N, 7) [x, y, z, dx, dy, dz, heading] + // params boxes_b_tensor: (M, 7) [x, y, z, dx, dy, dz, heading] + // params ans_iou_tensor: (N, M) + + CHECK_CONTIGUOUS(boxes_a_tensor); + CHECK_CONTIGUOUS(boxes_b_tensor); + + int num_boxes_a = boxes_a_tensor.size(0); + int num_boxes_b = boxes_b_tensor.size(0); + const float *boxes_a = boxes_a_tensor.data(); + const float *boxes_b = boxes_b_tensor.data(); + float *ans_iou = ans_iou_tensor.data(); + + for (int i = 0; i < num_boxes_a; i++){ + for (int j = 0; j < num_boxes_b; j++){ + ans_iou[i * num_boxes_b + j] = iou_bev(boxes_a + i * 7, boxes_b + j * 7); + } + } + return 1; +} diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_cpu.h b/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..8835ee7049c6f71866ed9f0f1bb019933a63d1cd --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_cpu.h @@ -0,0 +1,11 @@ +#ifndef IOU3D_CPU_H +#define IOU3D_CPU_H + +#include +#include +#include +#include + +int boxes_iou_bev_cpu(at::Tensor boxes_a_tensor, at::Tensor boxes_b_tensor, at::Tensor ans_iou_tensor); + +#endif diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_nms.cpp b/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_nms.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d41da8ad078431cc56d71f1a56f891480ffec07f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_nms.cpp @@ -0,0 +1,188 @@ +/* +3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others) +Written by Shaoshuai Shi +All 
Rights Reserved 2019-2020. +*/ + +#include +#include +#include +#include +#include +#include "iou3d_nms.h" + +#define CHECK_CUDA(x) do { \ + if (!x.type().is_cuda()) { \ + fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \ + exit(-1); \ + } \ +} while (0) +#define CHECK_CONTIGUOUS(x) do { \ + if (!x.is_contiguous()) { \ + fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \ + exit(-1); \ + } \ +} while (0) +#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x) + +#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) + +#define CHECK_ERROR(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; + + +void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap); +void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou); +void nmsLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh); +void nmsNormalLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh); + + +int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_overlap){ + // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] + // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] + // params ans_overlap: (N, M) + + CHECK_INPUT(boxes_a); + CHECK_INPUT(boxes_b); + CHECK_INPUT(ans_overlap); + + int num_a = boxes_a.size(0); + int num_b = boxes_b.size(0); + + const float * boxes_a_data = boxes_a.data(); + const float * boxes_b_data = boxes_b.data(); + float * ans_overlap_data = ans_overlap.data(); + + boxesoverlapLauncher(num_a, 
boxes_a_data, num_b, boxes_b_data, ans_overlap_data); + + return 1; +} + +int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_iou){ + // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] + // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] + // params ans_overlap: (N, M) + CHECK_INPUT(boxes_a); + CHECK_INPUT(boxes_b); + CHECK_INPUT(ans_iou); + + int num_a = boxes_a.size(0); + int num_b = boxes_b.size(0); + + const float * boxes_a_data = boxes_a.data(); + const float * boxes_b_data = boxes_b.data(); + float * ans_iou_data = ans_iou.data(); + + boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data); + + return 1; +} + +int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){ + // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading] + // params keep: (N) + CHECK_INPUT(boxes); + CHECK_CONTIGUOUS(keep); + + int boxes_num = boxes.size(0); + const float * boxes_data = boxes.data(); + long * keep_data = keep.data(); + + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + + unsigned long long *mask_data = NULL; + CHECK_ERROR(cudaMalloc((void**)&mask_data, boxes_num * col_blocks * sizeof(unsigned long long))); + nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh); + + // unsigned long long mask_cpu[boxes_num * col_blocks]; + // unsigned long long *mask_cpu = new unsigned long long [boxes_num * col_blocks]; + std::vector mask_cpu(boxes_num * col_blocks); + +// printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks); + CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(unsigned long long), + cudaMemcpyDeviceToHost)); + + cudaFree(mask_data); + + unsigned long long remv_cpu[col_blocks]; + memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long)); + + int num_to_keep = 0; + + for (int i = 0; i < boxes_num; i++){ + int nblock = i / THREADS_PER_BLOCK_NMS; + int inblock = i % THREADS_PER_BLOCK_NMS; + + if (!(remv_cpu[nblock] & (1ULL << 
inblock))){ + keep_data[num_to_keep++] = i; + unsigned long long *p = &mask_cpu[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++){ + remv_cpu[j] |= p[j]; + } + } + } + if ( cudaSuccess != cudaGetLastError() ) printf( "Error!\n" ); + + return num_to_keep; +} + + +int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){ + // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading] + // params keep: (N) + + CHECK_INPUT(boxes); + CHECK_CONTIGUOUS(keep); + + int boxes_num = boxes.size(0); + const float * boxes_data = boxes.data(); + long * keep_data = keep.data(); + + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + + unsigned long long *mask_data = NULL; + CHECK_ERROR(cudaMalloc((void**)&mask_data, boxes_num * col_blocks * sizeof(unsigned long long))); + nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh); + + // unsigned long long mask_cpu[boxes_num * col_blocks]; + // unsigned long long *mask_cpu = new unsigned long long [boxes_num * col_blocks]; + std::vector mask_cpu(boxes_num * col_blocks); + +// printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks); + CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(unsigned long long), + cudaMemcpyDeviceToHost)); + + cudaFree(mask_data); + + unsigned long long remv_cpu[col_blocks]; + memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long)); + + int num_to_keep = 0; + + for (int i = 0; i < boxes_num; i++){ + int nblock = i / THREADS_PER_BLOCK_NMS; + int inblock = i % THREADS_PER_BLOCK_NMS; + + if (!(remv_cpu[nblock] & (1ULL << inblock))){ + keep_data[num_to_keep++] = i; + unsigned long long *p = &mask_cpu[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++){ + remv_cpu[j] |= p[j]; + } + } + } + if ( cudaSuccess != cudaGetLastError() ) printf( "Error!\n" ); + + return num_to_keep; +} + + diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_nms.h 
b/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_nms.h new file mode 100644 index 0000000000000000000000000000000000000000..aa7ae0edf22034c798778277c5b8aa6701906a9d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_nms.h @@ -0,0 +1,14 @@ +#ifndef IOU3D_NMS_H +#define IOU3D_NMS_H + +#include +#include +#include +#include + +int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_overlap); +int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_iou); +int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh); +int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh); + +#endif diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_nms_api.cpp b/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_nms_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5a2d3a37fe72c299ab68258c0d31553c9b76ad02 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_nms_api.cpp @@ -0,0 +1,17 @@ +#include +#include +#include +#include +#include + +#include "iou3d_cpu.h" +#include "iou3d_nms.h" + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu, "oriented boxes overlap"); + m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou"); + m.def("nms_gpu", &nms_gpu, "oriented nms gpu"); + m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu"); + m.def("boxes_iou_bev_cpu", &boxes_iou_bev_cpu, "oriented boxes iou"); +} diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_nms_kernel.cu b/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_nms_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..87868823a087654fcf305093a55d89e0a2a7d9cf --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/ops/iou3d_nms/src/iou3d_nms_kernel.cu @@ -0,0 
+1,414 @@ +/* +3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others) +Written by Shaoshuai Shi +All Rights Reserved 2019-2020. +*/ + + +#include +#define THREADS_PER_BLOCK 16 +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +// #define DEBUG +const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; +const float EPS = 1e-8; +struct Point { + float x, y; + __device__ Point() {} + __device__ Point(float _x, float _y){ + x = _x, y = _y; + } + + __device__ void set(float _x, float _y){ + x = _x; y = _y; + } + + __device__ Point operator +(const Point &b)const{ + return Point(x + b.x, y + b.y); + } + + __device__ Point operator -(const Point &b)const{ + return Point(x - b.x, y - b.y); + } +}; + +__device__ inline float cross(const Point &a, const Point &b){ + return a.x * b.y - a.y * b.x; +} + +__device__ inline float cross(const Point &p1, const Point &p2, const Point &p0){ + return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); +} + +__device__ int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2){ + int ret = min(p1.x,p2.x) <= max(q1.x,q2.x) && + min(q1.x,q2.x) <= max(p1.x,p2.x) && + min(p1.y,p2.y) <= max(q1.y,q2.y) && + min(q1.y,q2.y) <= max(p1.y,p2.y); + return ret; +} + +__device__ inline int check_in_box2d(const float *box, const Point &p){ + //params: (7) [x, y, z, dx, dy, dz, heading] + const float MARGIN = 1e-2; + + float center_x = box[0], center_y = box[1]; + float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]); // rotate the point in the opposite direction of box + float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); + float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; + + return (fabs(rot_x) < box[3] / 2 + MARGIN && fabs(rot_y) < box[4] / 2 + MARGIN); +} + +__device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans){ + // fast exclusion + if (check_rect_cross(p0, p1, 
q0, q1) == 0) return 0; + + // check cross standing + float s1 = cross(q0, p1, p0); + float s2 = cross(p1, q1, p0); + float s3 = cross(p0, q1, q0); + float s4 = cross(q1, p1, q0); + + if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0; + + // calculate intersection of two lines + float s5 = cross(q1, p1, p0); + if(fabs(s5 - s1) > EPS){ + ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1); + ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1); + + } + else{ + float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y; + float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y; + float D = a0 * b1 - a1 * b0; + + ans.x = (b0 * c1 - b1 * c0) / D; + ans.y = (a1 * c0 - a0 * c1) / D; + } + + return 1; +} + +__device__ inline void rotate_around_center(const Point ¢er, const float angle_cos, const float angle_sin, Point &p){ + float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x; + float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; + p.set(new_x, new_y); +} + +__device__ inline int point_cmp(const Point &a, const Point &b, const Point ¢er){ + return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x); +} + +__device__ inline float box_overlap(const float *box_a, const float *box_b){ + // params box_a: [x, y, z, dx, dy, dz, heading] + // params box_b: [x, y, z, dx, dy, dz, heading] + + float a_angle = box_a[6], b_angle = box_b[6]; + float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2; + float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half; + float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half; + float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half; + float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half; + + Point center_a(box_a[0], box_a[1]); + Point center_b(box_b[0], box_b[1]); + +#ifdef DEBUG + printf("a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", 
a_x1, a_y1, a_x2, a_y2, a_angle, + b_x1, b_y1, b_x2, b_y2, b_angle); + printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y, center_b.x, center_b.y); +#endif + + Point box_a_corners[5]; + box_a_corners[0].set(a_x1, a_y1); + box_a_corners[1].set(a_x2, a_y1); + box_a_corners[2].set(a_x2, a_y2); + box_a_corners[3].set(a_x1, a_y2); + + Point box_b_corners[5]; + box_b_corners[0].set(b_x1, b_y1); + box_b_corners[1].set(b_x2, b_y1); + box_b_corners[2].set(b_x2, b_y2); + box_b_corners[3].set(b_x1, b_y2); + + // get oriented corners + float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); + float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); + + for (int k = 0; k < 4; k++){ +#ifdef DEBUG + printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y); +#endif + rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); + rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); +#ifdef DEBUG + printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y); +#endif + } + + box_a_corners[4] = box_a_corners[0]; + box_b_corners[4] = box_b_corners[0]; + + // get intersection of lines + Point cross_points[16]; + Point poly_center; + int cnt = 0, flag = 0; + + poly_center.set(0, 0); + for (int i = 0; i < 4; i++){ + for (int j = 0; j < 4; j++){ + flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]); + if (flag){ + poly_center = poly_center + cross_points[cnt]; + cnt++; +#ifdef DEBUG + printf("Cross points (%.3f, %.3f): a(%.3f, %.3f)->(%.3f, %.3f), b(%.3f, %.3f)->(%.3f, %.3f) \n", + cross_points[cnt - 1].x, cross_points[cnt - 1].y, + box_a_corners[i].x, box_a_corners[i].y, box_a_corners[i + 1].x, box_a_corners[i + 1].y, + box_b_corners[i].x, box_b_corners[i].y, box_b_corners[i + 1].x, 
box_b_corners[i + 1].y); +#endif + } + } + } + + // check corners + for (int k = 0; k < 4; k++){ + if (check_in_box2d(box_a, box_b_corners[k])){ + poly_center = poly_center + box_b_corners[k]; + cross_points[cnt] = box_b_corners[k]; + cnt++; +#ifdef DEBUG + printf("b corners in a: corner_b(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y); +#endif + } + if (check_in_box2d(box_b, box_a_corners[k])){ + poly_center = poly_center + box_a_corners[k]; + cross_points[cnt] = box_a_corners[k]; + cnt++; +#ifdef DEBUG + printf("a corners in b: corner_a(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y); +#endif + } + } + + poly_center.x /= cnt; + poly_center.y /= cnt; + + // sort the points of polygon + Point temp; + for (int j = 0; j < cnt - 1; j++){ + for (int i = 0; i < cnt - j - 1; i++){ + if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)){ + temp = cross_points[i]; + cross_points[i] = cross_points[i + 1]; + cross_points[i + 1] = temp; + } + } + } + +#ifdef DEBUG + printf("cnt=%d\n", cnt); + for (int i = 0; i < cnt; i++){ + printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x, cross_points[i].y); + } +#endif + + // get the overlap areas + float area = 0; + for (int k = 0; k < cnt - 1; k++){ + area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]); + } + + return fabs(area) / 2.0; +} + +__device__ inline float iou_bev(const float *box_a, const float *box_b){ + // params box_a: [x, y, z, dx, dy, dz, heading] + // params box_b: [x, y, z, dx, dy, dz, heading] + float sa = box_a[3] * box_a[4]; + float sb = box_b[3] * box_b[4]; + float s_overlap = box_overlap(box_a, box_b); + return s_overlap / fmaxf(sa + sb - s_overlap, EPS); +} + +__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){ + // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] + // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] + const 
int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + + if (a_idx >= num_a || b_idx >= num_b){ + return; + } + const float * cur_box_a = boxes_a + a_idx * 7; + const float * cur_box_b = boxes_b + b_idx * 7; + float s_overlap = box_overlap(cur_box_a, cur_box_b); + ans_overlap[a_idx * num_b + b_idx] = s_overlap; +} + +__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){ + // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] + // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] + const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + + if (a_idx >= num_a || b_idx >= num_b){ + return; + } + + const float * cur_box_a = boxes_a + a_idx * 7; + const float * cur_box_b = boxes_b + b_idx * 7; + float cur_iou_bev = iou_bev(cur_box_a, cur_box_b); + ans_iou[a_idx * num_b + b_idx] = cur_iou_bev; +} + +__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh, + const float *boxes, unsigned long long *mask){ + //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] + //params: mask (N, N/THREADS_PER_BLOCK_NMS) + + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; + block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; + block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; + 
block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; + block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; + block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; + block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 7; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh){ + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + mask[cur_box_idx * col_blocks + col_start] = t; + } +} + + +__device__ inline float iou_normal(float const * const a, float const * const b) { + //params: a: [x, y, z, dx, dy, dz, heading] + //params: b: [x, y, z, dx, dy, dz, heading] + + float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2); + float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2); + float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); + float interS = width * height; + float Sa = a[3] * a[4]; + float Sb = b[3] * b[4]; + return interS / fmaxf(Sa + Sb - interS, EPS); +} + + +__global__ void nms_normal_kernel(const int boxes_num, const float nms_overlap_thresh, + const float *boxes, unsigned long long *mask){ + //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] + //params: mask (N, N/THREADS_PER_BLOCK_NMS) + + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * 
THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; + block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; + block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; + block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; + block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; + block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; + block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 7; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh){ + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + mask[cur_box_idx * col_blocks + col_start] = t; + } +} + + + + + +void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){ + + dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK); + + boxes_overlap_kernel<<>>(num_a, boxes_a, num_b, boxes_b, ans_overlap); +#ifdef DEBUG + cudaDeviceSynchronize(); // for using printf in kernel function +#endif 
+} + +void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){ + + dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK); + + boxes_iou_bev_kernel<<>>(num_a, boxes_a, num_b, boxes_b, ans_iou); +#ifdef DEBUG + cudaDeviceSynchronize(); // for using printf in kernel function +#endif +} + + +void nmsLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh){ + dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), + DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); + dim3 threads(THREADS_PER_BLOCK_NMS); + nms_kernel<<>>(boxes_num, nms_overlap_thresh, boxes, mask); +} + + +void nmsNormalLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh){ + dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), + DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); + dim3 threads(THREADS_PER_BLOCK_NMS); + nms_normal_kernel<<>>(boxes_num, nms_overlap_thresh, boxes, mask); +} diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/ops/point_cloud/__init__.py b/cv/3d_detection/centerpoint/pytorch/det3d/ops/point_cloud/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/ops/point_cloud/bev_ops.py b/cv/3d_detection/centerpoint/pytorch/det3d/ops/point_cloud/bev_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c56d7fa3286c9dd5a4028b4833594b0b9ced645e --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/ops/point_cloud/bev_ops.py @@ -0,0 +1,117 @@ +import math + +import numba +import numpy as np + + +@numba.jit(nopython=True) +def _points_to_bevmap_reverse_kernel( + points, + voxel_size, + coors_range, + coor_to_voxelidx, + # coors_2d, + bev_map, + height_lowers, + # density_norm_num=16, + 
with_reflectivity=False, + max_voxels=40000, +): + # put all computations to one loop. + # we shouldn't create large array in main jit code, otherwise + # reduce performance + N = points.shape[0] + ndim = points.shape[1] - 1 + # ndim = 3 + ndim_minus_1 = ndim - 1 + grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size + # np.round(grid_size) + # grid_size = np.round(grid_size).astype(np.int64)(np.int32) + grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) + height_slice_size = voxel_size[-1] + coor = np.zeros(shape=(3,), dtype=np.int32) # DHW + voxel_num = 0 + failed = False + for i in range(N): + failed = False + for j in range(ndim): + c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) + if c < 0 or c >= grid_size[j]: + failed = True + break + coor[ndim_minus_1 - j] = c + if failed: + continue + voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] + if voxelidx == -1: + voxelidx = voxel_num + if voxel_num >= max_voxels: + break + voxel_num += 1 + coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx + # coors_2d[voxelidx] = coor[1:] + bev_map[-1, coor[1], coor[2]] += 1 + height_norm = bev_map[coor[0], coor[1], coor[2]] + incomimg_height_norm = ( + points[i, 2] - height_lowers[coor[0]] + ) / height_slice_size + if incomimg_height_norm > height_norm: + bev_map[coor[0], coor[1], coor[2]] = incomimg_height_norm + if with_reflectivity: + bev_map[-2, coor[1], coor[2]] = points[i, 3] + # return voxel_num + + +def points_to_bev( + points, + voxel_size, + coors_range, + with_reflectivity=False, + density_norm_num=16, + max_voxels=40000, +): + """convert kitti points(N, 4) to a bev map. return [C, H, W] map. + this function based on algorithm in points_to_voxel. + takes 5ms in a reduced pointcloud with voxel_size=[0.1, 0.1, 0.8] + + Args: + points: [N, ndim] float tensor. points[:, :3] contain xyz points and + points[:, 3] contain reflectivity. + voxel_size: [3] list/tuple or array, float. 
xyz, indicate voxel size + coors_range: [6] list/tuple or array, float. indicate voxel range. + format: xyzxyz, minmax + with_reflectivity: bool. if True, will add a intensity map to bev map. + Returns: + bev_map: [num_height_maps + 1(2), H, W] float tensor. + `WARNING`: bev_map[-1] is num_points map, NOT density map, + because calculate density map need more time in cpu rather than gpu. + if with_reflectivity is True, bev_map[-2] is intensity map. + """ + if not isinstance(voxel_size, np.ndarray): + voxel_size = np.array(voxel_size, dtype=points.dtype) + if not isinstance(coors_range, np.ndarray): + coors_range = np.array(coors_range, dtype=points.dtype) + voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size + voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist()) + voxelmap_shape = voxelmap_shape[::-1] # DHW format + coor_to_voxelidx = -np.ones(shape=voxelmap_shape, dtype=np.int32) + # coors_2d = np.zeros(shape=(max_voxels, 2), dtype=np.int32) + bev_map_shape = list(voxelmap_shape) + bev_map_shape[0] += 1 + height_lowers = np.linspace( + coors_range[2], coors_range[5], voxelmap_shape[0], endpoint=False + ) + if with_reflectivity: + bev_map_shape[0] += 1 + bev_map = np.zeros(shape=bev_map_shape, dtype=points.dtype) + _points_to_bevmap_reverse_kernel( + points, + voxel_size, + coors_range, + coor_to_voxelidx, + bev_map, + height_lowers, + with_reflectivity, + max_voxels, + ) + return bev_map diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/ops/point_cloud/point_cloud_ops.py b/cv/3d_detection/centerpoint/pytorch/det3d/ops/point_cloud/point_cloud_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..358350857ba691eb3481ab5aa45d33a5e7135757 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/ops/point_cloud/point_cloud_ops.py @@ -0,0 +1,202 @@ +import time + +import numba +import numpy as np + + +@numba.jit(nopython=True) +def _points_to_voxel_reverse_kernel( + points, + voxel_size, + coors_range, 
+ num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points=35, + max_voxels=20000, +): + # put all computations to one loop. + # we shouldn't create large array in main jit code, otherwise + # reduce performance + N = points.shape[0] + # ndim = points.shape[1] - 1 + ndim = 3 + ndim_minus_1 = ndim - 1 + grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size + # np.round(grid_size) + # grid_size = np.round(grid_size).astype(np.int64)(np.int32) + grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) + coor = np.zeros(shape=(3,), dtype=np.int32) + voxel_num = 0 + failed = False + for i in range(N): + failed = False + for j in range(ndim): + c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) + if c < 0 or c >= grid_size[j]: + failed = True + break + coor[ndim_minus_1 - j] = c + if failed: + continue + voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] + if voxelidx == -1: + voxelidx = voxel_num + if voxel_num >= max_voxels: + continue + voxel_num += 1 + coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx + coors[voxelidx] = coor + num = num_points_per_voxel[voxelidx] + if num < max_points: + voxels[voxelidx, num] = points[i] + num_points_per_voxel[voxelidx] += 1 + return voxel_num + + +@numba.jit(nopython=True) +def _points_to_voxel_kernel( + points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points=35, + max_voxels=20000, +): + # need mutex if write in cuda, but numba.cuda don't support mutex. + # in addition, pytorch don't support cuda in dataloader(tensorflow support this). + # put all computations to one loop. 
+ # we shouldn't create large array in main jit code, otherwise + # decrease performance + N = points.shape[0] + # ndim = points.shape[1] - 1 + ndim = 3 + grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size + # grid_size = np.round(grid_size).astype(np.int64)(np.int32) + grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) + + lower_bound = coors_range[:3] + upper_bound = coors_range[3:] + coor = np.zeros(shape=(3,), dtype=np.int32) + voxel_num = 0 + failed = False + for i in range(N): + failed = False + for j in range(ndim): + c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) + if c < 0 or c >= grid_size[j]: + failed = True + break + coor[j] = c + if failed: + continue + voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] + if voxelidx == -1: + voxelidx = voxel_num + if voxel_num >= max_voxels: + continue + voxel_num += 1 + coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx + coors[voxelidx] = coor + num = num_points_per_voxel[voxelidx] + if num < max_points: + voxels[voxelidx, num] = points[i] + num_points_per_voxel[voxelidx] += 1 + return voxel_num + + +def points_to_voxel( + points, voxel_size, coors_range, max_points=35, reverse_index=True, max_voxels=20000 +): + """convert kitti points(N, >=3) to voxels. This version calculate + everything in one loop. now it takes only 4.2ms(complete point cloud) + with jit and 3.2ghz cpu.(don't calculate other features) + Note: this function in ubuntu seems faster than windows 10. + + Args: + points: [N, ndim] float tensor. points[:, :3] contain xyz points and + points[:, 3:] contain other information such as reflectivity. + voxel_size: [3] list/tuple or array, float. xyz, indicate voxel size + coors_range: [6] list/tuple or array, float. indicate voxel range. + format: xyzxyz, minmax + max_points: int. indicate maximum points contained in a voxel. + reverse_index: boolean. indicate whether return reversed coordinates. 
+ if points has xyz format and reverse_index is True, output + coordinates will be zyx format, but points in features always + xyz format. + max_voxels: int. indicate maximum voxels this function create. + for second, 20000 is a good choice. you should shuffle points + before call this function because max_voxels may drop some points. + + Returns: + voxels: [M, max_points, ndim] float tensor. only contain points. + coordinates: [M, 3] int32 tensor. + num_points_per_voxel: [M] int32 tensor. + """ + if not isinstance(voxel_size, np.ndarray): + voxel_size = np.array(voxel_size, dtype=points.dtype) + if not isinstance(coors_range, np.ndarray): + coors_range = np.array(coors_range, dtype=points.dtype) + voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size + voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist()) + if reverse_index: + voxelmap_shape = voxelmap_shape[::-1] + # don't create large array in jit(nopython=True) code. + num_points_per_voxel = np.zeros(shape=(max_voxels,), dtype=np.int32) + coor_to_voxelidx = -np.ones(shape=voxelmap_shape, dtype=np.int32) + voxels = np.zeros( + shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype + ) + coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32) + if reverse_index: + voxel_num = _points_to_voxel_reverse_kernel( + points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points, + max_voxels, + ) + + else: + voxel_num = _points_to_voxel_kernel( + points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points, + max_voxels, + ) + + coors = coors[:voxel_num] + voxels = voxels[:voxel_num] + num_points_per_voxel = num_points_per_voxel[:voxel_num] + return voxels, coors, num_points_per_voxel + + +@numba.jit(nopython=True) +def bound_points_jit(points, upper_bound, lower_bound): + # to use nopython=True, np.bool is not supported. 
so you need + # convert result to np.bool after this function. + N = points.shape[0] + ndim = points.shape[1] + keep_indices = np.zeros((N,), dtype=np.int32) + success = 0 + for i in range(N): + success = 1 + for j in range(ndim): + if points[i, j] < lower_bound[j] or points[i, j] >= upper_bound[j]: + success = 0 + break + keep_indices[i] = success + return keep_indices diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/solver/__init__.py b/cv/3d_detection/centerpoint/pytorch/det3d/solver/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/solver/background.py b/cv/3d_detection/centerpoint/pytorch/det3d/solver/background.py new file mode 100644 index 0000000000000000000000000000000000000000..2c7d08238196bcee16d8ab27f0f7537f8864fd56 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/solver/background.py @@ -0,0 +1,28 @@ +import threading, queue + + +class BackgroundGenerator(threading.Thread): + def __init__(self, generator, max_prefetch=1): + threading.Thread.__init__(self) + self.queue = queue.Queue(max_prefetch) + self.generator = generator + self.daemon = True + self.start() + + def run(self): + for item in self.generator: + self.queue.put(item) + self.queue.put(None) + + def next(self): + next_item = self.queue.get() + if next_item is None: + raise StopIteration + return next_item + + # Python 3 compatibility + def __next__(self): + return self.next() + + def __iter__(self): + return self diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/solver/fastai_optim.py b/cv/3d_detection/centerpoint/pytorch/det3d/solver/fastai_optim.py new file mode 100644 index 0000000000000000000000000000000000000000..a54344780f8d816f53a404fbe01f82775f838c4a --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/solver/fastai_optim.py @@ -0,0 +1,309 @@ +from collections import Iterable, defaultdict +from copy import deepcopy +from 
itertools import chain + +import torch +from torch import nn +from torch._utils import _unflatten_dense_tensors +from torch.autograd import Variable +from torch.nn.utils import parameters_to_vector +try: + from apex.parallel.optimized_sync_batchnorm import SyncBatchNorm + bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.modules.batchnorm._BatchNorm, SyncBatchNorm) +except: + print('no apex') + bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d,nn.modules.batchnorm._BatchNorm) + +def split_bn_bias(layer_groups): + "Split the layers in `layer_groups` into batchnorm (`bn_types`) and non-batchnorm groups." + split_groups = [] + for l in layer_groups: + l1, l2 = [], [] + for c in l.children(): + if isinstance(c, bn_types): + l2.append(c) + else: + l1.append(c) + split_groups += [nn.Sequential(*l1), nn.Sequential(*l2)] + return split_groups + + +def get_master(layer_groups, flat_master: bool = False): + "Return two lists, one for the model parameters in FP16 and one for the master parameters in FP32." + split_groups = split_bn_bias(layer_groups) + model_params = [ + [param for param in lg.parameters() if param.requires_grad] + for lg in split_groups + ] + if flat_master: + master_params = [] + for lg in model_params: + if len(lg) != 0: + mp = parameters_to_vector([param.data.float() for param in lg]) + mp = torch.nn.Parameter(mp, requires_grad=True) + if mp.grad is None: + mp.grad = mp.new(*mp.size()) + master_params.append([mp]) + else: + master_params.append([]) + return model_params, master_params + else: + master_params = [ + [param.clone().float().detach() for param in lg] for lg in model_params + ] + for mp in master_params: + for param in mp: + param.requires_grad = True + return model_params, master_params + + +def model_g2master_g(model_params, master_params, flat_master: bool = False) -> None: + "Copy the `model_params` gradients to `master_params` for the optimizer step." 
+ if flat_master: + for model_group, master_group in zip(model_params, master_params): + if len(master_group) != 0: + master_group[0].grad.data.copy_( + parameters_to_vector([p.grad.data.float() for p in model_group]) + ) + else: + for model_group, master_group in zip(model_params, master_params): + for model, master in zip(model_group, master_group): + if model.grad is not None: + if master.grad is None: + master.grad = master.data.new(*master.data.size()) + master.grad.data.copy_(model.grad.data) + else: + master.grad = None + + +def master2model(model_params, master_params, flat_master: bool = False) -> None: + "Copy `master_params` to `model_params`." + if flat_master: + for model_group, master_group in zip(model_params, master_params): + if len(model_group) != 0: + for model, master in zip( + model_group, + _unflatten_dense_tensors(master_group[0].data, model_group), + ): + model.data.copy_(master) + else: + for model_group, master_group in zip(model_params, master_params): + for model, master in zip(model_group, master_group): + model.data.copy_(master.data) + + +def listify(p=None, q=None): + "Make `p` listy and the same length as `q`." + if p is None: + p = [] + elif isinstance(p, str): + p = [p] + elif not isinstance(p, Iterable): + p = [p] + n = q if type(q) == int else len(p) if q is None else len(q) + if len(p) == 1: + p = p * n + assert len(p) == n, f"List len mismatch ({len(p)} vs {n})" + return list(p) + + +def trainable_params(m: nn.Module): + "Return list of trainable params in `m`." + res = filter(lambda p: p.requires_grad, m.parameters()) + return res + + +def is_tuple(x) -> bool: + return isinstance(x, tuple) + + +# copy from fastai. +class OptimWrapper: + "Basic wrapper around `opt` to simplify hyper-parameters changes." 
def new(self, layer_groups):
    """Create a new `OptimWrapper` over `layer_groups` with the same hyper-parameters.

    The optimizer factory is the ``opt_func`` stored by `create`; when the
    wrapper was built directly from an optimizer instance, the class of the
    wrapped optimizer is used instead.

    Args:
        layer_groups: layer groups handed to `create`.

    Returns:
        The wrapper returned by `create`, configured with this wrapper's
        ``lr``, ``wd``, ``true_wd`` and ``bn_wd``.
    """
    # `__getattr__` answers None for unknown attributes instead of raising, so
    # a `getattr(self, "opt_func", default)` default would never be used;
    # check for None explicitly.
    opt_func = getattr(self, "opt_func", None)
    if opt_func is None:
        opt_func = self.opt.__class__
    # `create` builds the optimizer itself; the original additionally built a
    # throw-away optimizer here that was never used.
    return self.create(
        opt_func,
        self.lr,
        layer_groups,
        wd=self.wd,
        true_wd=self.true_wd,
        bn_wd=self.bn_wd,
    )
+ sd = self.state_dict() + sd["state"] = {} + self.load_state_dict(sd) + + # Hyperparameters as properties + @property + def lr(self) -> float: + return self._lr[-1] + + @lr.setter + def lr(self, val: float) -> None: + self._lr = self.set_val("lr", listify(val, self._lr)) + + @property + def mom(self) -> float: + return self._mom[-1] + + @mom.setter + def mom(self, val: float) -> None: + if "momentum" in self.opt_keys: + self.set_val("momentum", listify(val, self._mom)) + elif "betas" in self.opt_keys: + self.set_val("betas", (listify(val, self._mom), self._beta)) + self._mom = listify(val, self._mom) + + @property + def beta(self) -> float: + return None if self._beta is None else self._beta[-1] + + @beta.setter + def beta(self, val: float) -> None: + "Set beta (or alpha as makes sense for given optimizer)." + if val is None: + return + if "betas" in self.opt_keys: + self.set_val("betas", (self._mom, listify(val, self._beta))) + elif "alpha" in self.opt_keys: + self.set_val("alpha", listify(val, self._beta)) + self._beta = listify(val, self._beta) + + @property + def wd(self) -> float: + return self._wd[-1] + + @wd.setter + def wd(self, val: float) -> None: + "Set weight decay." + if not self.true_wd: + self.set_val("weight_decay", listify(val, self._wd), bn_groups=self.bn_wd) + self._wd = listify(val, self._wd) + + # Helper functions + def read_defaults(self) -> None: + "Read the values inside the optimizer for the hyper-parameters." + self._beta = None + if "lr" in self.opt_keys: + self._lr = self.read_val("lr") + if "momentum" in self.opt_keys: + self._mom = self.read_val("momentum") + if "alpha" in self.opt_keys: + self._beta = self.read_val("alpha") + if "betas" in self.opt_keys: + self._mom, self._beta = self.read_val("betas") + if "weight_decay" in self.opt_keys: + self._wd = self.read_val("weight_decay") + + def set_val(self, key: str, val, bn_groups: bool = True): + "Set `val` inside the optimizer dictionary at `key`." 
@classmethod
def create(
    cls,
    opt_func,
    lr,
    layer_groups,
    model,
    flat_master=False,
    loss_scale=512.0,
    **kwargs,
):
    """Create a mixed-precision optimizer wrapper from `opt_func` with `lr`.

    Args:
        opt_func: callable building the inner optimizer from param groups.
        lr: learning rate(s) set on `layer_groups`.
        layer_groups: layer groups whose parameters are optimised.
        model: model whose gradients are zeroed after each `step`.
        flat_master: forwarded to `get_master`.
        loss_scale: value the master gradients are divided by in `step`.
        **kwargs: forwarded to `OptimWrapper.create`.

    Returns:
        An instance of `cls` whose inner optimizer works on the master
        parameters returned by `get_master`.
    """
    # Go through the parent's classmethod bound to `cls`.  Calling
    # `OptimWrapper.create` directly returned a plain `OptimWrapper`, so the
    # overridden `step` of this class (gradient copy, loss un-scaling,
    # master -> model copy) was never executed.
    opt = super(FastAIMixedOptim, cls).create(opt_func, lr, layer_groups, **kwargs)
    opt.model_params, opt.master_params = get_master(layer_groups, flat_master)
    opt.flat_master = flat_master
    opt.loss_scale = loss_scale
    opt.model = model
    # Changes the optimizer so that the optimization step is done on the
    # master parameters; hyper-parameters are saved first and restored after.
    mom, wd, beta = opt.mom, opt.wd, opt.beta
    # Every learning rate is repeated twice because each layer group was split
    # into two parameter groups (non-batchnorm / batchnorm).
    lrs = [lr for lr in opt._lr for _ in range(2)]
    opt_params = [
        {"params": mp, "lr": lr} for mp, lr in zip(opt.master_params, lrs)
    ]
    opt.opt = opt_func(opt_params)
    opt.mom, opt.wd, opt.beta = mom, wd, beta
    return opt
class _LRSchedulerStep(object):
    """Base class for step-indexed learning-rate schedules.

    Sub-classes implement `_get_lr_per_group`; `step` writes the resulting
    value into the ``"lr"`` entry of every param group of the optimizer.
    """

    def __init__(self, optimizer, last_step=-1):
        if not isinstance(optimizer, Optimizer):
            raise TypeError("{} is not an Optimizer".format(type(optimizer).__name__))
        self.optimizer = optimizer
        groups = optimizer.param_groups
        if last_step == -1:
            # Fresh run: remember the configured lr as the base lr.
            for group in groups:
                group.setdefault("initial_lr", group["lr"])
        else:
            # Resuming: every group must already carry its base lr.
            for index, group in enumerate(groups):
                if "initial_lr" not in group:
                    raise KeyError(
                        "param 'initial_lr' is not specified "
                        "in param_groups[{}] when resuming an optimizer".format(index)
                    )
        self.base_lrs = [group["initial_lr"] for group in groups]
        self.step(last_step + 1)
        self.last_step = last_step

    def get_lr(self):
        """Return the current learning rate of every param group."""
        return list(map(self._get_lr_per_group, self.base_lrs))

    def _get_lr_per_group(self, base_lr):
        """Return the learning rate for one group; implemented by sub-classes."""
        raise NotImplementedError

    def step(self, step=None):
        """Move to `step` (default: the next step) and update the optimizer."""
        self.last_step = self.last_step + 1 if step is None else step
        new_lrs = self.get_lr()
        for param_group, lr in zip(self.optimizer.param_groups, new_lrs):
            param_group["lr"] = lr


class Constant(_LRSchedulerStep):
    """Schedule that keeps every group at its base learning rate."""

    def __init__(self, optimizer, last_step=-1):
        super(Constant, self).__init__(optimizer, last_step)

    def _get_lr_per_group(self, base_lr):
        return base_lr
edition of manual_stepping in tensorflow. + DON'T SUPPORT PARAM GROUPS. + """ + + def __init__(self, optimizer, boundaries, rates, last_step=-1): + self._boundaries = boundaries + self._num_boundaries = len(boundaries) + self._learning_rates = rates + + if any([b < 0 for b in boundaries]) or any( + [not isinstance(b, int) for b in boundaries] + ): + raise ValueError("boundaries must be a list of positive integers") + if any([bnext <= b for bnext, b in zip(boundaries[1:], boundaries[:-1])]): + raise ValueError("Entries in boundaries must be strictly increasing.") + if any([not isinstance(r, float) for r in rates]): + raise ValueError("Learning rates must be floats") + if len(rates) != len(boundaries) + 1: + raise ValueError( + "Number of provided learning rates must exceed " + "number of boundary points by exactly 1." + ) + super().__init__(optimizer, last_step) + + def _get_lr_per_group(self, base_lr): + step = self.last_step + ret = None + for i, bound in enumerate(self._boundaries): + if step > bound: + ret = self._learning_rates[i + 1] + if ret is not None: + return ret + return self._learning_rates[0] + + +class ExponentialDecayWithBurnin(_LRSchedulerStep): + """Pytorch edition of manual_stepping in tensorflow. 
+ """ + + def __init__( + self, + optimizer, + learning_rate_decay_steps, + learning_rate_decay_factor, + burnin_learning_rate, + burnin_steps, + last_step=-1, + ): + self._decay_steps = learning_rate_decay_steps + self._decay_factor = learning_rate_decay_factor + self._burnin_learning_rate = burnin_learning_rate + self._burnin_steps = burnin_steps + + super().__init__(optimizer, last_step) + + def _get_lr_per_group(self, base_lr): + if self._burnin_learning_rate == 0: + burnin_learning_rate = base_lr + step = self.last_step + post_burnin_learning_rate = base_lr * self._decay_factor ^ ( + step // self._decay_steps + ) + if step < self._burnin_steps: + return burnin_learning_rate + else: + return post_burnin_learning_rate + + +class ExponentialDecay(_LRSchedulerStep): + def __init__( + self, + optimizer, + learning_rate_decay_steps, + learning_rate_decay_factor, + staircase=True, + last_step=-1, + ): + self._decay_steps = learning_rate_decay_steps + self._decay_factor = learning_rate_decay_factor + self._staircase = staircase + + super().__init__(optimizer, last_step) + + def _get_lr_per_group(self, base_lr): + step = self.last_step + if self._staircase: + post_burnin_learning_rate = base_lr * pow( + self._decay_factor, (step // self._decay_steps) + ) + else: + post_burnin_learning_rate = base_lr * pow( + self._decay_factor, (step / self._decay_steps) + ) + + return post_burnin_learning_rate + + +class CosineDecayWithWarmup(_LRSchedulerStep): + def __init__( + self, optimizer, total_steps, warmup_learning_rate, warmup_steps, last_step=-1 + ): + if total_steps < warmup_steps: + raise ValueError("total_steps must be larger or equal to " "warmup_steps.") + self._total_steps = total_steps + self._warmup_learning_rate = warmup_learning_rate + self._warmup_steps = warmup_steps + + super().__init__(optimizer, last_step) + + def _get_lr_per_group(self, base_lr): + if base_lr < self._warmup_learning_rate: + raise ValueError( + "learning_rate_base must be larger " "or equal 
class LRSchedulerStep(object):
    """Piecewise schedule that sets ``lr`` and ``mom`` on a fastai-style optimizer.

    Args:
        fai_optimizer: object exposing writable ``lr`` and ``mom`` attributes.
        total_step: total number of steps; phase starts are fractions of it.
        lr_phases: sequence of ``(start_fraction, func)`` pairs; ``func`` maps
            the progress inside the phase to a learning rate.  ``func`` may be
            a string, which is evaluated with ``eval``.
        mom_phases: same structure for the momentum; may be empty.

    Raises:
        AssertionError: if phase starts are not strictly increasing (after
            conversion to steps) or the first phase does not start at step 0.
    """

    def __init__(self, fai_optimizer, total_step, lr_phases, mom_phases):
        self.optimizer = fai_optimizer
        self.total_step = total_step
        self.lr_phases = self._build_phases(lr_phases, total_step)
        assert self.lr_phases[0][0] == 0
        self.mom_phases = self._build_phases(mom_phases, total_step)
        if len(mom_phases) > 0:
            assert self.mom_phases[0][0] == 0

    @staticmethod
    def _build_phases(phases, total_step):
        """Convert ``(start_fraction, func)`` pairs to ``(start, end, func)`` step triples."""
        built = []
        for i, (start, lambda_func) in enumerate(phases):
            start_step = int(start * total_step)
            if built:
                # Compare step with step.  The momentum branch used to compare
                # the previous *step* with the raw *fraction*, which rejected
                # any valid schedule with three or more momentum phases.
                assert built[-1][0] < start_step
            if isinstance(lambda_func, str):
                # NOTE(security): eval executes arbitrary code; phase strings
                # must only come from trusted configuration.
                lambda_func = eval(lambda_func)
            if i < len(phases) - 1:
                end_step = int(phases[i + 1][0] * total_step)
            else:
                end_step = total_step
            built.append((start_step, end_step, lambda_func))
        return built

    @staticmethod
    def _active_phase(phases, step):
        """Return the last phase whose start is <= `step`, or None."""
        active = None
        for phase in phases:
            if step >= phase[0]:
                active = phase
        return active

    def step(self, step):
        """Set the optimizer's ``lr`` and ``mom`` to their values at `step`."""
        lr_phase = self._active_phase(self.lr_phases, step)
        if lr_phase is not None:
            start, end, func = lr_phase
            self.optimizer.lr = func((step - start) / (end - start))
        mom_phase = self._active_phase(self.mom_phases, step)
        if mom_phase is not None:
            start, end, func = mom_phase
            self.optimizer.mom = func((step - start) / (end - start))
class ManualStepping(LRSchedulerStep):
    """Piecewise-constant learning-rate schedule.

    Args:
        fai_optimizer: optimizer handed to `LRSchedulerStep`.
        total_step: total number of steps.
        boundaries: fractions strictly between 0 and 1 at which the rate
            changes.
        rates: learning rates; exactly one more entry than `boundaries`.

    Raises:
        AssertionError: if a boundary is outside (0, 1) or the number of
            rates is not ``len(boundaries) + 1``.
    """

    def __init__(self, fai_optimizer, total_step, boundaries, rates):
        assert all([b > 0 and b < 1 for b in boundaries])
        assert len(boundaries) + 1 == len(rates)
        # Build a new list instead of `boundaries.insert(0, 0.0)`: inserting
        # modified the caller's list (breaking a second construction with the
        # same config) and failed for tuples.
        starts = [0.0] + list(boundaries)
        lr_phases = []
        for start, rate in zip(starts, rates):
            # Bind `rate` as a default so each phase keeps its own value.
            func = lambda p, _d=rate: _d
            lr_phases.append((start, func))
        super().__init__(fai_optimizer, total_step, lr_phases, [])
def set_grad(params, params_with_grad, scale=1.0):
    """Copy un-scaled gradients from `params_with_grad` into `params`.

    Args:
        params: tensors receiving the gradients.
        params_with_grad: tensors whose ``.grad`` is read, paired
            position-wise with `params`.
        scale: value the gradients are divided by; ``None`` disables the
            division.

    Returns:
        ``True`` as soon as a gradient containing nan/inf is found (that
        gradient and the following ones are not copied), ``False`` otherwise.
    """
    for param, param_w_grad in zip(params, params_with_grad):
        src_grad = param_w_grad.grad
        if src_grad is None:
            # No gradient was produced for this parameter (e.g. it is
            # frozen): clear the target gradient instead of failing.
            param.grad = None
            continue
        # Convert to the target dtype *before* dividing: dividing in the
        # source dtype can underflow small values, and dividing in place
        # modified the source gradients as a side effect.
        grad = src_grad.data.to(param.data.dtype)
        if scale is not None:
            grad = grad / scale
        if torch.isnan(grad).any() or torch.isinf(grad).any():
            return True  # invalid grad
        if param.grad is None:
            param.grad = torch.empty_like(param.data)
        param.grad.data.copy_(grad)
    return False
def step(self, closure=None):
    """Un-scale the gradients, step the inner optimizer and sync the weights.

    The gradients of the original parameter groups are copied (divided by
    ``grad_scale``) onto the copies held by the inner optimizer.  If a
    nan/inf gradient is found the scale is multiplied by ``dec_factor`` and
    the step is skipped.  Otherwise the inner optimizer is stepped and the
    updated copies are written back to the original parameters.

    Args:
        closure: optional closure forwarded to the inner optimizer.

    Raises:
        ValueError: if a nan/inf gradient is found while ``grad_scale`` is
            ``None`` or ``auto_scale`` is ``False``.
    """
    for g, g_copy in zip(self.param_groups, self.optimizer.param_groups):
        invalid = set_grad(g_copy["params"], g["params"], self.grad_scale)
        if invalid:
            if self.grad_scale is None or self.auto_scale is False:
                raise ValueError("nan/inf detected but auto_scale disabled.")
            self.grad_scale *= self.dec_factor
            # An overflow ends the current run of stable iterations; without
            # this reset the scale could be raised again right after it was
            # lowered.
            self.stable_iter_count = 0
            print("scale decay to {}".format(self.grad_scale))
            return
    if self.auto_scale is True:
        self.stable_iter_count += 1
        if self.stable_iter_count > self.num_iters_be_stable:
            if self.grad_scale is not None:
                self.grad_scale *= self.inc_factor
            self.stable_iter_count = 0

    if closure is None:
        self.optimizer.step()
    else:
        self.optimizer.step(closure)
    # Write the updated copies back into the original parameters.
    for g, g_copy in zip(self.param_groups, self.optimizer.param_groups):
        for p_copy, p in zip(g_copy["params"], g["params"]):
            p.data.copy_(p_copy.data)
def init_dist(launcher, backend="nccl", **kwargs):
    """Initialise the distributed backend for the given launcher.

    Sets the multiprocessing start method to ``"spawn"`` when none has been
    chosen yet, then delegates to the launcher-specific initialiser.

    Args:
        launcher: one of ``"pytorch"``, ``"mpi"`` or ``"slurm"``.
        backend: backend name forwarded to the initialiser.
        **kwargs: extra arguments forwarded to the initialiser.

    Raises:
        ValueError: for any other launcher.
    """
    start_method = mp.get_start_method(allow_none=True)
    if start_method is None:
        mp.set_start_method("spawn")

    if launcher not in ("pytorch", "mpi", "slurm"):
        raise ValueError("Invalid launcher type: {}".format(launcher))

    if launcher == "pytorch":
        _init_dist_pytorch(backend, **kwargs)
        return
    if launcher == "mpi":
        _init_dist_mpi(backend, **kwargs)
        return
    _init_dist_slurm(backend, **kwargs)
def set_random_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) generators."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)
def parse_losses(losses):
    """Reduce a dict of losses to a total loss and a dict of scalars.

    Each value is reduced with ``mean()`` (tensor) or the sum of the means of
    its elements (list).  The total is the sum of all reduced entries whose
    key contains ``"loss"``; it is stored under ``"loss"``.

    Returns:
        ``(loss, log_vars)``: the total as a tensor and an ``OrderedDict`` of
        Python scalars obtained with ``.item()``.

    Raises:
        TypeError: if a value is neither a tensor nor a list.
    """
    reduced = OrderedDict()
    for key, value in losses.items():
        if isinstance(value, torch.Tensor):
            reduced[key] = value.mean()
            continue
        if not isinstance(value, list):
            raise TypeError("{} is not a tensor or list of tensors".format(key))
        reduced[key] = sum(part.mean() for part in value)

    total = sum(entry for key, entry in reduced.items() if "loss" in key)
    reduced["loss"] = total

    log_vars = OrderedDict((key, entry.item()) for key, entry in reduced.items())
    return total, log_vars
def build_one_cycle_optimizer(model, optimizer_config):
    """Build the Adam-based `OptimWrapper` used with the one-cycle schedule.

    Args:
        model: model whose layers are flattened into a single layer group.
        optimizer_config: config object providing ``fixed_wd``, ``amsgrad``
            and ``wd``.

    Returns:
        The `OptimWrapper` created over the model's layer group.
    """
    if optimizer_config.fixed_wd:
        optimizer_func = partial(
            torch.optim.Adam, betas=(0.9, 0.99), amsgrad=optimizer_config.amsgrad
        )
    else:
        # The original read `optimizer_cfg.amsgrad`; that name does not exist
        # in this function, so this branch always raised NameError.
        optimizer_func = partial(torch.optim.Adam, amsgrad=optimizer_config.amsgrad)

    optimizer = OptimWrapper.create(
        optimizer_func,
        3e-3,  # TODO: CHECKING LR HERE !!!
        get_layer_groups(model),
        wd=optimizer_config.wd,
        true_wd=optimizer_config.fixed_wd,
        bn_wd=True,
    )

    return optimizer
+ Positional fields are: + - type: class name of the optimizer. + - lr: base learning rate. + Optional fields are: + - any arguments of the corresponding optimizer type, e.g., + weight_decay, momentum, etc. + - paramwise_options: a dict with 3 accepted fileds + (bias_lr_mult, bias_decay_mult, norm_decay_mult). + `bias_lr_mult` and `bias_decay_mult` will be multiplied to + the lr and weight decay respectively for all bias parameters + (except for the normalization layers), and + `norm_decay_mult` will be multiplied to the weight decay + for all weight and bias parameters of normalization layers. + Returns: + torch.optim.Optimizer: The initialized optimizer. + """ + if hasattr(model, "module"): + model = model.module + + optimizer_cfg = optimizer_cfg.copy() + paramwise_options = optimizer_cfg.pop("paramwise_options", None) + # if no paramwise option is specified, just use the global setting + if paramwise_options is None: + return obj_from_dict( + optimizer_cfg, torch.optim, dict(params=model.parameters()) + ) + else: + assert isinstance(paramwise_options, dict) + # get base lr and weight decay + base_lr = optimizer_cfg["lr"] + base_wd = optimizer_cfg.get("weight_decay", None) + # weight_decay must be explicitly specified if mult is specified + if ( + "bias_decay_mult" in paramwise_options + or "norm_decay_mult" in paramwise_options + ): + assert base_wd is not None + # get param-wise options + bias_lr_mult = paramwise_options.get("bias_lr_mult", 1.0) + bias_decay_mult = paramwise_options.get("bias_decay_mult", 1.0) + norm_decay_mult = paramwise_options.get("norm_decay_mult", 1.0) + # set param-wise lr and weight decay + params = [] + for name, param in model.named_parameters(): + param_group = {"params": [param]} + if not param.requires_grad: + # FP16 training needs to copy gradient/weight between master + # weight copy and model weight, it is convenient to keep all + # parameters here to align with model.parameters() + params.append(param_group) + continue + + # 
def train_detector(model, dataset, cfg, distributed=False, validate=False, logger=None):
    """Build loaders, optimizer, scheduler and trainer, then run training.

    Args:
        model: Detector to train. It is moved to GPU here and, when
            ``distributed`` is true, converted to sync-BN and wrapped in
            ``DistributedDataParallel``.
        dataset: A single dataset or a list/tuple of datasets; one data
            loader is built per dataset.
        cfg: Training config. Fields read here: ``log_level``, ``data``,
            ``total_epochs``, ``lr_config``, ``optimizer``, ``drop_step``
            (non one-cycle schedule only), ``local_rank``, ``work_dir``,
            ``optimizer_config``, ``checkpoint_config``, ``log_config``,
            ``resume_from``, ``load_from`` and ``workflow``.
            NOTE: ``cfg.lr_config`` is overwritten with ``None`` by this call.
        distributed (bool): Whether to set up multi-process training.
        validate (bool): Currently unused; the evaluation-hook registration
            below is commented out.
        logger: Logger to use; a root logger is created from
            ``cfg.log_level`` when omitted.
    """
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # start training
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, dist=distributed
        )
        for ds in dataset
    ]

    # Scheduler length is derived from the first loader only.
    total_steps = cfg.total_epochs * len(data_loaders[0])
    # print(f"total_steps: {total_steps}")
    if distributed:
        # Convert BN layers before the optimizer is built from the model.
        model = apex.parallel.convert_syncbn_model(model)
    if cfg.lr_config.type == "one_cycle":
        # build trainer
        optimizer = build_one_cycle_optimizer(model, cfg.optimizer)
        lr_scheduler = _create_learning_rate_scheduler(
            optimizer, cfg.lr_config, total_steps
        )
        # The scheduler is handed to the Trainer directly, so no lr hook
        # config is passed to register_training_hooks below.
        cfg.lr_config = None
    else:
        optimizer = build_optimizer(model, cfg.optimizer)
        # Step schedule: multiply the lr by 0.1 at each cfg.drop_step milestone.
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=cfg.drop_step, gamma=.1)
        # lr_scheduler = None
        cfg.lr_config = None

    # put model on gpus
    if distributed:
        model = DistributedDataParallel(
            model.cuda(cfg.local_rank),
            device_ids=[cfg.local_rank],
            output_device=cfg.local_rank,
            # broadcast_buffers=False,
            find_unused_parameters=True,
        )
    else:
        model = model.cuda()

    logger.info(f"model structure: {model}")

    trainer = Trainer(
        model, batch_processor, optimizer, lr_scheduler, cfg.work_dir, cfg.log_level
    )

    if distributed:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    # cfg.lr_config is always None at this point (see both branches above).
    trainer.register_training_hooks(
        cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config
    )

    if distributed:
        trainer.register_hook(DistSamplerSeedHook())

    # # register eval hooks
    # if validate:
    #     val_dataset_cfg = cfg.data.val
    #     eval_cfg = cfg.get('evaluation', {})
    #     dataset_type = DATASETS.get(val_dataset_cfg.type)
    #     trainer.register_hook(
    #         KittiEvalmAPHookV2(val_dataset_cfg, **eval_cfg))

    # resume_from takes precedence over load_from.
    if cfg.resume_from:
        trainer.resume(cfg.resume_from)
    elif cfg.load_from:
        trainer.load_checkpoint(cfg.load_from)

    trainer.run(data_loaders, cfg.workflow, cfg.total_epochs, local_rank=cfg.local_rank)
class AlexNet(nn.Module):
    """AlexNet feature extractor with an optional classification head.

    Args:
        num_classes (int): Size of the classifier output. With a value
            ``<= 0`` (the default is -1) no classifier is built and
            ``forward`` returns the convolutional feature map.
    """

    def __init__(self, num_classes=-1):
        super().__init__()
        self.num_classes = num_classes

        # Layers are created in the same order as before so parameter names
        # (features.0, features.3, ...) and random initialisation match.
        feature_layers = [
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        self.features = nn.Sequential(*feature_layers)

        if self.num_classes <= 0:
            return

        head_layers = [
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def init_weights(self, pretrained=None):
        """Load weights from ``pretrained`` or keep the default initialisation.

        Args:
            pretrained (str, optional): Checkpoint location passed to
                ``load_checkpoint``. ``None`` leaves the weights untouched.

        Raises:
            TypeError: If ``pretrained`` is neither a str nor None.
        """
        if pretrained is None:
            # use default initializer
            return
        if not isinstance(pretrained, str):
            raise TypeError("pretrained must be a str or None")
        root_logger = logging.getLogger()
        load_checkpoint(self, pretrained, strict=False, logger=root_logger)

    def forward(self, x):
        """Return class scores when a classifier exists, else the features."""
        feats = self.features(x)
        if self.num_classes <= 0:
            return feats
        # The classifier expects the fixed 256x6x6 feature map, flattened.
        flat = feats.view(feats.size(0), 256 * 6 * 6)
        return self.classifier(flat)
class Bottleneck(nn.Module):
    """Three-convolution (1x1 -> 3x3 -> 1x1) residual block."""

    expansion = 4

    def __init__(
        self,
        inplanes,
        planes,
        stride=1,
        dilation=1,
        downsample=None,
        style="pytorch",
        with_cp=False,
    ):
        """Bottleneck block.

        With style "pytorch" the stride is applied by the 3x3 conv; with
        style "caffe" it is applied by the first 1x1 conv.
        """
        super().__init__()
        assert style in ["pytorch", "caffe"]
        # (stride of conv1, stride of conv2), depending on the chosen style.
        conv1_stride, conv2_stride = (1, stride) if style == "pytorch" else (stride, 1)
        out_planes = planes * self.expansion

        # Registration order is kept (conv1, conv2, bn1, bn2, conv3, bn3) so
        # the state_dict layout is unchanged.
        self.conv1 = nn.Conv2d(
            inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False
        )
        self.conv2 = nn.Conv2d(
            planes,
            planes,
            kernel_size=3,
            stride=conv2_stride,
            padding=dilation,
            dilation=dilation,
            bias=False,
        )
        self.bn1 = nn.BatchNorm2d(planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)

        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
        self.with_cp = with_cp

    def forward(self, x):
        """Apply the block; the residual branch may run under checkpointing."""

        def _inner_forward(x):
            out = self.relu(self.bn1(self.conv1(x)))
            out = self.relu(self.bn2(self.conv2(out)))
            out = self.bn3(self.conv3(out))

            # Identity shortcut unless a projection was supplied.
            residual = x if self.downsample is None else self.downsample(x)
            out += residual
            return out

        use_checkpoint = self.with_cp and x.requires_grad
        out = cp.checkpoint(_inner_forward, x) if use_checkpoint else _inner_forward(x)
        return self.relu(out)
class ResNet(nn.Module):
    """ResNet backbone.

    Args:
        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
        num_stages (int): Resnet stages, normally 4.
        strides (Sequence[int]): Strides of the first block of each stage.
        dilations (Sequence[int]): Dilation of each stage.
        out_indices (Sequence[int]): Output from which stages.
        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
            layer is the 3x3 conv layer, otherwise the stride-two layer is
            the first 1x1 conv layer.
        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
            not freezing any parameters.
        bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze
            running stats (mean and var).
        bn_frozen (bool): Whether to freeze weight and bias of BN layers.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed.
    """

    # depth -> (block class, number of blocks in each of the four stages)
    arch_settings = {
        18: (BasicBlock, (2, 2, 2, 2)),
        34: (BasicBlock, (3, 4, 6, 3)),
        50: (Bottleneck, (3, 4, 6, 3)),
        101: (Bottleneck, (3, 4, 23, 3)),
        152: (Bottleneck, (3, 8, 36, 3)),
    }

    def __init__(
        self,
        depth,
        num_stages=4,
        strides=(1, 2, 2, 2),
        dilations=(1, 1, 1, 1),
        out_indices=(0, 1, 2, 3),
        style="pytorch",
        frozen_stages=-1,
        bn_eval=True,
        bn_frozen=False,
        with_cp=False,
    ):
        super(ResNet, self).__init__()
        if depth not in self.arch_settings:
            raise KeyError("invalid depth {} for resnet".format(depth))
        assert num_stages >= 1 and num_stages <= 4
        block, stage_blocks = self.arch_settings[depth]
        stage_blocks = stage_blocks[:num_stages]
        assert len(strides) == len(dilations) == num_stages
        assert max(out_indices) < num_stages

        self.out_indices = out_indices
        self.style = style
        self.frozen_stages = frozen_stages
        self.bn_eval = bn_eval
        self.bn_frozen = bn_frozen
        self.with_cp = with_cp

        # Stem: 7x7 stride-2 conv, BN, ReLU and a stride-2 max pool.
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Stages are registered as attributes "layer1".."layerN"; only their
        # names are stored here and resolved with getattr in forward().
        self.res_layers = []
        for i, num_blocks in enumerate(stage_blocks):
            stride = strides[i]
            dilation = dilations[i]
            planes = 64 * 2 ** i
            res_layer = make_res_layer(
                block,
                self.inplanes,
                planes,
                num_blocks,
                stride=stride,
                dilation=dilation,
                style=self.style,
                with_cp=with_cp,
            )
            self.inplanes = planes * block.expansion
            layer_name = "layer{}".format(i + 1)
            self.add_module(layer_name, res_layer)
            self.res_layers.append(layer_name)

        # Channel count produced by the last built stage.
        self.feat_dim = block.expansion * 64 * 2 ** (len(stage_blocks) - 1)

    def init_weights(self, pretrained=None):
        """Load a checkpoint, or initialise conv and BN layers from scratch.

        Args:
            pretrained (str, optional): Checkpoint location passed to
                ``load_checkpoint``. With ``None`` every ``Conv2d`` goes
                through ``kaiming_init`` and every ``BatchNorm2d`` through
                ``constant_init(m, 1)``.

        Raises:
            TypeError: If ``pretrained`` is neither a str nor None.
        """
        if isinstance(pretrained, str):
            logger = logging.getLogger()
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    kaiming_init(m)
                elif isinstance(m, nn.BatchNorm2d):
                    constant_init(m, 1)
        else:
            raise TypeError("pretrained must be a str or None")

    def forward(self, x):
        """Run the backbone and return the outputs of the selected stages.

        Returns:
            A single tensor when exactly one stage is selected by
            ``out_indices``, otherwise a tuple of tensors.
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        outs = []
        for i, layer_name in enumerate(self.res_layers):
            res_layer = getattr(self, layer_name)
            x = res_layer(x)
            if i in self.out_indices:
                outs.append(x)
        if len(outs) == 1:
            return outs[0]
        return tuple(outs)

    def train(self, mode=True):
        """Switch train/eval mode while honouring the BN and freeze options.

        Returns:
            ResNet: ``self``, matching ``nn.Module.train``. ``nn.Module.eval``
            returns the result of ``self.train(False)``, so without this
            return value ``model.eval()`` would evaluate to ``None``.
        """
        super(ResNet, self).train(mode)
        if self.bn_eval:
            # Keep BN running statistics fixed even in training mode.
            for m in self.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
                    if self.bn_frozen:
                        for params in m.parameters():
                            params.requires_grad = False
        if mode and self.frozen_stages >= 0:
            # Freeze the stem, then the first `frozen_stages` stages.
            for param in self.conv1.parameters():
                param.requires_grad = False
            for param in self.bn1.parameters():
                param.requires_grad = False
            self.bn1.eval()
            for i in range(1, self.frozen_stages + 1):
                mod = getattr(self, "layer{}".format(i))
                mod.eval()
                for param in mod.parameters():
                    param.requires_grad = False
        return self
+ """ + + arch_settings = { + 11: (1, 1, 2, 2, 2), + 13: (2, 2, 2, 2, 2), + 16: (2, 2, 3, 3, 3), + 19: (2, 2, 4, 4, 4), + } + + def __init__( + self, + depth, + with_bn=False, + num_classes=-1, + num_stages=5, + dilations=(1, 1, 1, 1, 1), + out_indices=(0, 1, 2, 3, 4), + frozen_stages=-1, + bn_eval=True, + bn_frozen=False, + ceil_mode=False, + with_last_pool=True, + ): + super(VGG, self).__init__() + if depth not in self.arch_settings: + raise KeyError("invalid depth {} for vgg".format(depth)) + assert num_stages >= 1 and num_stages <= 5 + stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + assert len(dilations) == num_stages + assert max(out_indices) <= num_stages + + self.num_classes = num_classes + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.bn_eval = bn_eval + self.bn_frozen = bn_frozen + + self.inplanes = 3 + start_idx = 0 + vgg_layers = [] + self.range_sub_modules = [] + for i, num_blocks in enumerate(self.stage_blocks): + num_modules = num_blocks * (2 + with_bn) + 1 + end_idx = start_idx + num_modules + dilation = dilations[i] + planes = 64 * 2 ** i if i < 4 else 512 + vgg_layer = make_vgg_layer( + self.inplanes, + planes, + num_blocks, + dilation=dilation, + with_bn=with_bn, + ceil_mode=ceil_mode, + ) + vgg_layers.extend(vgg_layer) + self.inplanes = planes + self.range_sub_modules.append([start_idx, end_idx]) + start_idx = end_idx + if not with_last_pool: + vgg_layers.pop(-1) + self.range_sub_modules[-1][1] -= 1 + self.module_name = "features" + self.add_module(self.module_name, nn.Sequential(*vgg_layers)) + + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, num_classes), + ) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, 
def _fill_bias(module, bias):
    """Set ``module.bias`` to the constant ``bias`` when the module has one."""
    if hasattr(module, "bias") and module.bias is not None:
        nn.init.constant_(module.bias, bias)


def constant_init(module, val, bias=0):
    """Fill ``module.weight`` with ``val`` and the bias (if any) with ``bias``."""
    nn.init.constant_(module.weight, val)
    _fill_bias(module, bias)


def xavier_init(module, gain=1, bias=0, distribution="normal"):
    """Xavier-initialise ``module.weight`` and set the bias to a constant.

    Args:
        module: Module exposing a ``weight`` and optionally a ``bias``.
        gain: Scaling factor forwarded to the torch initialiser.
        bias: Constant written to the bias when one exists.
        distribution (str): "uniform" or "normal".
    """
    assert distribution in ["uniform", "normal"]
    if distribution == "uniform":
        nn.init.xavier_uniform_(module.weight, gain=gain)
    else:
        nn.init.xavier_normal_(module.weight, gain=gain)
    _fill_bias(module, bias)


def normal_init(module, mean=0, std=1, bias=0):
    """Draw ``module.weight`` from N(mean, std^2); set the bias to ``bias``."""
    nn.init.normal_(module.weight, mean, std)
    _fill_bias(module, bias)


def uniform_init(module, a=0, b=1, bias=0):
    """Draw ``module.weight`` from U(a, b); set the bias to ``bias``."""
    nn.init.uniform_(module.weight, a, b)
    _fill_bias(module, bias)


def kaiming_init(
    module, a=0, mode="fan_out", nonlinearity="relu", bias=0, distribution="normal"
):
    """Kaiming-initialise ``module.weight`` and set the bias to a constant.

    Args:
        module: Module exposing a ``weight`` and optionally a ``bias``.
        a: Negative slope forwarded to the torch initialiser.
        mode (str): "fan_in" or "fan_out".
        nonlinearity (str): Name of the non-linearity, e.g. "relu".
        bias: Constant written to the bias when one exists.
        distribution (str): "uniform" or "normal".
    """
    assert distribution in ["uniform", "normal"]
    if distribution == "uniform":
        nn.init.kaiming_uniform_(
            module.weight, a=a, mode=mode, nonlinearity=nonlinearity
        )
    else:
        nn.init.kaiming_normal_(
            module.weight, a=a, mode=mode, nonlinearity=nonlinearity
        )
    _fill_bias(module, bias)


def caffe2_xavier_init(module, bias=0):
    """Initialise ``module`` like Caffe2's ``XavierFill``.

    Args:
        module: Module exposing a ``weight`` and optionally a ``bias``.
        bias: Constant written to the bias when one exists.
    """
    # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch
    # Acknowledgment to FAIR's internal code
    kaiming_init(
        module,
        a=1,
        mode="fan_in",
        nonlinearity="leaky_relu",
        # Forward the caller's bias; it used to be dropped, so the bias was
        # always reset to 0 regardless of the argument.
        bias=bias,
        distribution="uniform",
    )
class BaseFileHandler(metaclass=ABCMeta):
    """Abstract interface for (de)serialising objects in one file format.

    Subclasses implement the three file-object/string primitives; the
    path-based helpers below open the file and delegate to them.

    ``ABCMeta`` is applied through the Python 3 ``metaclass=`` keyword. The
    former ``__metaclass__`` attribute is ignored on Python 3, so the
    abstract methods were never enforced.
    """

    @abstractmethod
    def load_from_fileobj(self, file, **kwargs):
        """Deserialise and return the object read from the open ``file``."""

    @abstractmethod
    def dump_to_fileobj(self, obj, file, **kwargs):
        """Serialise ``obj`` into the open ``file``."""

    @abstractmethod
    def dump_to_str(self, obj, **kwargs):
        """Return ``obj`` serialised in this handler's format."""

    def load_from_path(self, filepath, mode="r", **kwargs):
        """Open ``filepath`` with ``mode`` and load its content.

        Extra keyword arguments are forwarded to ``load_from_fileobj``.
        """
        with open(filepath, mode) as f:
            return self.load_from_fileobj(f, **kwargs)

    def dump_to_path(self, obj, filepath, mode="w", **kwargs):
        """Open ``filepath`` with ``mode`` and write ``obj`` to it.

        Extra keyword arguments are forwarded to ``dump_to_fileobj``.
        """
        with open(filepath, mode) as f:
            self.dump_to_fileobj(obj, f, **kwargs)
class JsonHandler(BaseFileHandler):
    """File handler for JSON, backed by the standard ``json`` module."""

    def load_from_fileobj(self, file, **kwargs):
        """Parse JSON from ``file``; ``kwargs`` are forwarded to ``json.load``.

        ``BaseFileHandler.load_from_path`` forwards keyword arguments here,
        so they must be accepted; without ``**kwargs`` any option passed to
        a JSON load raised ``TypeError``.
        """
        return json.load(file, **kwargs)

    def dump_to_fileobj(self, obj, file, **kwargs):
        """Write ``obj`` as JSON to ``file`` via ``json.dump``."""
        json.dump(obj, file, **kwargs)

    def dump_to_str(self, obj, **kwargs):
        """Return ``obj`` encoded as a JSON string via ``json.dumps``."""
        return json.dumps(obj, **kwargs)


class PickleHandler(BaseFileHandler):
    """File handler for pickle; files are opened in binary mode."""

    def load_from_fileobj(self, file, **kwargs):
        """Unpickle and return the object stored in ``file``."""
        # NOTE(review): unpickling executes arbitrary code; only load
        # pickle files that come from a trusted source.
        return pickle.load(file, **kwargs)

    def load_from_path(self, filepath, **kwargs):
        """Load from ``filepath``, forcing binary read mode ("rb")."""
        return super(PickleHandler, self).load_from_path(filepath, mode="rb", **kwargs)

    def dump_to_str(self, obj, **kwargs):
        """Return the pickled bytes of ``obj`` (protocol 2 unless given)."""
        kwargs.setdefault("protocol", 2)
        return pickle.dumps(obj, **kwargs)

    def dump_to_fileobj(self, obj, file, **kwargs):
        """Pickle ``obj`` into ``file`` (protocol 2 unless given)."""
        kwargs.setdefault("protocol", 2)
        pickle.dump(obj, file, **kwargs)

    def dump_to_path(self, obj, filepath, **kwargs):
        """Dump to ``filepath``, forcing binary write mode ("wb")."""
        super(PickleHandler, self).dump_to_path(obj, filepath, mode="wb", **kwargs)
def load(file, file_format=None, **kwargs):
    """Load data from json/yaml/pickle files.

    This method provides a unified api for loading data from serialized files.

    Args:
        file (str or :obj:`Path` or file-like object): Filename or a file-like
            object.
        file_format (str, optional): If not specified, the file format will be
            inferred from the file extension, otherwise use the specified one.
            Currently supported formats include "json", "yaml/yml" and
            "pickle/pkl".
        **kwargs: Forwarded unchanged to the selected handler.

    Returns:
        The content from the file.

    Raises:
        TypeError: If ``file_format`` is not a key of ``file_handlers``
            (including the case where it stays ``None`` because a file-like
            object was passed without an explicit format), or if ``file`` is
            neither a path string nor an object with a ``read`` attribute.
    """
    if isinstance(file, Path):
        file = str(file)
    if file_format is None and is_str(file):
        # Infer the format from the text after the last dot of the path.
        file_format = file.split(".")[-1]
    if file_format not in file_handlers:
        raise TypeError("Unsupported format: {}".format(file_format))

    handler = file_handlers[file_format]
    if is_str(file):
        obj = handler.load_from_path(file, **kwargs)
    elif hasattr(file, "read"):
        obj = handler.load_from_fileobj(file, **kwargs)
    else:
        raise TypeError('"file" must be a filepath str or a file-object')
    return obj


def dump(obj, file=None, file_format=None, **kwargs):
    """Dump data to json/yaml/pickle strings or files.

    This method provides a unified api for dumping data as strings or to files,
    and also supports custom arguments for each file format.

    Args:
        obj (any): The python object to be dumped.
        file (str or :obj:`Path` or file-like object, optional): If not
            specified, then the object is dump to a str, otherwise to a file
            specified by the filename or file-like object.
        file_format (str, optional): Same as :func:`load`.
        **kwargs: Forwarded unchanged to the selected handler.

    Returns:
        The handler's ``dump_to_str`` result when ``file`` is None; otherwise
        None (the data is written to ``file``).

    Raises:
        ValueError: If both ``file`` and ``file_format`` are None.
        TypeError: If ``file_format`` is not a key of ``file_handlers``, or
            if ``file`` is neither a path string nor an object with a
            ``write`` attribute.
    """
    if isinstance(file, Path):
        file = str(file)
    if file_format is None:
        if is_str(file):
            # Infer the format from the text after the last dot of the path.
            file_format = file.split(".")[-1]
        elif file is None:
            raise ValueError("file_format must be specified since file is None")
    if file_format not in file_handlers:
        raise TypeError("Unsupported format: {}".format(file_format))

    handler = file_handlers[file_format]
    if file is None:
        return handler.dump_to_str(obj, **kwargs)
    elif is_str(file):
        handler.dump_to_path(obj, file, **kwargs)
    elif hasattr(file, "write"):
        handler.dump_to_fileobj(obj, file, **kwargs)
    else:
        raise TypeError('"file" must be a filename str or a file-object')
def list_from_file(filename, prefix="", offset=0, max_num=0):
    """Read a text file into a list with one string per line.

    Args:
        filename (str): Path of the file to read.
        prefix (str): Text prepended to every returned item.
        offset (int): Number of leading lines to skip.
        max_num (int): Upper bound on the number of returned items; zero or
            a negative value means no limit.

    Returns:
        list[str]: The selected lines without their trailing newline.
    """
    limited = max_num > 0
    collected = []
    with open(filename, "r") as handle:
        for index, raw in enumerate(handle):
            if index < offset:
                continue
            if limited and len(collected) >= max_num:
                break
            collected.append(prefix + raw.rstrip("\n"))
    return collected


def dict_from_file(filename, key_type=str):
    """Read a whitespace-separated text file into a dict.

    Every line must hold at least two columns. The first column becomes the
    key (converted with ``key_type``). The value is the second column as a
    string when there are exactly two columns, otherwise the list of all
    remaining columns.

    Args:
        filename (str): Path of the file to read.
        key_type (type): Callable applied to the first column of each line.

    Returns:
        dict: The parsed mapping.
    """
    parsed = {}
    with open(filename, "r") as handle:
        for raw in handle:
            columns = raw.rstrip("\n").split()
            assert len(columns) >= 2
            head, tail = columns[0], columns[1:]
            parsed[key_type(head)] = tail[0] if len(tail) == 1 else tail
    return parsed
def synchronize_stream(output, devices, streams):
    """Make each device's current stream wait for its copy stream.

    ``output`` is either a tensor or a (possibly nested) list of tensors.
    A list is split into ``len(devices)`` equal chunks; chunk ``i`` is
    synchronised against ``devices[i]`` and ``streams[i]``.

    Raises:
        Exception: If ``output`` is neither a list nor a ``torch.Tensor``.
    """
    if isinstance(output, torch.Tensor):
        if output.numel() == 0:
            return
        with torch.cuda.device(devices[0]):
            current = torch.cuda.current_stream()
            current.wait_stream(streams[0])
            output.record_stream(current)
        return

    if not isinstance(output, list):
        raise Exception("Unknown type {}.".format(type(output)))

    per_device = len(output) // len(devices)
    for dev_idx, device in enumerate(devices):
        start = dev_idx * per_device
        for item in output[start : start + per_device]:
            synchronize_stream(item, [device], [streams[dev_idx]])


def get_input_device(input):
    """Return the CUDA device index of the first GPU tensor found, else -1.

    ``input`` is either a tensor or a (possibly nested) list of tensors.

    Raises:
        Exception: If ``input`` is neither a list nor a ``torch.Tensor``.
    """
    if isinstance(input, torch.Tensor):
        if input.is_cuda:
            return input.get_device()
        return -1

    if not isinstance(input, list):
        raise Exception("Unknown type {}.".format(type(input)))

    # Lazily scan the items; stop at the first one that lives on a GPU.
    found = (get_input_device(item) for item in input)
    return next((device for device in found if device != -1), -1)
target_gpus, streams) + + return tuple(outputs) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/collate.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/collate.py new file mode 100644 index 0000000000000000000000000000000000000000..fce4d0e5bf33d12b04f3e7b0d7ad92b14a034864 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/collate.py @@ -0,0 +1,163 @@ +import collections +from collections import defaultdict + +import numpy as np +import torch +import torch.nn.functional as F +from torch.utils.data.dataloader import default_collate + +from .data_container import DataContainer + + +def collate(batch, samples_per_gpu=1): + """Puts each data field into a tensor/DataContainer with outer dimension + batch size. + + Extend default_collate to add support for + :type:`~torchie.parallel.DataContainer`. There are 3 cases. + + 1. cpu_only = True, e.g., meta data + 2. cpu_only = False, stack = True, e.g., images tensors + 3. cpu_only = False, stack = False, e.g., gt bboxes + """ + + if not isinstance(batch, collections.Sequence): + raise TypeError("{} is not supported.".format(batch.dtype)) + + if isinstance(batch[0], DataContainer): + assert len(batch) % samples_per_gpu == 0 + stacked = [] + if batch[0].cpu_only: + for i in range(0, len(batch), samples_per_gpu): + stacked.append( + [sample.data for sample in batch[i : i + samples_per_gpu]] + ) + return DataContainer( + stacked, batch[0].stack, batch[0].padding_value, cpu_only=True + ) + elif batch[0].stack: + for i in range(0, len(batch), samples_per_gpu): + assert isinstance(batch[i].data, torch.Tensor) + + if batch[i].pad_dims is not None: + ndim = batch[i].dim() + assert ndim > batch[i].pad_dims + max_shape = [0 for _ in range(batch[i].pad_dims)] + for dim in range(1, batch[i].pad_dims + 1): + max_shape[dim - 1] = batch[i].size(-dim) + for sample in batch[i : i + samples_per_gpu]: + for dim in range(0, ndim - batch[i].pad_dims): + assert batch[i].size(dim) 
== sample.size(dim) + for dim in range(1, batch[i].pad_dims + 1): + max_shape[dim - 1] = max( + max_shape[dim - 1], sample.size(-dim) + ) + padded_samples = [] + for sample in batch[i : i + samples_per_gpu]: + pad = [0 for _ in range(batch[i].pad_dims * 2)] + for dim in range(1, batch[i].pad_dims + 1): + pad[2 * dim - 1] = max_shape[dim - 1] - sample.size(-dim) + padded_samples.append( + F.pad(sample.data, pad, value=sample.padding_value) + ) + stacked.append(default_collate(padded_samples)) + elif batch[i].pad_dims is None: + stacked.append( + default_collate( + [sample.data for sample in batch[i : i + samples_per_gpu]] + ) + ) + else: + raise ValueError("pad_dims should be either None or integers (1-3)") + + else: + for i in range(0, len(batch), samples_per_gpu): + stacked.append( + [sample.data for sample in batch[i : i + samples_per_gpu]] + ) + return DataContainer(stacked, batch[0].stack, batch[0].padding_value) + elif isinstance(batch[0], collections.Sequence): + transposed = zip(*batch) + return [collate(samples, samples_per_gpu) for samples in transposed] + elif isinstance(batch[0], collections.Mapping): + return { + key: collate([d[key] for d in batch], samples_per_gpu) for key in batch[0] + } + else: + return default_collate(batch) + + + +def collate_kitti(batch_list, samples_per_gpu=1): + example_merged = collections.defaultdict(list) + for example in batch_list: + if type(example) is list: + for subexample in example: + for k, v in subexample.items(): + example_merged[k].append(v) + else: + for k, v in example.items(): + example_merged[k].append(v) + batch_size = len(example_merged['metadata']) + ret = {} + # voxel_nums_list = example_merged["num_voxels"] + # example_merged.pop("num_voxels") + for key, elems in example_merged.items(): + if key in ["voxels", "num_points", "num_gt", "voxel_labels", "num_voxels", + "cyv_voxels", "cyv_num_points", "cyv_num_voxels"]: + ret[key] = torch.tensor(np.concatenate(elems, axis=0)) + elif key in [ + "gt_boxes", + ]: 
+ task_max_gts = [] + for task_id in range(len(elems[0])): + max_gt = 0 + for k in range(batch_size): + max_gt = max(max_gt, len(elems[k][task_id])) + task_max_gts.append(max_gt) + res = [] + for idx, max_gt in enumerate(task_max_gts): + batch_task_gt_boxes3d = np.zeros((batch_size, max_gt, 7)) + for i in range(batch_size): + batch_task_gt_boxes3d[i, : len(elems[i][idx]), :] = elems[i][idx] + res.append(batch_task_gt_boxes3d) + ret[key] = res + elif key == "metadata": + ret[key] = elems + elif key == "calib": + ret[key] = {} + for elem in elems: + for k1, v1 in elem.items(): + if k1 not in ret[key]: + ret[key][k1] = [v1] + else: + ret[key][k1].append(v1) + for k1, v1 in ret[key].items(): + ret[key][k1] = torch.tensor(np.stack(v1, axis=0)) + elif key == "points": + ret[key] = [torch.tensor(elem) for elem in elems] + elif key in ["coordinates", "cyv_coordinates"]: + coors = [] + for i, coor in enumerate(elems): + coor_pad = np.pad( + coor, ((0, 0), (1, 0)), mode="constant", constant_values=i + ) + coors.append(coor_pad) + ret[key] = torch.tensor(np.concatenate(coors, axis=0)) + elif key in ["anchors", "anchors_mask", "reg_targets", "reg_weights", "labels", "hm", "anno_box", + "ind", "mask", "cat"]: + + ret[key] = defaultdict(list) + res = [] + for elem in elems: + for idx, ele in enumerate(elem): + ret[key][str(idx)].append(torch.tensor(ele)) + for kk, vv in ret[key].items(): + res.append(torch.stack(vv)) + ret[key] = res + elif key == 'gt_boxes_and_cls': + ret[key] = torch.tensor(np.stack(elems, axis=0)) + else: + ret[key] = np.stack(elems, axis=0) + + return ret diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/data_container.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/data_container.py new file mode 100644 index 0000000000000000000000000000000000000000..46632ebbfef90344e8f84394c32f8fe79c2cfb13 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/data_container.py @@ -0,0 +1,81 @@ +import functools 
import torch


def assert_tensor_type(func):
    """Restrict a container method to instances whose payload is a tensor.

    The returned wrapper inspects ``args[0].data`` (``args[0]`` is the
    instance when ``func`` is used as a method).  If that payload is a
    ``torch.Tensor`` the call is forwarded unchanged; otherwise an
    ``AttributeError`` is raised that names the instance's class, the
    wrapped function and the instance's ``datatype``.
    """

    @functools.wraps(func)
    def checked(*args, **kwargs):
        # Happy path first: tensor payloads simply delegate to the method.
        if isinstance(args[0].data, torch.Tensor):
            return func(*args, **kwargs)
        raise AttributeError(
            "{} has no attribute {} for type {}".format(
                args[0].__class__.__name__, func.__name__, args[0].datatype
            )
        )

    return checked


class DataContainer:
    """Read-only wrapper that pairs a payload with batching hints.

    The container stores an arbitrary object together with four flags and
    exposes all of them through properties; it performs no processing
    itself.  The original documentation describes the intended handling:

    - copy to GPU, pad all tensors to the same size and stack them
    - copy to GPU without stacking
    - leave the object as is and pass it to the model

    Args:
        data: The wrapped object (any type).
        stack: Stored as ``stack``; intended to request stacking.
        padding_value: Stored as ``padding_value``; fill value for padding.
        cpu_only: Stored as ``cpu_only``; intended to keep data off the GPU.
        pad_dims: Number of trailing dimensions to pad.  Must be ``None``,
            1, 2 or 3 (checked with an ``assert``).
    """

    def __init__(self, data, stack=False, padding_value=0, cpu_only=False, pad_dims=2):
        self._data = data
        self._cpu_only = cpu_only
        self._stack = stack
        self._padding_value = padding_value
        # Only "no padding" or padding of the last 1-3 dimensions is accepted.
        assert pad_dims in (None, 1, 2, 3)
        self._pad_dims = pad_dims

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__, repr(self.data))

    @property
    def data(self):
        return self._data

    @property
    def datatype(self):
        # Tensors report their torch type string; anything else its Python type.
        payload = self.data
        if not isinstance(payload, torch.Tensor):
            return type(payload)
        return payload.type()

    @property
    def cpu_only(self):
        return self._cpu_only

    @property
    def stack(self):
        return self._stack

    @property
    def padding_value(self):
        return self._padding_value

    @property
    def pad_dims(self):
        return self._pad_dims

    @assert_tensor_type
    def size(self, *args, **kwargs):
        """Forward to ``Tensor.size`` of the payload (tensor payloads only)."""
        return self.data.size(*args, **kwargs)

    @assert_tensor_type
    def dim(self):
        """Forward to ``Tensor.dim`` of the payload (tensor payloads only)."""
        return self.data.dim()
a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/data_parallel.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/data_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..fe2869f3b1c217310b57dac840d2205e3997b757 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/data_parallel.py @@ -0,0 +1,8 @@ +from torch.nn.parallel import DataParallel + +from .scatter_gather import scatter_kwargs + + +class MegDataParallel(DataParallel): + def scatter(self, inputs, kwargs, device_ids): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/distributed.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..51b22d94d53c8f7e2ff0dc1fd9c04e8ef2623b5f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/distributed.py @@ -0,0 +1,45 @@ +import torch +import torch.distributed as dist +import torch.nn as nn +from torch._utils import _flatten_dense_tensors, _take_tensors, _unflatten_dense_tensors + +from .scatter_gather import scatter_kwargs + + +class MegDistributedDataParallel(nn.Module): + def __init__(self, module, dim=0, broadcast_buffers=True, bucket_cap_mb=25): + super(MegDistributedDataParallel, self).__init__() + self.module = module + self.dim = dim + self.broadcast_buffers = broadcast_buffers + + self.broadcast_bucket_size = bucket_cap_mb * 1024 * 1024 + self._sync_params() + + def _dist_broadcast_coalesced(self, tensors, buffer_size): + for tensors in _take_tensors(tensors, buffer_size): + flat_tensors = _flatten_dense_tensors(tensors) + dist.broadcast(flat_tensors, 0) + for tensor, synced in zip( + tensors, _unflatten_dense_tensors(flat_tensors, tensors) + ): + tensor.copy_(synced) + + def _sync_params(self): + module_states = list(self.module.state_dict().values()) + if len(module_states) > 0: + 
self._dist_broadcast_coalesced(module_states, self.broadcast_bucket_size) + if self.broadcast_buffers: + if torch.__version__ < "1.0": + buffers = [b.data for b in self.module._all_buffers()] + else: + buffers = [b.data for b in self.module.buffers()] + if len(buffers) > 0: + self._dist_broadcast_coalesced(buffers, self.broadcast_bucket_size) + + def scatter(self, inputs, kwargs, device_ids): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) + + def forward(self, *inputs, **kwargs): + inputs, kwargs = self.scatter(inputs, kwargs, [torch.cuda.current_device()]) + return self.module(*inputs[0], **kwargs[0]) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/scatter_gather.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/scatter_gather.py new file mode 100644 index 0000000000000000000000000000000000000000..1ea64d3d4224e364b17c83a8899381894f3a2bf6 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/parallel/scatter_gather.py @@ -0,0 +1,54 @@ +import torch +from torch.nn.parallel._functions import Scatter as OrigScatter + +from ._functions import Scatter +from .data_container import DataContainer + + +def scatter(inputs, target_gpus, dim=0): + """Scatter inputs to target gpus. + + The only difference from original :func:`scatter` is to add support for + :type:`~mmcv.parallel.DataContainer`. 
+ """ + + def scatter_map(obj): + if isinstance(obj, torch.Tensor): + return OrigScatter.apply(target_gpus, None, dim, obj) + if isinstance(obj, DataContainer): + if obj.cpu_only: + return obj.data + else: + return Scatter.forward(target_gpus, obj.data) + if isinstance(obj, tuple) and len(obj) > 0: + return list(zip(*map(scatter_map, obj))) + if isinstance(obj, list) and len(obj) > 0: + out = list(map(list, zip(*map(scatter_map, obj)))) + return out + if isinstance(obj, dict) and len(obj) > 0: + out = list(map(type(obj), zip(*map(scatter_map, obj.items())))) + return out + return [obj for targets in target_gpus] + + # After scatter_map is called, a scatter_map cell will exist. This cell + # has a reference to the actual function scatter_map, which has references + # to a closure that has a reference to the scatter_map cell (because the + # fn is recursive). To avoid this reference cycle, we set the function to + # None, clearing the cell + try: + return scatter_map(inputs) + finally: + scatter_map = None + + +def scatter_kwargs(inputs, kwargs, target_gpus, dim=0): + """Scatter with support for kwargs dictionary""" + inputs = scatter(inputs, target_gpus, dim) if inputs else [] + kwargs = scatter(kwargs, target_gpus, dim) if kwargs else [] + if len(inputs) < len(kwargs): + inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) + elif len(kwargs) < len(inputs): + kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) + inputs = tuple(inputs) + kwargs = tuple(kwargs) + return inputs, kwargs diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/__init__.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52158ea5c2476d46a25084dc4568a82a343fc201 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/__init__.py @@ -0,0 +1,58 @@ +from .checkpoint import ( + load_checkpoint, + load_state_dict, + save_checkpoint, + weights_to_cpu, 
+) +from .hooks import ( + CheckpointHook, + ClosureHook, + DistSamplerSeedHook, + Hook, + IterTimerHook, + LoggerHook, + LrUpdaterHook, + OptimizerHook, + PaviLoggerHook, + TensorboardLoggerHook, + TextLoggerHook, +) +from .log_buffer import LogBuffer +from .parallel_test import parallel_test +from .priority import Priority, get_priority +from .trainer import Trainer +from .utils import ( + get_dist_info, + get_host_info, + get_time_str, + master_only, + obj_from_dict, +) + +__all__ = [ + "Trainer", + "LogBuffer", + "Hook", + "CheckpointHook", + "ClosureHook", + "LrUpdaterHook", + "OptimizerHook", + "IterTimerHook", + "DistSamplerSeedHook", + "LoggerHook", + "TextLoggerHook", + "PaviLoggerHook", + "TensorboardLoggerHook", + "load_state_dict", + "load_checkpoint", + "weights_to_cpu", + "save_checkpoint", + "parallel_test", + "Priority", + "get_priority", + "get_host_info", + "get_dist_info", + "master_only", + "get_time_str", + "obj_from_dict", +] diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/checkpoint.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..61c1f670f3c1da994848dd83d06d05ef016a0147 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/checkpoint.py @@ -0,0 +1,260 @@ +import os +import os.path as osp +import pkgutil +import time +import warnings +from collections import OrderedDict +from importlib import import_module + +import torch +import torchvision +from det3d import torchie +from terminaltables import AsciiTable +from torch.utils import model_zoo + +from .utils import get_dist_info + +open_mmlab_model_urls = { + "vgg16_caffe": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/vgg16_caffe-292e1171.pth", # noqa: E501 + "resnet50_caffe": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnet50_caffe-788b5fa3.pth", # noqa: E501 + "resnet101_caffe": 
"https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnet101_caffe-3ad79236.pth", # noqa: E501 + "resnext50_32x4d": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnext50-32x4d-0ab1a123.pth", # noqa: E501 + "resnext101_32x4d": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnext101_32x4d-a5af3160.pth", # noqa: E501 + "resnext101_64x4d": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnext101_64x4d-ee2c6f71.pth", # noqa: E501 + "contrib/resnet50_gn": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnet50_gn_thangvubk-ad1730dd.pth", # noqa: E501 + "detectron/resnet50_gn": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnet50_gn-9186a21c.pth", # noqa: E501 + "detectron/resnet101_gn": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnet101_gn-cac0ab98.pth", # noqa: E501 + "jhu/resnet50_gn_ws": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnet50_gn_ws-15beedd8.pth", # noqa: E501 + "jhu/resnet101_gn_ws": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnet101_gn_ws-3e3c308c.pth", # noqa: E501 + "jhu/resnext50_32x4d_gn_ws": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnext50_32x4d_gn_ws-0d87ac85.pth", # noqa: E501 + "jhu/resnext101_32x4d_gn_ws": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnext101_32x4d_gn_ws-34ac1a9e.pth", # noqa: E501 + "jhu/resnext50_32x4d_gn": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnext50_32x4d_gn-c7e8b754.pth", # noqa: E501 + "jhu/resnext101_32x4d_gn": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnext101_32x4d_gn-ac3bb84e.pth", # noqa: E501 + "msra/hrnetv2_w18": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/hrnetv2_w18-00eb2006.pth", # noqa: E501 + 
"msra/hrnetv2_w32": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/hrnetv2_w32-dc9eeb4f.pth", # noqa: E501 + "msra/hrnetv2_w40": "https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/hrnetv2_w40-ed0b031c.pth", # noqa: E501 + "bninception_caffe": "https://open-mmlab.s3.ap-northeast-2.amazonaws.com/pretrain/third_party/bn_inception_caffe-ed2e8665.pth", # noqa: E501 + "kin400/i3d_r50_f32s2_k400": "https://open-mmlab.s3.ap-northeast-2.amazonaws.com/pretrain/third_party/i3d_r50_f32s2_k400-2c57e077.pth", # noqa: E501 + "kin400/nl3d_r50_f32s2_k400": "https://open-mmlab.s3.ap-northeast-2.amazonaws.com/pretrain/third_party/nl3d_r50_f32s2_k400-fa7e7caa.pth", # noqa: E501 +} # yapf: disable + +import torch.nn as nn +from typing import Set + +try: + import spconv.pytorch as spconv +except: + import spconv as spconv + +def find_all_spconv_keys(model: nn.Module, prefix="") -> Set[str]: + """ + Finds all spconv keys that need to have weight's transposed + from https://github.com/acivgin1/OpenPCDet/blob/8fc1a5d57bcb418d71d5118fb3df4b58d4ea0244/pcdet/utils/spconv_utils.py + """ + found_keys: Set[str] = set() + for name, child in model.named_children(): + new_prefix = f"{prefix}.{name}" if prefix != "" else name + + if isinstance(child, spconv.conv.SparseConvolution): + new_prefix = f"{new_prefix}.weight" + found_keys.add(new_prefix) + + found_keys.update(find_all_spconv_keys(child, prefix=new_prefix)) + + return found_keys + + +def load_state_dict(module, state_dict, strict=False, logger=None): + """Load state_dict into a module + """ + unexpected_keys = [] + shape_mismatch_pairs = [] + + own_state = module.state_dict() + + spconv_keys = find_all_spconv_keys(module) + + for name, param in state_dict.items(): + + if name in spconv_keys and name in own_state and own_state[name].shape != param.shape: + # from https://github.com/acivgin1/OpenPCDet/blob/8fc1a5d57bcb418d71d5118fb3df4b58d4ea0244/pcdet/models/detectors/detector3d_template.py + # 
with different spconv versions, we need to adapt weight shapes for spconv blocks + # adapt spconv weights from version 1.x to version 2.x if you used weights from spconv 1.x + + param_native = param.transpose(-1, -2) # (k1, k2, k3, c_in, c_out) to (k1, k2, k3, c_out, c_in) + if param_native.shape == own_state[name].shape: + param = param_native.contiguous() + else: + assert param.shape.__len__() == 5, 'currently only spconv 3D is supported' + param_implicit = param.permute(4, 0, 1, 2, 3) # (k1, k2, k3, c_in, c_out) to (c_out, k1, k2, k3, c_in) + if param_implicit.shape == own_state[name].shape: + param = param_implicit.contiguous() + + + # a hacky fixed to load a new voxelnet + if name not in own_state: + unexpected_keys.append(name) + continue + if isinstance(param, torch.nn.Parameter): + # backwards compatibility for serialized parameters + param = param.data + if param.size() != own_state[name].size(): + shape_mismatch_pairs.append([name, own_state[name].size(), param.size()]) + continue + own_state[name].copy_(param) + + all_missing_keys = set(own_state.keys()) - set(state_dict.keys()) + # ignore "num_batches_tracked" of BN layers + missing_keys = [key for key in all_missing_keys if "num_batches_tracked" not in key] + + err_msg = [] + if unexpected_keys: + err_msg.append( + "unexpected key in source state_dict: {}\n".format( + ", ".join(unexpected_keys) + ) + ) + if missing_keys: + err_msg.append( + "missing keys in source state_dict: {}\n".format(", ".join(missing_keys)) + ) + if shape_mismatch_pairs: + mismatch_info = "these keys have mismatched shape:\n" + header = ["key", "expected shape", "loaded shape"] + table_data = [header] + shape_mismatch_pairs + table = AsciiTable(table_data) + err_msg.append(mismatch_info + table.table) + + rank, _ = get_dist_info() + if len(err_msg) > 0 and rank == 0: + err_msg.insert(0, "The model and loaded state dict do not match exactly\n") + err_msg = "\n".join(err_msg) + if strict: + raise RuntimeError(err_msg) + elif logger 
is not None: + logger.warning(err_msg) + else: + print(err_msg) + + +def load_url_dist(url): + """ In distributed setting, this function only download checkpoint at + local rank 0 """ + rank, world_size = get_dist_info() + rank = int(os.environ.get("LOCAL_RANK", rank)) + if rank == 0: + checkpoint = model_zoo.load_url(url) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + checkpoint = model_zoo.load_url(url) + return checkpoint + + +def get_torchvision_models(): + model_urls = dict() + for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): + if ispkg: + continue + _zoo = import_module("torchvision.models.{}".format(name)) + if hasattr(_zoo, "model_urls"): + _urls = getattr(_zoo, "model_urls") + model_urls.update(_urls) + return model_urls + + +def load_checkpoint(model, filename, map_location='cpu', strict=False, logger=None): + """Load checkpoint from a file or URI. + + Args: + model (Module): Module to load checkpoint. + filename (str): Either a filepath or URL or modelzoo://xxxxxxx. + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + + Returns: + dict or OrderedDict: The loaded checkpoint. 
+ """ + # load checkpoint from modelzoo or file or url + if filename.startswith("modelzoo://"): + warnings.warn( + 'The URL scheme of "modelzoo://" is deprecated, please ' + 'use "torchvision://" instead' + ) + model_urls = get_torchvision_models() + model_name = filename[11:] + checkpoint = load_url_dist(model_urls[model_name]) + elif filename.startswith("torchvision://"): + model_urls = get_torchvision_models() + model_name = filename[14:] + checkpoint = load_url_dist(model_urls[model_name]) + elif filename.startswith("open-mmlab://"): + model_name = filename[13:] + checkpoint = load_url_dist(open_mmlab_model_urls[model_name]) + elif filename.startswith(("http://", "https://")): + checkpoint = load_url_dist(filename) + else: + if not osp.isfile(filename): + raise IOError("{} is not a checkpoint file".format(filename)) + checkpoint = torch.load(filename, map_location=map_location) + # get state_dict from checkpoint + if isinstance(checkpoint, OrderedDict): + state_dict = checkpoint + elif isinstance(checkpoint, dict) and "state_dict" in checkpoint: + state_dict = checkpoint["state_dict"] + else: + raise RuntimeError("No state_dict found in checkpoint file {}".format(filename)) + # strip prefix of state_dict + if list(state_dict.keys())[0].startswith("module."): + state_dict = {k[7:]: v for k, v in checkpoint["state_dict"].items()} + # load state_dict + if hasattr(model, "module"): + load_state_dict(model.module, state_dict, strict, logger) + else: + load_state_dict(model, state_dict, strict, logger) + return checkpoint + + +def weights_to_cpu(state_dict): + """Copy a model state_dict to cpu. + + Args: + state_dict (OrderedDict): Model weights on GPU. + + Returns: + OrderedDict: Model weights on GPU. + """ + state_dict_cpu = OrderedDict() + for key, val in state_dict.items(): + state_dict_cpu[key] = val.cpu() + return state_dict_cpu + + +def save_checkpoint(model, filename, optimizer=None, meta=None): + """Save checkpoint to file. 
+ + The checkpoint will have 3 fields: ``meta``, ``state_dict`` and + ``optimizer``. By default ``meta`` will contain version and time info. + + Args: + model (Module): Module whose params are to be saved. + filename (str): Checkpoint filename. + optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. + meta (dict, optional): Metadata to be saved in checkpoint. + """ + if meta is None: + meta = {} + elif not isinstance(meta, dict): + raise TypeError("meta must be a dict or None, but got {}".format(type(meta))) + + torchie.mkdir_or_exist(osp.dirname(filename)) + if hasattr(model, "module"): + model = model.module + + checkpoint = {"meta": meta, "state_dict": weights_to_cpu(model.state_dict())} + if optimizer is not None: + checkpoint["optimizer"] = optimizer.state_dict() + + torch.save(checkpoint, filename) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/__init__.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d45fb540b8f4848e62c2fdbd3c243dcffc7b58b6 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/__init__.py @@ -0,0 +1,24 @@ +from .checkpoint import CheckpointHook +from .closure import ClosureHook +from .hook import Hook +from .iter_timer import IterTimerHook +from .logger import LoggerHook, PaviLoggerHook, TensorboardLoggerHook, TextLoggerHook +from .lr_updater import LrUpdaterHook +from .memory import EmptyCacheHook +from .optimizer import OptimizerHook +from .sampler_seed import DistSamplerSeedHook + +__all__ = [ + "Hook", + "CheckpointHook", + "ClosureHook", + "LrUpdaterHook", + "OptimizerHook", + "IterTimerHook", + "DistSamplerSeedHook", + "EmptyCacheHook", + "LoggerHook", + "TextLoggerHook", + "PaviLoggerHook", + "TensorboardLoggerHook", +] diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/checkpoint.py 
b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..ee4c497ef5e33f3df6491495a9b94da61d91e900 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/checkpoint.py @@ -0,0 +1,22 @@ +from ..utils import master_only +from .hook import Hook + + +class CheckpointHook(Hook): + def __init__(self, interval=1, save_optimizer=True, out_dir=None, **kwargs): + self.interval = interval + self.save_optimizer = save_optimizer + self.out_dir = out_dir + self.args = kwargs + + @master_only + def after_train_epoch(self, trainer): + if not self.every_n_epochs(trainer, self.interval): + return + + if not self.out_dir: + self.out_dir = trainer.work_dir + + trainer.save_checkpoint( + self.out_dir, save_optimizer=self.save_optimizer, **self.args + ) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/closure.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/closure.py new file mode 100644 index 0000000000000000000000000000000000000000..8af542179c35e65d61fcadd1acc21a8112f1e06f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/closure.py @@ -0,0 +1,8 @@ +from .hook import Hook + + +class ClosureHook(Hook): + def __init__(self, fn_name, fn): + assert hasattr(self, fn_name) + assert callable(fn) + setattr(self, fn_name, fn) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/hook.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/hook.py new file mode 100644 index 0000000000000000000000000000000000000000..d4b29504a70ee4ebcbae47c5777f41b61b18c543 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/hook.py @@ -0,0 +1,63 @@ +class Hook(object): + def before_run(self, trainer): + pass + + def after_run(self, trainer): + pass + + def before_epoch(self, trainer): + pass + + def after_epoch(self, trainer): + pass + + def 
before_iter(self, trainer): + pass + + def after_iter(self, trainer): + pass + + def after_data_to_device(self, trainer): + pass + + def after_forward(self, trainer): + pass + + def after_parse_loss(self, trainer): + pass + + def before_train_epoch(self, trainer): + self.before_epoch(trainer) + + def before_val_epoch(self, trainer): + self.before_epoch(trainer) + + def after_train_epoch(self, trainer): + self.after_epoch(trainer) + + def after_val_epoch(self, trainer): + self.after_epoch(trainer) + + def before_train_iter(self, trainer): + self.before_iter(trainer) + + def before_val_iter(self, trainer): + self.before_iter(trainer) + + def after_train_iter(self, trainer): + self.after_iter(trainer) + + def after_val_iter(self, trainer): + self.after_iter(trainer) + + def every_n_epochs(self, trainer, n): + return (trainer.epoch + 1) % n == 0 if n > 0 else False + + def every_n_iters(self, trainer, n): + return (trainer.iter + 1) % n == 0 if n > 0 else False + + def every_n_inner_iters(self, trainer, n): + return (trainer.inner_iter + 1) % n == 0 if n > 0 else False + + def end_of_epoch(self, trainer): + return trainer.inner_iter + 1 == len(trainer.data_loader) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/iter_timer.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/iter_timer.py new file mode 100644 index 0000000000000000000000000000000000000000..0b951603e14f84d76f0aa04033fdc049bb55103f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/hooks/iter_timer.py @@ -0,0 +1,24 @@ +import time + +from .hook import Hook + + +class IterTimerHook(Hook): + def before_epoch(self, runner): + self.t = time.time() + + def before_iter(self, runner): + runner.log_buffer.update({"data_time": time.time() - self.t}) + + def after_iter(self, runner): + runner.log_buffer.update({"time": time.time() - self.t}) + self.t = time.time() + + def after_data_to_device(self, runner): + 
class LoggerHook(Hook, metaclass=ABCMeta):
    """Base class for logger hooks.

    Decides when the trainer's log buffer is averaged and hands the result to
    :meth:`log`, which every concrete logger must implement.

    Args:
        interval (int): Number of inner (per-epoch) training iterations
            between two log events.
        ignore_last (bool): If True, the iterations left over at the end of
            an epoch are not averaged by ``after_train_iter``.
        reset_flag (bool): If True, this hook clears the buffer output after
            it has logged it.
    """

    # NOTE: the metaclass is passed in the class statement; a
    # ``__metaclass__`` attribute is ignored by Python 3, which left the
    # ``@abstractmethod`` below unenforced.

    def __init__(self, interval=10, ignore_last=True, reset_flag=False):
        self.interval = interval
        self.ignore_last = ignore_last
        self.reset_flag = reset_flag

    @abstractmethod
    def log(self, trainer):
        """Emit the values held in ``trainer.log_buffer.output``."""

    def before_run(self, trainer):
        """Make the last registered logger hook responsible for clearing output."""
        # Walk the hook list backwards and stop at the first logger hook found,
        # i.e. the one that runs last when hooks are called in list order.
        for hook in trainer.hooks[::-1]:
            if isinstance(hook, LoggerHook):
                hook.reset_flag = True
                break

    def before_epoch(self, trainer):
        """Drop all history and output accumulated in the previous epoch."""
        trainer.log_buffer.clear()

    def after_train_iter(self, trainer):
        """Average and log every ``interval`` inner iterations."""
        if self.every_n_inner_iters(trainer, self.interval):
            trainer.log_buffer.average(self.interval)
        elif self.end_of_epoch(trainer) and not self.ignore_last:
            # not precise but more stable
            trainer.log_buffer.average(self.interval)

        if trainer.log_buffer.ready:
            self.log(trainer)
            if self.reset_flag:
                trainer.log_buffer.clear_output()

    def after_train_epoch(self, trainer):
        """Log whatever averaged output is still pending at the end of an epoch."""
        if trainer.log_buffer.ready:
            self.log(trainer)
            if self.reset_flag:
                trainer.log_buffer.clear_output()

    def after_val_epoch(self, trainer):
        """Average the whole validation history and log it."""
        trainer.log_buffer.average()
        self.log(trainer)
        if self.reset_flag:
            trainer.log_buffer.clear_output()
def connect(self, model_name, work_dir=None, info=dict(), timeout=5, logger=None):
    """Register this run with the Pavi service and start the upload thread.

    Args:
        model_name (str): Name reported to the service as ``model``.
        work_dir (str, optional): Working directory; sent as an absolute path,
            or as an empty string when not given.
        info (dict): Optional ``session_file`` / ``session_text`` /
            ``model_text`` entries; missing keys are sent as empty strings.
        timeout (int or float): Timeout passed to ``requests.post``.
        logger (logging.Logger, optional): Replaces ``self.logger`` if given.

    Returns:
        bool: True when the service answered with status code 200, else False.
    """
    if logger is not None:
        self.logger = logger
    self._print_log("connecting pavi service {}...".format(self.url))

    payload = {
        "time": str(datetime.now()),
        "username": self.username,
        "password": self.password,
        "instance_id": self.instance_id,
        "model": model_name,
        "work_dir": osp.abspath(work_dir) if work_dir else "",
        "session_file": info.get("session_file", ""),
        "session_text": info.get("session_text", ""),
        "model_text": info.get("model_text", ""),
        "device": get_host_info(),
    }

    try:
        reply = requests.post(self.url, json=payload, timeout=timeout)
    except Exception as err:
        self._print_log(
            "fail to connect to pavi service: {}".format(err), level=logging.ERROR
        )
        return False

    if reply.status_code != 200:
        self._print_log(
            "fail to connect to pavi service, status code: "
            "{}, err message: {}".format(reply.status_code, reply.reason),
            level=logging.ERROR,
        )
        return False

    # The service replies with the instance id to use for later uploads.
    self.instance_id = reply.text
    self._print_log(
        "pavi service connected, instance_id: {}".format(self.instance_id)
    )
    self.log_queue = Queue()
    self.log_thread = Thread(target=self.post_worker_fn)
    self.log_thread.daemon = True
    self.log_thread.start()
    return True
class PaviLoggerHook(LoggerHook):
    """Logger hook that uploads averaged training values to a Pavi service.

    Args:
        url (str): Service endpoint, forwarded to :class:`PaviClient`.
        username (str, optional): Forwarded to :class:`PaviClient`.
        password (str, optional): Forwarded to :class:`PaviClient`.
        instance_id (str, optional): Forwarded to :class:`PaviClient`.
        config_file (str, optional): Path of a config file whose text is sent
            along when connecting.
        interval (int): See :class:`LoggerHook`.
        ignore_last (bool): See :class:`LoggerHook`.
        reset_flag (bool): See :class:`LoggerHook`.
    """

    def __init__(
        self,
        url,
        username=None,
        password=None,
        instance_id=None,
        config_file=None,
        interval=10,
        ignore_last=True,
        reset_flag=True,
    ):
        self.pavi = PaviClient(url, username, password, instance_id)
        self.config_file = config_file
        super(PaviLoggerHook, self).__init__(interval, ignore_last, reset_flag)

    def before_run(self, runner):
        """Run the base-class setup, then connect to the service."""
        super(PaviLoggerHook, self).before_run(runner)
        self.connect(runner)

    @master_only
    def connect(self, runner, timeout=5):
        """Connect the client, attaching the config file text when configured."""
        cfg_info = dict()
        if self.config_file is not None:
            with open(self.config_file, "r") as f:
                config_text = f.read()
            cfg_info.update(session_file=self.config_file, session_text=config_text)
        return self.pavi.connect(
            runner.model_name, runner.work_dir, cfg_info, timeout, runner.logger
        )

    @master_only
    def log(self, runner):
        """Queue the current buffer output, minus timings and string values."""
        log_outs = runner.log_buffer.output.copy()
        log_outs.pop("time", None)
        log_outs.pop("data_time", None)
        # Collect the keys first: removing entries while iterating over
        # ``items()`` raises RuntimeError.
        text_keys = [k for k, v in log_outs.items() if isinstance(v, str)]
        for k in text_keys:
            log_outs.pop(k)
        self.pavi.log(runner.mode, runner.iter + 1, log_outs)
class TensorboardLoggerHook(LoggerHook):
    """Logger hook that writes the averaged log buffer to a ``SummaryWriter``.

    Args:
        log_dir (str, optional): Output directory. When None, ``tf_logs``
            inside the trainer's ``work_dir`` is used.
        interval (int): See :class:`LoggerHook`.
        ignore_last (bool): See :class:`LoggerHook`.
        reset_flag (bool): See :class:`LoggerHook`.
    """

    def __init__(self, log_dir=None, interval=10, ignore_last=True, reset_flag=True):
        super(TensorboardLoggerHook, self).__init__(interval, ignore_last, reset_flag)
        self.log_dir = log_dir

    @master_only
    def before_run(self, trainer):
        """Import a ``SummaryWriter`` implementation and open the writer."""
        builtin_writer = torch.__version__ >= "1.1"
        try:
            if builtin_writer:
                from torch.utils.tensorboard import SummaryWriter
            else:
                from tensorboardX import SummaryWriter
        except ImportError:
            if builtin_writer:
                raise ImportError(
                    'Please run "pip install future tensorboard" to install '
                    "the dependencies to use torch.utils.tensorboard "
                    "(applicable to PyTorch 1.1 or higher)"
                )
            raise ImportError(
                "Please install tensorboardX to use " "TensorboardLoggerHook."
            )

        if self.log_dir is None:
            self.log_dir = osp.join(trainer.work_dir, "tf_logs")
        self.writer = SummaryWriter(self.log_dir)

    @master_only
    def log(self, trainer):
        """Write every buffered value except the timing entries."""
        timing_keys = ["time", "data_time"]
        for var in trainer.log_buffer.output:
            if var in timing_keys:
                continue
            tag = "{}/{}".format(var, trainer.mode)
            record = trainer.log_buffer.output[var]
            # Strings go to the text tab, everything else is written as a scalar.
            if isinstance(record, str):
                self.writer.add_text(tag, record, trainer.iter)
            else:
                self.writer.add_scalar(
                    tag, trainer.log_buffer.output[var], trainer.iter
                )

    @master_only
    def after_run(self, trainer):
        """Close the writer opened in ``before_run``."""
        self.writer.close()
def _log_info(self, log_dict, trainer):
    """Write one human-readable log entry through ``trainer.logger``.

    First a header line (epoch, iteration, lr and, in train mode, ETA and
    timings) is logged, then one line per task listing the remaining entries
    of ``log_dict``.

    Args:
        log_dict (dict): Values assembled by :meth:`log`.
        trainer: Trainer providing ``mode``, ``logger``, ``model``,
            ``data_loader`` and the iteration counters.
    """
    if trainer.mode == "train":
        log_str = "Epoch [{}/{}][{}/{}]\tlr: {:.5f}, ".format(
            log_dict["epoch"],
            trainer._max_epochs,
            log_dict["iter"],
            len(trainer.data_loader),
            log_dict["lr"],
        )
        if "time" in log_dict:
            self.time_sec_tot += log_dict["time"] * self.interval
            time_sec_avg = self.time_sec_tot / (trainer.iter - self.start_iter + 1)
            eta_sec = time_sec_avg * (trainer.max_iters - trainer.iter - 1)
            eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
            log_str += "eta: {}, ".format(eta_str)
            # The per-stage timers are cumulative since the start of the
            # iteration, so consecutive differences give each stage's share.
            log_str += "time: {:.3f}, data_time: {:.3f}, transfer_time: {:.3f}, forward_time: {:.3f}, loss_parse_time: {:.3f} ".format(
                log_dict["time"],
                log_dict["data_time"],
                log_dict["transfer_time"] - log_dict["data_time"],
                log_dict["forward_time"] - log_dict["transfer_time"],
                log_dict["loss_parse_time"] - log_dict["forward_time"],
            )
            # ``memory`` is only recorded when CUDA is available; indexing it
            # unconditionally raised KeyError on CPU-only runs.
            if "memory" in log_dict:
                log_str += "memory: {}, ".format(log_dict["memory"])
    else:
        log_str = "Epoch({}) [{}][{}]\t".format(
            log_dict["mode"], log_dict["epoch"] - 1, log_dict["iter"]
        )

    trainer.logger.info(log_str)

    if trainer.world_size > 1:
        class_names = trainer.model.module.bbox_head.class_names
    else:
        class_names = trainer.model.bbox_head.class_names

    # Entries already covered by the header line above.
    header_keys = (
        "mode",
        "Epoch",
        "iter",
        "lr",
        "time",
        "data_time",
        "memory",
        "epoch",
        "transfer_time",
        "forward_time",
        "loss_parse_time",
    )

    last_idx = len(class_names) - 1
    for idx, task_class_names in enumerate(class_names):
        log_items = [f"task : {task_class_names}"]
        log_str = ""
        for name, val in log_dict.items():
            if name in header_keys:
                continue

            if isinstance(val, float):
                val = "{:.4f}".format(val)

            if isinstance(val, list):
                # List values hold one entry per task; pick this task's entry.
                log_items.append(
                    "{}: {}".format(name, self._convert_to_precision4(val[idx]))
                )
            else:
                log_items.append("{}: {}".format(name, val))

        log_str += ", ".join(log_items)
        if idx == last_idx:
            log_str += "\n"
        trainer.logger.info(log_str)
def get_warmup_lr(self, cur_iters):
    """Return the warm-up learning rates for iteration ``cur_iters``.

    Every entry of ``self.regular_lr`` is scaled by a factor that depends on
    ``self.warmup``:

    * ``"constant"``: ``warmup_ratio``
    * ``"linear"``: rises linearly from ``warmup_ratio`` to 1 over
      ``warmup_iters`` iterations
    * ``"exp"``: ``warmup_ratio ** (1 - cur_iters / warmup_iters)``

    Args:
        cur_iters (int): Current global iteration.

    Returns:
        list[float]: One learning rate per entry of ``self.regular_lr``.

    Raises:
        ValueError: If ``self.warmup`` is not one of the supported modes.
    """
    if self.warmup == "constant":
        scale = self.warmup_ratio
    elif self.warmup == "linear":
        k = (1 - cur_iters / self.warmup_iters) * (1 - self.warmup_ratio)
        scale = 1 - k
    elif self.warmup == "exp":
        scale = self.warmup_ratio ** (1 - cur_iters / self.warmup_iters)
    else:
        # Previously this fell through to an UnboundLocalError on return.
        raise ValueError(
            '"{}" is not a supported type for warming up'.format(self.warmup)
        )
    return [_lr * scale for _lr in self.regular_lr]
class StepLrUpdaterHook(LrUpdaterHook):
    """Multiply the learning rate by ``gamma`` at fixed steps.

    Args:
        step (int or list[int]): A positive period, or a list of positive
            milestones, counted in epochs or iterations depending on
            ``by_epoch``.
        gamma (float): Multiplicative decay factor.
        **kwargs: Forwarded to :class:`LrUpdaterHook`.
    """

    def __init__(self, step, gamma=0.1, **kwargs):
        assert isinstance(step, (list, int))
        if isinstance(step, list):
            for s in step:
                assert isinstance(s, int) and s > 0
        elif isinstance(step, int):
            assert step > 0
        else:
            raise TypeError('"step" must be a list or integer')
        self.step = step
        self.gamma = gamma
        super(StepLrUpdaterHook, self).__init__(**kwargs)

    def get_lr(self, runner, base_lr):
        """Return ``base_lr`` decayed once per step already passed."""
        # The parameter is named ``runner``; the iteration branch used to
        # read the undefined name ``trainer`` and raised NameError.
        progress = runner.epoch if self.by_epoch else runner.iter

        if isinstance(self.step, int):
            return base_lr * (self.gamma ** (progress // self.step))

        # Number of milestones already reached.
        exp = len(self.step)
        for i, s in enumerate(self.step):
            if progress < s:
                exp = i
                break

        return base_lr * self.gamma ** exp


class ExpLrUpdaterHook(LrUpdaterHook):
    """Decay the learning rate exponentially: ``base_lr * gamma ** progress``.

    Args:
        gamma (float): Multiplicative decay factor per epoch or iteration.
        **kwargs: Forwarded to :class:`LrUpdaterHook`.
    """

    def __init__(self, gamma, **kwargs):
        self.gamma = gamma
        super(ExpLrUpdaterHook, self).__init__(**kwargs)

    def get_lr(self, runner, base_lr):
        """Return ``base_lr`` decayed by ``gamma`` once per unit of progress."""
        # Same fix as above: use the ``runner`` argument, not ``trainer``.
        progress = runner.epoch if self.by_epoch else runner.iter
        return base_lr * self.gamma ** progress
class CosineLrUpdaterHook(LrUpdaterHook):
    """Anneal the learning rate from ``base_lr`` to ``target_lr`` along a half cosine.

    Args:
        target_lr (float): Learning rate reached when progress equals its
            maximum.
        **kwargs: Forwarded to :class:`LrUpdaterHook`.
    """

    def __init__(self, target_lr=0, **kwargs):
        self.target_lr = target_lr
        super(CosineLrUpdaterHook, self).__init__(**kwargs)

    def get_lr(self, trainer, base_lr):
        """Return the annealed learning rate for the trainer's current progress."""
        # Progress is counted in epochs or in iterations, depending on the hook.
        if self.by_epoch:
            done, total = trainer.epoch, trainer.max_epochs
        else:
            done, total = trainer.iter, trainer.max_iters

        cosine_term = 1 + cos(pi * (done / total))
        return self.target_lr + 0.5 * (base_lr - self.target_lr) * cosine_term
class OptimizerHook(Hook):
    """Run the backward pass and the optimizer step after each training iteration.

    Args:
        grad_clip (dict, optional): Keyword arguments for
            ``clip_grad_norm_``. When None, gradients are not clipped.
    """

    def __init__(self, grad_clip=None):
        self.grad_clip = grad_clip

    def clip_grads(self, params):
        """Clip the gradient norm of the parameters that require gradients."""
        trainable = filter(lambda param: param.requires_grad, params)
        clip_grad.clip_grad_norm_(trainable, **self.grad_clip)

    def after_train_iter(self, trainer):
        """Zero the gradients, backpropagate the loss, clip if configured, then step."""
        optimizer = trainer.optimizer
        optimizer.zero_grad()
        trainer.outputs["loss"].backward()
        if self.grad_clip is not None:
            self.clip_grads(trainer.model.parameters())
        optimizer.step()
def parallel_test(
    model_cls, model_kwargs, checkpoint, dataset, data_func, gpus, workers_per_gpu=1
):
    """Parallel testing on multiple GPUs.

    Spawns ``len(gpus) * workers_per_gpu`` worker processes running
    ``worker_func``, feeds them every dataset index and gathers their outputs.

    Args:
        model_cls (type): Model class type.
        model_kwargs (dict): Arguments to init the model.
        checkpoint (str): Checkpoint filepath.
        dataset (:obj:`Dataset`): The dataset to be tested.
        data_func (callable): The function that generates model inputs.
        gpus (list[int]): GPU ids to be used.
        workers_per_gpu (int): Number of processes on each GPU. It is possible
            to run multiple workers on each GPU.

    Returns:
        list: Test results, ordered by dataset index.
    """
    ctx = multiprocessing.get_context("spawn")
    idx_queue = ctx.Queue()
    result_queue = ctx.Queue()
    num_workers = len(gpus) * workers_per_gpu
    workers = [
        ctx.Process(
            target=worker_func,
            args=(
                model_cls,
                model_kwargs,
                checkpoint,
                dataset,
                data_func,
                gpus[i % len(gpus)],  # spread workers round-robin over the GPUs
                idx_queue,
                result_queue,
            ),
        )
        for i in range(num_workers)
    ]
    for w in workers:
        w.daemon = True
        w.start()

    num_samples = len(dataset)
    for i in range(num_samples):
        idx_queue.put(i)

    results = [None for _ in range(num_samples)]
    try:
        # Workers finish in arbitrary order; each result carries its index.
        for _ in range(num_samples):
            idx, res = result_queue.get()
            results[idx] = res
    finally:
        # The workers loop forever waiting for indices, so stop them explicitly.
        for w in workers:
            w.terminate()

    return results
+ + Args: + priority (int or str or :obj:`Priority`): Priority. + + Returns: + int: The priority value. + """ + if isinstance(priority, int): + if priority < 0 or priority > 100: + raise ValueError("priority must be between 0 and 100") + return priority + elif isinstance(priority, Priority): + return priority.value + elif isinstance(priority, str): + return Priority[priority.upper()].value + else: + raise TypeError("priority must be an integer or Priority enum value") diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/trainer.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..18bfc9679025fee7bd558527574bcd24ac317cee --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/trainer.py @@ -0,0 +1,594 @@ +import logging +import os.path as osp +import queue +import sys +import threading +import time +from collections import OrderedDict + +import torch +from det3d import torchie + +from . 
def example_to_device(example, device, non_blocking=False) -> dict:
    """Return a shallow copy of ``example`` with its tensors moved to ``device``.

    Entries are handled by key: some keys hold lists of tensors, some hold a
    single tensor, ``"calib"`` holds a dict of tensors, and every other entry
    is passed through unchanged.

    Args:
        example (dict): One collated batch.
        device: Target passed to ``Tensor.to``.
        non_blocking (bool): Forwarded to ``Tensor.to``.

    Returns:
        dict: New dict with the same keys as ``example``.
    """
    list_keys = (
        "anchors", "anchors_mask", "reg_targets", "reg_weights", "labels", "hm",
        "anno_box", "ind", "mask", "cat", "points",
    )
    tensor_keys = (
        "voxels",
        "bev_map",
        "coordinates",
        "num_points",
        "num_voxels",
        "cyv_voxels",
        "cyv_num_voxels",
        "cyv_coordinates",
        "cyv_num_points",
        "gt_boxes_and_cls",
    )

    def _move(tensor):
        return tensor.to(device, non_blocking=non_blocking)

    moved = {}
    for key, value in example.items():
        if key in list_keys:
            moved[key] = [_move(item) for item in value]
        elif key in tensor_keys:
            moved[key] = _move(value)
        elif key == "calib":
            moved[key] = {name: _move(entry) for name, entry in value.items()}
        else:
            moved[key] = value

    return moved
3 compatibility + def __next__(self): + return self.next() + + def __iter__(self): + return self + + +class Prefetcher(object): + def __init__(self, dataloader): + self.loader = iter(dataloader) + self.stream = torch.cuda.Stream() + self.preload() + + def preload(self): + try: + self.next_input = next(self.loader) + except StopIteration: + self.next_input = None + return + with torch.cuda.stream(self.stream): + self.next_input = example_to_device( + self.next_input, torch.cuda.current_device(), non_blocking=False + ) + + def next(self): + torch.cuda.current_stream().wait_stream(self.stream) + input = self.next_input + self.preload() + return input + + +class Trainer(object): + """ A training helper for PyTorch + + Args: + model: + batch_processor: + optimizer: + workdir: + log_level: + logger: + """ + + def __init__( + self, + model, + batch_processor, + optimizer=None, + lr_scheduler=None, + work_dir=None, + log_level=logging.INFO, + logger=None, + **kwargs, + ): + assert callable(batch_processor) + self.model = model + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + + self.batch_processor = batch_processor + + # Create work_dir + if torchie.is_str(work_dir): + self.work_dir = osp.abspath(work_dir) + torchie.mkdir_or_exist(self.work_dir) + elif work_dir is None: + self.work_dir = None + else: + raise TypeError("'work_dir' must be a str or None") + + # Get model name from the model class + if hasattr(self.model, "module"): + self._model_name = self.model.module.__class__.__name__ + else: + self._model_name = self.model.__class__.__name__ + + self._rank, self._world_size = get_dist_info() + self.timestamp = get_time_str() + if logger is None: + self.logger = self.init_logger(work_dir, log_level) + else: + self.logger = logger + self.log_buffer = LogBuffer() + + self.mode = None + self._hooks = [] + self._epoch = 0 + self._iter = 0 + self._inner_iter = 0 + self._max_epochs = 0 + self._max_iters = 0 + + @property + def model_name(self): + """str: Name 
def init_logger(self, log_dir=None, level=logging.INFO):
    """Init the logger.

    Configures the root logger through ``logging.basicConfig`` and, on rank 0
    when ``log_dir`` is given, also attaches a file handler writing to
    ``<log_dir>/<timestamp>.log``.

    Args:
        log_dir (str, optional): Directory for the log file. No file handler
            is added when this is None or empty.
        level (int or str): Logging level for the root logger and the file
            handler.

    Returns:
        :obj:`~logging.Logger`: Python logger.
    """
    # The original format string contained "% (message)s"; the stray space
    # makes %-formatting fail with ValueError on every record.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s", level=level
    )
    logger = logging.getLogger(__name__)
    if log_dir and self.rank == 0:
        filename = "{}.log".format(self.timestamp)
        log_file = osp.join(log_dir, filename)
        self._add_file_handler(logger, log_file, level=level)
    return logger
else: + meta.update(epoch=self.epoch + 1, iter=self.iter) + + filename = filename_tmpl.format(self.epoch + 1) + filepath = osp.join(out_dir, filename) + linkpath = osp.join(out_dir, "latest.pth") + optimizer = self.optimizer if save_optimizer else None + save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta) + # Use relative symlink + torchie.symlink(filename, linkpath) + + def batch_processor_inline(self, model, data, train_mode, **kwargs): + + if "local_rank" in kwargs: + device = torch.device(kwargs["local_rank"]) + else: + device = None + + # data = example_convert_to_torch(data, device=device) + example = example_to_device( + data, torch.cuda.current_device(), non_blocking=False + ) + + self.call_hook("after_data_to_device") + + if train_mode: + losses = model(example, return_loss=True) + self.call_hook("after_forward") + loss, log_vars = parse_second_losses(losses) + del losses + + outputs = dict( + loss=loss, log_vars=log_vars, num_samples=-1 # TODO: FIX THIS + ) + self.call_hook("after_parse_loss") + + return outputs + else: + return model(example, return_loss=False) + + def train(self, data_loader, epoch, **kwargs): + + self.model.train() + self.mode = "train" + self.data_loader = data_loader + self.length = len(data_loader) + self._max_iters = self._max_epochs * self.length + self.call_hook("before_train_epoch") + + base_step = epoch * self.length + + # prefetcher = Prefetcher(data_loader) + # for data_batch in BackgroundGenerator(data_loader, max_prefetch=3): + for i, data_batch in enumerate(data_loader): + global_step = base_step + i + if self.lr_scheduler is not None: + #print(global_step) + self.lr_scheduler.step(global_step) + + self._inner_iter = i + + self.call_hook("before_train_iter") + + # outputs = self.batch_processor(self.model, + # data_batch, + # train_mode=True, + # **kwargs) + outputs = self.batch_processor_inline( + self.model, data_batch, train_mode=True, **kwargs + ) + + if not isinstance(outputs, dict): + raise 
TypeError("batch_processor() must return a dict") + if "log_vars" in outputs: + self.log_buffer.update(outputs["log_vars"], outputs["num_samples"]) + self.outputs = outputs + self.call_hook("after_train_iter") + self._iter += 1 + + self.call_hook("after_train_epoch") + self._epoch += 1 + + def val(self, data_loader, **kwargs): + self.model.eval() + self.mode = "val" + self.data_loader = data_loader + self.call_hook("before_val_epoch") + + self.logger.info(f"work dir: {self.work_dir}") + + if self.rank == 0: + prog_bar = torchie.ProgressBar(len(data_loader.dataset)) + + detections = {} + cpu_device = torch.device("cpu") + + for i, data_batch in enumerate(data_loader): + self._inner_iter = i + self.call_hook("before_val_iter") + with torch.no_grad(): + outputs = self.batch_processor( + self.model, data_batch, train_mode=False, **kwargs + ) + for output in outputs: + token = output["metadata"]["token"] + for k, v in output.items(): + if k not in [ + "metadata", + ]: + output[k] = v.to(cpu_device) + detections.update( + {token: output,} + ) + if self.rank == 0: + for _ in range(self.world_size): + prog_bar.update() + + synchronize() + + all_predictions = all_gather(detections) + + if self.rank != 0: + return + + predictions = {} + for p in all_predictions: + predictions.update(p) + + # torch.save(predictions, "final_predictions_debug.pkl") + # TODO fix evaluation module + result_dict, _ = self.data_loader.dataset.evaluation( + predictions, output_dir=self.work_dir + ) + + self.logger.info("\n") + for k, v in result_dict["results"].items(): + self.logger.info(f"Evaluation {k}: {v}") + + self.call_hook("after_val_epoch") + + def resume(self, checkpoint, resume_optimizer=True, map_location="default"): + if map_location == "default": + checkpoint = self.load_checkpoint( + checkpoint , map_location='cuda:{}'.format(torch.cuda.current_device()) # TODO: FIX THIS!! 
+ ) + else: + checkpoint = self.load_checkpoint(checkpoint, map_location=map_location) + + self._epoch = checkpoint["meta"]["epoch"] + self._iter = checkpoint["meta"]["iter"] + if "optimizer" in checkpoint and resume_optimizer: + self.optimizer.load_state_dict(checkpoint["optimizer"]) + + self.logger.info("resumed epoch %d, iter %d", self.epoch, self.iter) + + def run(self, data_loaders, workflow, max_epochs, **kwargs): + """ Start running. + + Args: + data_loaders (list[:obj:`DataLoader`]) + workflow (list[tuple]): A list of (phase, epochs) to specify the + running order and epochs. + max_epochs (int) + """ + assert isinstance(data_loaders, list) + assert torchie.is_list_of(workflow, tuple) + assert len(data_loaders) == len(workflow) + + self._max_epochs = max_epochs + work_dir = self.work_dir if self.work_dir is not None else "NONE" + self.logger.info( + "Start running, host: %s, work_dir: %s", get_host_info(), work_dir + ) + self.logger.info("workflow: %s, max: %d epochs", workflow, max_epochs) + self.call_hook("before_run") + + while self.epoch < max_epochs: + for i, flow in enumerate(workflow): + mode, epochs = flow + if isinstance(mode, str): + if not hasattr(self, mode): + raise ValueError( + "Trainer has no method named '{}' to run an epoch".format( + mode + ) + ) + epoch_runner = getattr(self, mode) + elif callable(mode): + epoch_runner = mode + else: + raise TypeError( + "mode in workflow must be a str or " + "callable function not '{}'".format(type(mode)) + ) + + for _ in range(epochs): + if mode == "train" and self.epoch >= max_epochs: + return + elif mode == "val": + epoch_runner(data_loaders[i], **kwargs) + else: + epoch_runner(data_loaders[i], self.epoch, **kwargs) + + # time.sleep(1) + self.call_hook("after_run") + + def register_lr_hooks(self, lr_config): + if isinstance(lr_config, LrUpdaterHook): + self.register_hook(lr_config) + elif isinstance(lr_config, dict): + assert "policy" in lr_config + hook_name = lr_config["policy"].title() + 
"LrUpdaterHook" + if not hasattr(lr_updater, hook_name): + raise ValueError('"{}" does not exist'.format(hook_name)) + hook_cls = getattr(lr_updater, hook_name) + self.register_hook(hook_cls(**lr_config)) + else: + raise TypeError( + "'lr_config' must be eigher a LrUpdaterHook object" + " or dict, not '{}'".format(type(lr_config)) + ) + + def register_logger_hooks(self, log_config): + log_interval = log_config["interval"] + for info in log_config["hooks"]: + logger_hook = obj_from_dict( + info, hooks, default_args=dict(interval=log_interval) + ) + self.register_hook(logger_hook, priority="VERY_LOW") + + def register_training_hooks( + self, lr_config, optimizer_config=None, checkpoint_config=None, log_config=None + ): + """Register default hooks for training. + + Default hooks include: + - LrUpdaterHook + - OptimizerStepperHook + - CheckpointSaverHook + - IterTimerHook + - LoggerHook(s) + """ + if optimizer_config is None: + optimizer_config = {} + if checkpoint_config is None: + checkpoint_config = {} + if lr_config is not None: + assert self.lr_scheduler is None + self.register_lr_hooks(lr_config) + self.register_hook(self.build_hook(optimizer_config, OptimizerHook)) + self.register_hook(self.build_hook(checkpoint_config, CheckpointHook)) + self.register_hook(IterTimerHook()) + if log_config is not None: + self.register_logger_hooks(log_config) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/utils.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e0958900c67907ee66a0baa65c9a1c010dc4ef38 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/trainer/utils.py @@ -0,0 +1,183 @@ +""" +This file contains primitives for multi-gpu communication. +This is useful when doing distributed training. 
+""" + +import functools +import pickle +import sys +import time +from getpass import getuser +from socket import gethostname + +import torch +import torch.distributed as dist +from det3d import torchie + + +def get_host_info(): + return "{}@{}".format(getuser(), gethostname()) + + +def get_dist_info(): + if torch.__version__ < "1.0": + initialized = dist._initialized + else: + initialized = dist.is_initialized() + if initialized: + rank = dist.get_rank() + world_size = dist.get_world_size() + else: + rank = 0 + world_size = 1 + return rank, world_size + + +def master_only(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + rank, _ = get_dist_info() + if rank == 0: + return func(*args, **kwargs) + + return wrapper + + +def get_time_str(): + return time.strftime("%Y%m%d_%H%M%S", time.localtime()) + + +def obj_from_dict(info, parent=None, default_args=None): + """Initialize an object from dict. + + The dict must contain the key "type", which indicates the object type + + Args: + info (dict): Object types and arguments + parent (:class:`modules`): + default_args (dict, optional): + """ + assert isinstance(info, dict) and "type" in info + assert isinstance(default_args, dict) or default_args is None + args = info.copy() + obj_type = args.pop("type") + if torchie.is_str(obj_type): + if parent is not None: + obj_type = getattr(parent, obj_type) + else: + obj_type = sys.modules[obj_type] + elif not isinstance(obj_type, type): + raise TypeError( + "type must be a str or valid type, but got {}".format(type(obj_type)) + ) + if default_args is not None: + for name, value in default_args.items(): + args.setdefault(name, value) + return obj_type(**args) + + +def get_world_size(): + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return 
get_rank() == 0 + + +def synchronize(): + """ + Helper function to synchronize (barrier) among all processes when + using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to("cuda") + + # obtain Tensor size of each rank + local_size = torch.IntTensor([tensor.numel()]).to("cuda") + size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)] + dist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) + if local_size != max_size: + padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") + tensor = torch.cat((tensor, padding), dim=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def reduce_dict(input_dict, average=True): + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that process with rank + 0 has the averaged results. 
Returns a dict with the same fields as + input_dict, after reduction. + """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.reduce(values, dst=0) + if dist.get_rank() == 0 and average: + # only main process gets accumulated, so only divide by + # world_size in this case + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/__init__.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3375e4a5acd47847a97ae8c13fbd2fe842b97eac --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/__init__.py @@ -0,0 +1,62 @@ +from .config import Config, ConfigDict +from .misc import ( + check_prerequisites, + concat_list, + is_list_of, + is_seq_of, + is_str, + is_tuple_of, + iter_cast, + list_cast, + requires_executable, + requires_package, + slice_list, + tuple_cast, +) +from .path import ( + FileNotFoundError, + check_file_exist, + fopen, + is_filepath, + mkdir_or_exist, + scandir, + symlink, +) +from .progressbar import ( + ProgressBar, + track_iter_progress, + track_parallel_progress, + track_progress, +) +from .timer import Timer, TimerError, check_time + +__all__ = [ + "ConfigDict", + "Config", + "is_str", + "iter_cast", + "list_cast", + "tuple_cast", + "is_seq_of", + "is_list_of", + "is_tuple_of", + "slice_list", + "concat_list", + "check_prerequisites", + "requires_package", + "requires_executable", + "is_filepath", + "fopen", + "check_file_exist", + "mkdir_or_exist", + "symlink", + "scandir", + "FileNotFoundError", + "ProgressBar", + "track_progress", + "track_iter_progress", + 
"track_parallel_progress", + "Timer", + "TimerError", + "check_time", +] diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/config.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/config.py new file mode 100644 index 0000000000000000000000000000000000000000..97c6ef22cc3b0d9dc8d6efe9f30cec1c6cd718a8 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/config.py @@ -0,0 +1,162 @@ +import os.path as osp +import sys +from argparse import ArgumentParser +from importlib import import_module + +from addict import Dict + +from .misc import collections_abc +from .path import check_file_exist + + +class ConfigDict(Dict): + def __missing__(self, name): + raise KeyError(name) + + def __getattr__(self, name): + try: + value = super(ConfigDict, self).__getattr__(name) + except KeyError: + ex = AttributeError( + "'{}' object has no attribute '{}'".format( + self.__class__.__name__, name + ) + ) + except Exception as e: + ex = e + else: + return value + raise ex + + +def add_args(parser, cfg, prefix=""): + for k, v in cfg.items(): + if isinstance(v, str): + parser.add_argument("--" + prefix + k) + elif isinstance(v, int): + parser.add_argument("--" + prefix + k, type=int) + elif isinstance(v, float): + parser.add_argument("--" + prefix + k, type=float) + elif isinstance(v, bool): + parser.add_argument("--" + prefix + k, action="store_true") + elif isinstance(v, dict): + add_args(parser, v, k + ".") + elif isinstance(v, collections_abc.Iterable): + parser.add_argument("--" + prefix + k, type=type(v[0]), nargs="+") + else: + print("connot parse key {} of type {}".format(prefix + k, type(v))) + return parser + + +class Config(object): + """A facility for config and config files. + + It supports common file formats as configs: python/json/yaml. The interface + is the same as a dict object and also allows access config values as + attributes. 
+ + Example: + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> cfg.a + 1 + >>> cfg.b + {'b1': [0, 1]} + >>> cfg.b.b1 + [0, 1] + >>> cfg = Config.fromfile('tests/data/config/a.py') + >>> cfg.filename + "/home/kchen/projects/torchie/tests/data/config/a.py" + >>> cfg.item4 + 'test' + >>> cfg + "Config [path: /home/kchen/projects/torchie/tests/data/config/a.py]: " + "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}" + + """ + + @staticmethod + def fromfile(filename): + filename = osp.abspath(osp.expanduser(filename)) + check_file_exist(filename) + if filename.endswith(".py"): + module_name = osp.basename(filename)[:-3] + if "." in module_name: + raise ValueError("Dots are not allowed in config file path.") + config_dir = osp.dirname(filename) + sys.path.insert(0, config_dir) + mod = import_module(module_name) + sys.path.pop(0) + cfg_dict = { + name: value + for name, value in mod.__dict__.items() + if not name.startswith("__") + } + elif filename.endswith((".yml", ".yaml", ".json")): + import torchie + + cfg_dict = torchie.load(filename) + else: + raise IOError("Only py/yml/yaml/json type are supported now!") + return Config(cfg_dict, filename=filename) + + @staticmethod + def auto_argparser(description=None): + """Generate argparser from config file automatically (experimental) + """ + partial_parser = ArgumentParser(description=description) + partial_parser.add_argument("config", help="config file path") + cfg_file = partial_parser.parse_known_args()[0].config + cfg = Config.fromfile(cfg_file) + parser = ArgumentParser(description=description) + parser.add_argument("config", help="config file path") + add_args(parser, cfg) + return parser, cfg + + def __init__(self, cfg_dict=None, filename=None): + if cfg_dict is None: + cfg_dict = dict() + elif not isinstance(cfg_dict, dict): + raise TypeError( + "cfg_dict must be a dict, but got {}".format(type(cfg_dict)) + ) + + super(Config, self).__setattr__("_cfg_dict", ConfigDict(cfg_dict)) + 
super(Config, self).__setattr__("_filename", filename) + if filename: + with open(filename, "r") as f: + super(Config, self).__setattr__("_text", f.read()) + else: + super(Config, self).__setattr__("_text", "") + + @property + def filename(self): + return self._filename + + @property + def text(self): + return self._text + + def __repr__(self): + return "Config (path: {}): {}".format(self.filename, self._cfg_dict.__repr__()) + + def __len__(self): + return len(self._cfg_dict) + + def __getattr__(self, name): + return getattr(self._cfg_dict, name) + + def __getitem__(self, name): + return self._cfg_dict.__getitem__(name) + + def __setattr__(self, name, value): + if isinstance(value, dict): + value = ConfigDict(value) + self._cfg_dict.__setattr__(name, value) + + def __setitem__(self, name, value): + if isinstance(value, dict): + value = ConfigDict(value) + self._cfg_dict.__setitem__(name, value) + + def __iter__(self): + return iter(self._cfg_dict) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/misc.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..5526dafaa5f12d4d2818620a431eafeac6a2d321 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/misc.py @@ -0,0 +1,221 @@ +import collections +import functools +import itertools +import subprocess +from importlib import import_module + +import six + +# ABCs from collections will be deprecated in python 3.8+, +# while collections.abc is not available in python 2.7 +try: + import collections.abc as collections_abc +except ImportError: + import collections as collections_abc + + +def is_str(x): + """Whether the input is an string instance.""" + return isinstance(x, six.string_types) + + +def iter_cast(inputs, dst_type, return_type=None): + """Cast elements of an iterable object into some type. + + Args: + inputs (Iterable): The input object. + dst_type (type): Destination type. 
+ return_type (type, optional): If specified, the output object will be + converted to this type, otherwise an iterator. + + Returns: + iterator or specified type: The converted object. + """ + if not isinstance(inputs, collections_abc.Iterable): + raise TypeError("inputs must be an iterable object") + if not isinstance(dst_type, type): + raise TypeError('"dst_type" must be a valid type') + + out_iterable = six.moves.map(dst_type, inputs) + + if return_type is None: + return out_iterable + else: + return return_type(out_iterable) + + +def list_cast(inputs, dst_type): + """Cast elements of an iterable object into a list of some type. + + A partial method of :func:`iter_cast`. + """ + return iter_cast(inputs, dst_type, return_type=list) + + +def tuple_cast(inputs, dst_type): + """Cast elements of an iterable object into a tuple of some type. + + A partial method of :func:`iter_cast`. + """ + return iter_cast(inputs, dst_type, return_type=tuple) + + +def is_seq_of(seq, expected_type, seq_type=None): + """Check whether it is a sequence of some type. + + Args: + seq (Sequence): The sequence to be checked. + expected_type (type): Expected type of sequence items. + seq_type (type, optional): Expected sequence type. + + Returns: + bool: Whether the sequence is valid. + """ + if seq_type is None: + exp_seq_type = collections_abc.Sequence + else: + assert isinstance(seq_type, type) + exp_seq_type = seq_type + if not isinstance(seq, exp_seq_type): + return False + for item in seq: + if not isinstance(item, expected_type): + return False + return True + + +def is_list_of(seq, expected_type): + """Check whether it is a list of some type. + + A partial method of :func:`is_seq_of`. + """ + return is_seq_of(seq, expected_type, seq_type=list) + + +def is_tuple_of(seq, expected_type): + """Check whether it is a tuple of some type. + + A partial method of :func:`is_seq_of`. 
+ """ + return is_seq_of(seq, expected_type, seq_type=tuple) + + +def slice_list(in_list, lens): + """Slice a list into several sub lists by a list of given length. + + Args: + in_list (list): The list to be sliced. + lens(int or list): The expected length of each out list. + + Returns: + list: A list of sliced list. + """ + if not isinstance(lens, list): + raise TypeError('"indices" must be a list of integers') + elif sum(lens) != len(in_list): + raise ValueError( + "sum of lens and list length does not match: {} != {}".format( + sum(lens), len(in_list) + ) + ) + out_list = [] + idx = 0 + for i in range(len(lens)): + out_list.append(in_list[idx : idx + lens[i]]) + idx += lens[i] + return out_list + + +def concat_list(in_list): + """Concatenate a list of list into a single list. + + Args: + in_list (list): The list of list to be merged. + + Returns: + list: The concatenated flat list. + """ + return list(itertools.chain(*in_list)) + + +def check_prerequisites( + prerequisites, + checker, + msg_tmpl='Prerequisites "{}" are required in method "{}" but not ' + "found, please install them first.", +): + """A decorator factory to check if prerequisites are satisfied. + + Args: + prerequisites (str of list[str]): Prerequisites to be checked. + checker (callable): The checker method that returns True if a + prerequisite is meet, False otherwise. + msg_tmpl (str): The message template with two variables. + + Returns: + decorator: A specific decorator. 
+ """ + + def wrap(func): + @functools.wraps(func) + def wrapped_func(*args, **kwargs): + requirements = ( + [prerequisites] if isinstance(prerequisites, str) else prerequisites + ) + missing = [] + for item in requirements: + if not checker(item): + missing.append(item) + if missing: + print(msg_tmpl.format(", ".join(missing), func.__name__)) + raise RuntimeError("Prerequisites not meet.") + else: + return func(*args, **kwargs) + + return wrapped_func + + return wrap + + +def _check_py_package(package): + try: + import_module(package) + except ImportError: + return False + else: + return True + + +def _check_executable(cmd): + if subprocess.call("which {}".format(cmd), shell=True) != 0: + return False + else: + return True + + +def requires_package(prerequisites): + """A decorator to check if some python packages are installed. + + Example: + >>> @requires_package('numpy') + >>> func(arg1, args): + >>> return numpy.zeros(1) + array([0.]) + >>> @requires_package(['numpy', 'non_package']) + >>> func(arg1, args): + >>> return numpy.zeros(1) + ImportError + """ + return check_prerequisites(prerequisites, checker=_check_py_package) + + +def requires_executable(prerequisites): + """A decorator to check if some executable files are installed. 
+ + Example: + >>> @requires_executable('ffmpeg') + >>> func(arg1, args): + >>> print(1) + 1 + """ + return check_prerequisites(prerequisites, checker=_check_executable) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/path.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/path.py new file mode 100644 index 0000000000000000000000000000000000000000..6722f483fa52c510ee9fba32a58b777d4cb4bc8f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/path.py @@ -0,0 +1,79 @@ +import os +import os.path as osp +import sys +from pathlib import Path + +import six + +from .misc import is_str + +if sys.version_info <= (3, 3): + FileNotFoundError = IOError +else: + FileNotFoundError = FileNotFoundError + + +def is_filepath(x): + if is_str(x) or isinstance(x, Path): + return True + else: + return False + + +def fopen(filepath, *args, **kwargs): + if is_str(filepath): + return open(filepath, *args, **kwargs) + elif isinstance(filepath, Path): + return filepath.open(*args, **kwargs) + + +def check_file_exist(filename, msg_tmpl='file "{}" does not exist'): + if not osp.isfile(filename): + raise FileNotFoundError(msg_tmpl.format(filename)) + + +def mkdir_or_exist(dir_name, mode=0o777): + if dir_name == "": + return + dir_name = osp.expanduser(dir_name) + if six.PY3: + os.makedirs(dir_name, mode=mode, exist_ok=True) + else: + if not osp.isdir(dir_name): + os.makedirs(dir_name, mode=mode) + + +def symlink(src, dst, overwrite=True, **kwargs): + if os.path.lexists(dst) and overwrite: + os.remove(dst) + os.symlink(src, dst, **kwargs) + + +def _scandir_py35(dir_path, suffix=None): + for entry in os.scandir(dir_path): + if not entry.is_file(): + continue + filename = entry.name + if suffix is None: + yield filename + elif filename.endswith(suffix): + yield filename + + +def _scandir_py(dir_path, suffix=None): + for filename in os.listdir(dir_path): + if not osp.isfile(osp.join(dir_path, filename)): + continue + if suffix is None: + yield 
filename + elif filename.endswith(suffix): + yield filename + + +def scandir(dir_path, suffix=None): + if suffix is not None and not isinstance(suffix, (str, tuple)): + raise TypeError('"suffix" must be a string or tuple of strings') + if sys.version_info >= (3, 5): + return _scandir_py35(dir_path, suffix) + else: + return _scandir_py(dir_path, suffix) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/progressbar.py b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/progressbar.py new file mode 100644 index 0000000000000000000000000000000000000000..a572449f48254b9ef6a7ec16e57df7bfb46bd6b7 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/torchie/utils/progressbar.py @@ -0,0 +1,216 @@ +import sys +from multiprocessing import Pool + +from .misc import collections_abc +from .timer import Timer + + +class ProgressBar(object): + """A progress bar which can print the progress""" + + def __init__(self, task_num=0, bar_width=50, start=True): + self.task_num = task_num + max_bar_width = self._get_max_bar_width() + self.bar_width = bar_width if bar_width <= max_bar_width else max_bar_width + self.completed = 0 + if start: + self.start() + + def _get_max_bar_width(self): + if sys.version_info > (3, 3): + from shutil import get_terminal_size + else: + from backports.shutil_get_terminal_size import get_terminal_size + terminal_width, _ = get_terminal_size() + max_bar_width = min(int(terminal_width * 0.6), terminal_width - 50) + if max_bar_width < 10: + print( + "terminal width is too small ({}), please consider " + "widen the terminal for better progressbar " + "visualization".format(terminal_width) + ) + max_bar_width = 10 + return max_bar_width + + def start(self): + if self.task_num > 0: + sys.stdout.write( + "[{}] 0/{}, elapsed: 0s, ETA:".format( + " " * self.bar_width, self.task_num + ) + ) + else: + sys.stdout.write("completed: 0, elapsed: 0s") + sys.stdout.flush() + self.timer = Timer() + + def update(self): + self.completed += 1 + 
elapsed = self.timer.since_start() + fps = self.completed / elapsed + if self.task_num > 0: + percentage = self.completed / float(self.task_num) + eta = int(elapsed * (1 - percentage) / percentage + 0.5) + mark_width = int(self.bar_width * percentage) + bar_chars = ">" * mark_width + " " * (self.bar_width - mark_width) + sys.stdout.write( + "\r[{}] {}/{}, {:.1f} task/s, elapsed: {}s, ETA: {:5}s".format( + bar_chars, + self.completed, + self.task_num, + fps, + int(elapsed + 0.5), + eta, + ) + ) + else: + sys.stdout.write( + "completed: {}, elapsed: {}s, {:.1f} tasks/s".format( + self.completed, int(elapsed + 0.5), fps + ) + ) + sys.stdout.flush() + + +def track_progress(func, tasks, bar_width=50, **kwargs): + """Track the progress of tasks execution with a progress bar. + + Tasks are done with a simple for-loop. + + Args: + func (callable): The function to be applied to each task. + tasks (list or tuple[Iterable, int]): A list of tasks or + (tasks, total num). + bar_width (int): Width of progress bar. + + Returns: + list: The task results. 
+ """ + if isinstance(tasks, tuple): + assert len(tasks) == 2 + assert isinstance(tasks[0], collections_abc.Iterable) + assert isinstance(tasks[1], int) + task_num = tasks[1] + tasks = tasks[0] + elif isinstance(tasks, collections_abc.Iterable): + task_num = len(tasks) + else: + raise TypeError('"tasks" must be an iterable object or a (iterator, int) tuple') + prog_bar = ProgressBar(task_num, bar_width) + results = [] + for task in tasks: + results.append(func(task, **kwargs)) + prog_bar.update() + sys.stdout.write("\n") + return results + + +def init_pool(process_num, initializer=None, initargs=None): + if initializer is None: + return Pool(process_num) + elif initargs is None: + return Pool(process_num, initializer) + else: + if not isinstance(initargs, tuple): + raise TypeError('"initargs" must be a tuple') + return Pool(process_num, initializer, initargs) + + +def track_parallel_progress( + func, + tasks, + nproc, + initializer=None, + initargs=None, + bar_width=50, + chunksize=1, + skip_first=False, + keep_order=True, +): + """Track the progress of parallel task execution with a progress bar. + + The built-in :mod:`multiprocessing` module is used for process pools and + tasks are done with :func:`Pool.map` or :func:`Pool.imap_unordered`. + + Args: + func (callable): The function to be applied to each task. + tasks (list or tuple[Iterable, int]): A list of tasks or + (tasks, total num). + nproc (int): Process (worker) number. + initializer (None or callable): Refer to :class:`multiprocessing.Pool` + for details. + initargs (None or tuple): Refer to :class:`multiprocessing.Pool` for + details. + chunksize (int): Refer to :class:`multiprocessing.Pool` for details. + bar_width (int): Width of progress bar. + skip_first (bool): Whether to skip the first sample for each worker + when estimating fps, since the initialization step may takes + longer. + keep_order (bool): If True, :func:`Pool.imap` is used, otherwise + :func:`Pool.imap_unordered` is used. 
+ + Returns: + list: The task results. + """ + if isinstance(tasks, tuple): + assert len(tasks) == 2 + assert isinstance(tasks[0], collections_abc.Iterable) + assert isinstance(tasks[1], int) + task_num = tasks[1] + tasks = tasks[0] + elif isinstance(tasks, collections_abc.Iterable): + task_num = len(tasks) + else: + raise TypeError('"tasks" must be an iterable object or a (iterator, int) tuple') + pool = init_pool(nproc, initializer, initargs) + start = not skip_first + task_num -= nproc * chunksize * int(skip_first) + prog_bar = ProgressBar(task_num, bar_width, start) + results = [] + if keep_order: + gen = pool.imap(func, tasks, chunksize) + else: + gen = pool.imap_unordered(func, tasks, chunksize) + for result in gen: + results.append(result) + if skip_first: + if len(results) < nproc * chunksize: + continue + elif len(results) == nproc * chunksize: + prog_bar.start() + continue + prog_bar.update() + sys.stdout.write("\n") + pool.close() + pool.join() + return results + + +def track_iter_progress(tasks, bar_width=50, **kwargs): + """Track the progress of tasks iteration or enumeration with a progress bar. + + Tasks are yielded with a simple for-loop. + + Args: + tasks (list or tuple[Iterable, int]): A list of tasks or + (tasks, total num). + bar_width (int): Width of progress bar. + + Yields: + list: The task results. 
class Timer(object):
    """A flexible Timer class.

    The printed values in the examples are approximate.

    :Example:

    >>> import time
    >>> with Timer():
    ...     # simulate a code block that will run for 1s
    ...     time.sleep(1)
    1.000
    >>> with Timer(print_tmpl='it takes {:.1f} seconds'):
    ...     # simulate a code block that will run for 1s
    ...     time.sleep(1)
    it takes 1.0 seconds
    >>> timer = Timer()
    >>> time.sleep(0.5)
    >>> print(timer.since_start())
    0.500
    >>> time.sleep(0.5)
    >>> print(timer.since_last_check())
    0.500
    >>> print(timer.since_start())
    1.000

    Args:
        start (bool): Start the timer immediately on construction.
        print_tmpl (str or None): Format template used by ``__exit__`` to
            print the elapsed time; defaults to ``"{:.3f}"``.
    """

    def __init__(self, start=True, print_tmpl=None):
        self._is_running = False
        self.print_tmpl = print_tmpl if print_tmpl else "{:.3f}"
        if start:
            self.start()

    @property
    def is_running(self):
        """bool: indicate whether the timer is running"""
        return self._is_running

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, type, value, traceback):
        print(self.print_tmpl.format(self.since_last_check()))
        self._is_running = False

    def start(self):
        """Start the timer.

        If the timer is already running only the "last check" mark is moved.
        """
        # Read the clock once so the start mark and the last-check mark agree.
        now = time()
        if not self._is_running:
            self._t_start = now
            self._is_running = True
        self._t_last = now

    def since_start(self):
        """Total time since the timer is started.

        Returns (float): Time in seconds.

        Raises:
            TimerError: If the timer is not running.
        """
        if not self._is_running:
            raise TimerError("timer is not running")
        self._t_last = time()
        return self._t_last - self._t_start

    def since_last_check(self):
        """Time since the last checking.

        Either :func:`since_start` or :func:`since_last_check` is a checking
        operation.

        Returns (float): Time in seconds.

        Raises:
            TimerError: If the timer is not running.
        """
        if not self._is_running:
            raise TimerError("timer is not running")
        # A single clock read: the time spent between two reads would
        # otherwise be missing from every measured interval.
        now = time()
        dur = now - self._t_last
        self._t_last = now
        return dur
class Gpp:
    """Description of a single ``g++``-style compile or link command.

    Args:
        sources: One source path or a list/tuple of source paths.
        target: Output file path.
        std (str): Value passed to ``-std=``.
        includes (list): Include directories, rendered as ``-I<dir>``.
        defines (dict): Macro name -> value, rendered as ``-D name=value``.
        cflags (str): Compiler flags; defaults to ``"-fPIC -O3"``.
        compiler (str): Default compiler executable.
        link (bool): Emit ``-shared`` (link) instead of ``-c`` (compile).
        libraries (dict): Library directory -> list of library names,
            rendered as ``-L<dir> -l<name> ...``.
        lflags (str): Linker flags.
        extra_cflags (str): Additional compiler flags appended after
            ``cflags``.
        extra_lflags (str): Additional linker flags appended after ``lflags``.
        build_directory (str): When given, relative sources and a relative
            target are resolved against this directory.
    """

    def __init__(
        self,
        sources,
        target,
        std="c++11",
        includes: list = None,
        defines: dict = None,
        cflags: str = None,
        compiler="g++",
        link=False,
        libraries: dict = None,
        lflags: str = None,
        extra_cflags: str = None,
        extra_lflags: str = None,
        build_directory: str = None,
    ):
        if not isinstance(sources, (list, tuple)):
            sources = [sources]
        if build_directory is not None:
            build_directory = Path(build_directory)
            sources = [
                p if Path(p).is_absolute() else str(build_directory / p)
                for p in sources
            ]
            target = Path(target)
            if not target.is_absolute():
                target = build_directory / target
        self.sources = [str(p) for p in sources]
        self.target = str(target)
        self.std = std
        self.includes = includes or []
        self.cflags = cflags or "-fPIC -O3"
        self.defines = defines or {}
        self.compiler = compiler
        self.link = link
        self.libraries = libraries or {}
        self.lflags = lflags or ""
        self.extra_cflags = extra_cflags or ""
        self.extra_lflags = extra_lflags or ""

    def shell(self, target: str = None, compiler: str = None):
        """Render the command line as a single shell string.

        Args:
            target (str): Overrides the stored output path when given.
            compiler (str): Overrides the stored compiler when given.

        Returns:
            str: The command, with runs of spaces collapsed to one.
        """
        defines = [f"-D {n}={v}" for n, v in self.defines.items()]
        includes = [f"-I{inc}" for inc in self.includes]
        libraries = [
            f"-L{n} {' '.join(['-l' + lib for lib in v])}"
            for n, v in self.libraries.items()
        ]
        compiler = compiler or self.compiler
        string = f"{compiler} -std={self.std} "
        if self.link:
            string += " -shared "
        else:
            string += " -c "
        target = target or self.target
        string += (
            f"-o {target} {' '.join(self.sources)} "
            f"{' '.join(defines)} "
            f"{' '.join(includes)} "
            # The trailing space keeps extra_cflags from fusing with the
            # first -L flag.
            f"{self.cflags} {self.extra_cflags} "
            f"{' '.join(libraries)} "
            f"{self.lflags} {self.extra_lflags}"
        )
        return re.sub(r" +", r" ", string)
class Nvcc(Gpp):
    """``nvcc`` compile command for CUDA sources.

    Args:
        sources: One source path or a list/tuple of source paths.
        target: Output object file path.
        arch (str): Value passed to ``-arch=``; detected with
            ``find_cuda_device_arch`` when omitted.
        std (str): Value passed to ``-std=``.
        includes (list): Include directories. The CUDA ``include`` directory
            is appended when a CUDA installation is found.
        defines (dict): Macro name -> value.
        cflags (str): Compiler flags; a CUDA-specific default is used when
            omitted.
        extra_cflags (str): Additional compiler flags.
        extra_lflags (str): Additional linker flags.
        build_directory (str): Base directory for relative paths.

    Raises:
        ValueError: If no arch is given and none can be detected.
    """

    def __init__(
        self,
        sources,
        target,
        arch=None,
        std="c++11",
        includes: list = None,
        defines: dict = None,
        cflags: str = None,
        extra_cflags: str = None,
        extra_lflags: str = None,
        build_directory: str = None,
    ):
        if arch is None:
            arch = find_cuda_device_arch()
        if arch is None:
            raise ValueError("you must specify arch if use cuda.")

        cflags = (
            cflags or f"-x cu -Xcompiler -fPIC -arch={arch} --expt-relaxed-constexpr"
        )
        try:
            cuda_home = find_cuda()
        except Exception:
            # Best effort: without a CUDA home we rely on nvcc's own defaults.
            cuda_home = None
        if cuda_home is not None:
            cuda_include = Path(cuda_home) / "include"
            # Build a new list so the caller's ``includes`` is not mutated.
            includes = list(includes or []) + [str(cuda_include)]
        super().__init__(
            sources,
            target,
            std,
            includes,
            defines,
            cflags,
            compiler="nvcc",
            extra_cflags=extra_cflags,
            extra_lflags=extra_lflags,
            build_directory=build_directory,
        )
class Node:
    """A node of a small dependency graph that is evaluated depth-first.

    ``prev`` holds the nodes this node depends on and ``next`` the nodes that
    depend on it. ``state`` records whether the node is still pending, has
    been evaluated, or failed.
    """

    def __init__(self, name=None):
        self.name = name
        self.prev = []
        self.next = []
        self.state = NodeState.Normal

    def __call__(self, *nodes):
        """Register ``nodes`` as dependencies of this node and return self."""
        for node in nodes:
            self.prev.append(node)
            node.next.append(self)
        return self

    def _eval(self, *args, **kw):
        """Do the work of this node; subclasses override. Truthy on success."""
        return True

    def eval(self, *args, **kw):
        """Evaluate all dependencies, then this node.

        Returns:
            bool: True if this node is in the evaluated state afterwards,
            False if it or one of its dependencies failed.
        """
        for p in self.prev:
            if not p.eval(*args, **kw):
                self.state = NodeState.Error
                return False
        if self.state == NodeState.Normal:
            if self._eval(*args, **kw):
                self.state = NodeState.Evaled
            else:
                self.state = NodeState.Error
        # Report failure to dependents instead of always claiming success.
        return self.state == NodeState.Evaled

    def reset(self):
        """Reset this node and, recursively, every node it depends on."""
        # Keep a handle on the dependencies before the links are cleared;
        # otherwise the recursion below would iterate over an empty list.
        upstream = self.prev
        self.state = NodeState.Normal
        self.prev = []
        self.next = []
        for node in upstream:
            node.reset()
class Pybind11Link(Gpp):
    """Link command for a pybind11 extension module (CPU only).

    The pybind11 include flags are obtained by running
    ``python3 -m pybind11 --includes`` and appended to ``cflags``. All other
    arguments are forwarded to ``Gpp`` with ``link=True``.

    Raises:
        subprocess.CalledProcessError: If the pybind11 query command fails.
    """

    def __init__(
        self,
        sources,
        target,
        std="c++11",
        includes: list = None,
        defines: dict = None,
        cflags: str = None,
        libraries: dict = None,
        lflags: str = None,
        extra_cflags: str = None,
        extra_lflags: str = None,
        build_directory: str = None,
    ):
        pb11_includes = (
            subprocess.check_output("python3 -m pybind11 --includes", shell=True)
            .decode("utf8")
            .strip("\n")
        )
        # Always separate the user flags from the include flags with exactly
        # one space; caller-supplied cflags need not end with whitespace.
        cflags = (cflags or "-fPIC -O3").rstrip() + " " + pb11_includes
        super().__init__(
            sources,
            target,
            std,
            includes,
            defines,
            cflags,
            link=True,
            libraries=libraries,
            lflags=lflags,
            extra_cflags=extra_cflags,
            extra_lflags=extra_lflags,
            build_directory=build_directory,
        )
def load_pb11(
    sources,
    target,
    cwd=".",
    cuda=False,
    arch=None,
    num_workers=4,
    includes: list = None,
    build_directory=None,
    compiler="g++",
):
    """Compile a pybind11 extension from ``sources`` and import it.

    Sources whose path contains ``.cu`` are compiled to object files with
    ``Nvcc`` first; the objects and the remaining sources are then linked
    into ``target`` with ``Pybind11CUDALink`` (``cuda`` truthy) or
    ``Pybind11Link``.

    Args:
        sources: Iterable of source file paths.
        target: Path of the shared library to produce and import.
        cwd: Working directory in which the compile commands are run.
        cuda (bool): Whether the build uses CUDA.
        arch (str): GPU architecture for nvcc; detected with
            ``find_cuda_device_arch`` when ``cuda`` is True and it is omitted.
        num_workers (int): Number of parallel compile workers.
        includes (list): Extra include directories for the link step.
        build_directory: Currently unused.
        compiler (str): Host compiler executable.

    Returns:
        The module returned by ``import_file`` for ``target``.

    Raises:
        ValueError: If ``cuda`` is True and no arch is given or detectable.
    """
    # Detecting the arch compiles and runs a helper program, so only do it
    # when CUDA is requested, and fail before any command is built.
    if cuda is True and arch is None:
        arch = find_cuda_device_arch()
        if arch is None:
            raise ValueError("you must specify arch if sources contains cuda files")

    nvcc_cmds = []
    objects = []
    main_sources = []
    for s in sources:
        s = str(s)
        # ".cu" as a substring also covers the ".cu.cc" naming convention.
        if ".cu" in s:
            assert cuda is True, "cuda must be true if contain cuda file"
            nvcc_cmds.append(Nvcc(s, out(s), arch))
            objects.append(out(s))
        else:
            main_sources.append(s)

    link_cls = Pybind11CUDALink if cuda else Pybind11Link
    link_cmds = [link_cls(objects + main_sources, target, includes=includes)]
    # Object files must exist before the link step runs.
    for group in (nvcc_cmds, link_cmds):
        compile_libraries(group, cwd, num_workers=num_workers, compiler=compiler)

    return import_file(target, add_to_sys=False, disable_warning=True)
is_array_like(expected_shape): + x = np.array(x) + if len(x.shape) == len(expected_shape): + for s, s_ex in zip(x.shape, expected_shape): + if s_ex is not None and s != s_ex: + mergeable = False + break + return mergeable diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/utils/checkpoint.py b/cv/3d_detection/centerpoint/pytorch/det3d/utils/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..3cf85013e54430f6f716fd0aa585bf49db86f89f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/utils/checkpoint.py @@ -0,0 +1,325 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import json +import logging +import os +from collections import OrderedDict +from pathlib import Path + +import torch +from tensorboardX import SummaryWriter + + +def _flat_nested_json_dict(json_dict, flatted, sep=".", start=""): + for k, v in json_dict.items(): + if isinstance(v, dict): + _flat_nested_json_dict(v, flatted, sep, start + sep + str(k)) + else: + flatted[start + sep + str(k)] = v + + +def flat_nested_json_dict(json_dict, sep=".") -> dict: + """flat a nested json-like dict. this function make shadow copy. 
def metric_to_str(metrics, sep="."):
    """Render a (possibly nested) metrics dict as ``"key=value, ..."``.

    Nested keys are joined with ``sep``. Floats, and sequences whose first
    element is a float, are printed with 4 significant digits; everything
    else uses its default string form.
    """

    def _render(value):
        if isinstance(value, float):
            return f"{value:.4}"
        if isinstance(value, (list, tuple)) and value and isinstance(value[0], float):
            body = ", ".join([f"{item:.4}" for item in value])
            return f"[{body}]"
        return f"{value}"

    flat = flat_nested_json_dict(metrics, sep)
    return ", ".join([f"{name}={_render(val)}" for name, val in flat.items()])
def strip_prefix_if_present(state_dict, prefix):
    """Remove a leading ``prefix`` from every key of ``state_dict``.

    Args:
        state_dict: Mapping from parameter name to value.
        prefix (str): Prefix to strip, e.g. ``"module."``.

    Returns:
        The original ``state_dict`` object, unchanged, if at least one key
        does not start with ``prefix``; otherwise a new ``OrderedDict`` with
        the prefix removed from the start of each key.
    """
    if not all(key.startswith(prefix) for key in state_dict):
        return state_dict
    cut = len(prefix)
    stripped_state_dict = OrderedDict()
    for key, value in state_dict.items():
        # Slice instead of str.replace so later occurrences of the prefix
        # inside the key (e.g. "module.a.module.w") are left intact.
        stripped_state_dict[key[cut:]] = value
    return stripped_state_dict
def finetune_load_state_dict(model, loaded_state_dict, logger=None):
    """Load pretrained weights into ``model``, skipping ``rpn.tasks*`` entries.

    A ``"module."`` prefix (left by DataParallel / DistributedDataParallel
    wrappers at save time) is stripped first, then every key starting with
    ``"rpn.tasks"`` is dropped. The rest is matched against the model's own
    state dict and loaded strictly.
    """
    target_state = model.state_dict()
    unwrapped = strip_prefix_if_present(loaded_state_dict, prefix="module.")

    transferable = {}
    for name, tensor in unwrapped.items():
        if name.startswith("rpn.tasks"):
            continue
        transferable[name] = tensor

    align_and_update_state_dicts(target_state, transferable, logger=logger)

    # strict loading: target_state still has every key of the model
    model.load_state_dict(target_state)
def get_checkpoint_file(self, save_dir):
    """Return the path recorded in ``<save_dir>/last_checkpoint``.

    The file content is returned with surrounding whitespace stripped. If the
    file cannot be opened or read (for instance because another process just
    removed it), an empty string is returned.
    """
    tag_path = os.path.join(save_dir, "last_checkpoint")
    try:
        with open(tag_path, "r") as handle:
            return handle.read().strip()
    except IOError:
        return ""
class det3dCheckpointer(Checkpointer):
    """``Checkpointer`` that also accepts checkpoint files holding bare weights.

    Args:
        model: Model whose state is saved and restored.
        optimizer: Optional optimizer to checkpoint.
        scheduler: Optional scheduler to checkpoint.
        save_dir (str): Directory where checkpoints are written.
        save_to_disk: Whether ``save`` writes to disk.
        logger: Logger to use; the base class picks a default when None.
    """

    def __init__(
        self,
        model,
        optimizer=None,
        scheduler=None,
        save_dir="",
        save_to_disk=None,
        logger=None,
    ):
        # Pass by keyword: the base signature has ``ckpt_path`` between
        # ``save_dir`` and ``save_to_disk``, so positional forwarding would
        # shift ``save_to_disk`` and ``logger`` into the wrong parameters.
        super(det3dCheckpointer, self).__init__(
            model,
            optimizer=optimizer,
            scheduler=scheduler,
            save_dir=save_dir,
            save_to_disk=save_to_disk,
            logger=logger,
        )

    def _load_file(self, f):
        """Load ``f`` and wrap it as ``{"model": ...}`` if it has no such key."""
        loaded = super(det3dCheckpointer, self)._load_file(f)
        if "model" not in loaded:
            loaded = dict(model=loaded)
        return loaded
tuple)): + if any([isinstance(e, str) for e in v]): + continue + v_dict = {str(i): e for i, e in enumerate(v)} + for k1, v1 in v_dict.items(): + self.summary_writter.add_scalar(k + "/" + k1, v1, step) + else: + if isinstance(v, str): + continue + self.summary_writter.add_scalar(k, v, step) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/utils/config_tool.py b/cv/3d_detection/centerpoint/pytorch/det3d/utils/config_tool.py new file mode 100644 index 0000000000000000000000000000000000000000..96c6d1f9717f9818b346191b8235ffec0d2d9aa2 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/utils/config_tool.py @@ -0,0 +1,53 @@ +# This file contains some config modification function. +# some functions should be only used for KITTI dataset. + +from pathlib import Path + +import numpy as np +from google.protobuf import text_format + + +def change_detection_range(model_config, new_range): + assert len(new_range) == 4, "you must provide a list such as [-50, -50, 50, 50]" + old_pc_range = list(model_config.voxel_generator.point_cloud_range) + old_pc_range[:2] = new_range[:2] + old_pc_range[3:5] = new_range[2:] + model_config.voxel_generator.point_cloud_range[:] = old_pc_range + for anchor_generator in model_config.target_assigner.anchor_generators: + a_type = anchor_generator.WhichOneof("anchor_generator") + if a_type == "anchor_generator_range": + a_cfg = anchor_generator.anchor_generator_range + old_a_range = list(a_cfg.anchor_ranges) + old_a_range[:2] = new_range[:2] + old_a_range[3:5] = new_range[2:] + a_cfg.anchor_ranges[:] = old_a_range + elif a_type == "anchor_generator_stride": + a_cfg = anchor_generator.anchor_generator_stride + old_offset = list(a_cfg.offsets) + stride = list(a_cfg.strides) + old_offset[0] = new_range[0] + stride[0] / 2 + old_offset[1] = new_range[1] + stride[1] / 2 + a_cfg.offsets[:] = old_offset + else: + raise ValueError("unknown") + old_post_range = list(model_config.post_center_limit_range) + old_post_range[:2] = new_range[:2] + 
old_post_range[3:5] = new_range[2:] + model_config.post_center_limit_range[:] = old_post_range + + +def get_downsample_factor(model_config): + try: + neck_cfg = model_config["neck"] + except: + model_config = model_config['first_stage_cfg'] + neck_cfg = model_config['neck'] + downsample_factor = np.prod(neck_cfg.get("ds_layer_strides", [1])) + if len(neck_cfg.get("us_layer_strides", [])) > 0: + downsample_factor /= neck_cfg.get("us_layer_strides", [])[-1] + + backbone_cfg = model_config['backbone'] + downsample_factor *= backbone_cfg["ds_factor"] + downsample_factor = int(downsample_factor) + assert downsample_factor > 0 + return downsample_factor diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/utils/find.py b/cv/3d_detection/centerpoint/pytorch/det3d/utils/find.py new file mode 100644 index 0000000000000000000000000000000000000000..3397cff4abe3535b820004425cf4b885b37953b4 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/utils/find.py @@ -0,0 +1,214 @@ +import glob +import json +import os +import subprocess +import sys +import tempfile +from pathlib import Path + +import fire + + +def _get_info_from_anaconda_info(info, split=":"): + info = info.strip("\n").replace(" ", "") + info_dict = {} + latest_key = "" + for line in info.splitlines(): + if split in line: + pair = line.split(split) + info_dict[pair[0]] = pair[1] + latest_key = pair[0] + else: + if not isinstance(info_dict[latest_key], list): + info_dict[latest_key] = [info_dict[latest_key]] + info_dict[latest_key].append(line) + return info_dict + + +def find_anaconda(): + # try find in default path + path = Path.home() / "anaconda3" + if path.exists(): + return path + # try conda in cmd + try: + info = subprocess.check_output("conda info", shell=True).decode("utf-8") + info_dict = _get_info_from_anaconda_info(info) + return info_dict["activeenvlocation"] + except subprocess.CalledProcessError: + raise RuntimeError("find anadonda failed") + + +def find_cuda(): + """Finds the CUDA install 
def find_cuda_device_arch():
    """Detect an ``nvcc`` ``-arch`` value (``"sm_XY"``) for the local GPU.

    The compute capability is read either from CUDA's ``deviceQuery`` demo
    binary or, when that binary is missing, from a small program compiled
    with ``g++`` against the CUDA runtime. The capability is then lowered one
    step at a time until the installed ``nvcc`` accepts it.

    Returns:
        str or None: ``"sm_<n>"`` on success. None on Windows, when no
        accepted architecture is found, or when any step inside the
        detection raises.

    Raises:
        RuntimeError: Propagated from ``find_cuda()``, which is called
            outside the ``try`` block.
    """
    if sys.platform == "win32":
        # TODO: add windows support
        return None
    cuda_home = find_cuda()
    if cuda_home is None:
        return None
    cuda_home = Path(cuda_home)
    try:
        device_query_path = cuda_home / "extras/demo_suite/deviceQuery"
        if not device_query_path.exists():
            # Fallback: build a tiny program that prints "<major>.<minor>"
            # for every visible device.
            # NOTE(review): the two #include directives below carry no header
            # names as seen here; presumably the CUDA runtime header and
            # <iostream> were intended -- confirm against the upstream file.
            # If compilation fails, the inner ``except`` returns None.
            source = """
            #include
            #include
            int main(){
                int nDevices;
                cudaGetDeviceCount(&nDevices);
                for (int i = 0; i < nDevices; i++) {
                    cudaDeviceProp prop;
                    cudaGetDeviceProperties(&prop, i);
                    std::cout << prop.major << "." << prop.minor << std::endl;
                }
                return 0;
            }
            """
            with tempfile.NamedTemporaryFile("w", suffix=".cc") as f:
                f_path = Path(f.name)
                f.write(source)
                # Make sure the source is on disk before g++ reads it.
                f.flush()
                try:
                    # TODO: add windows support
                    # The binary is written next to the temp source file
                    # (cwd is f_path.parent); nothing here removes it.
                    cmd = (
                        f"g++ {f.name} -o {f_path.stem}"
                        f" -I{cuda_home / 'include'} -L{cuda_home / 'lib64'} -lcudart"
                    )
                    print(cmd)
                    subprocess.check_output(cmd, shell=True, cwd=f_path.parent)
                    cmd = f"./{f_path.stem}"
                    arches = (
                        subprocess.check_output(cmd, shell=True, cwd=f_path.parent)
                        .decode()
                        .rstrip("\r\n")
                        .split("\n")
                    )
                    if len(arches) < 1:
                        return None
                    # Only the first reported device is considered.
                    arch = arches[0]
                except Exception:
                    return None
        else:
            # Take the last space-separated token of the grep output.
            # NOTE(review): assumes that token is the "<major>.<minor>"
            # capability -- confirm for multi-GPU output.
            cmd = f"{str(device_query_path)} | grep 'CUDA Capability'"
            arch = (
                subprocess.check_output(cmd, shell=True)
                .decode()
                .rstrip("\r\n")
                .split(" ")[-1]
            )
        # assert len(arch) == 2
        arch_list = [int(s) for s in arch.split(".")]
        # e.g. "7.5" -> 75
        arch_int = arch_list[0] * 10 + arch_list[1]
        find_work_arch = False
        # Probe nvcc with no input files: a "No input files specified" error
        # means the -arch value itself was accepted; an "is not defined for
        # option 'gpu-architecture'" error means it was rejected, so try the
        # next lower number.
        # NOTE(review): if check_output ever returned without raising,
        # nothing in the loop body would change arch_int or break.
        while arch_int > 10:
            try:
                res = subprocess.check_output(
                    "nvcc -arch=sm_{}".format(arch_int),
                    shell=True,
                    stderr=subprocess.STDOUT,
                )
            except subprocess.CalledProcessError as e:
                if "No input files specified" in e.output.decode():
                    find_work_arch = True
                    break
                elif (
                    "is not defined for option 'gpu-architecture'" in e.output.decode()
                ):
                    arch_int -= 1
                else:
                    raise RuntimeError("unknown error")
        if find_work_arch:
            arch = f"sm_{arch_int}"
        else:
            arch = None

    except Exception:
        # Any failure above (missing tools, parse errors, the RuntimeError
        # raised in the loop) is reported as "no arch found".
        arch = None
    return arch
+ std::cout << "["; + for (int i = 0; i < nDevices; i++) { + cudaSetDevice(i); + cudaMemGetInfo(&free_m, &total_m); + std::cout << "[" << free_m << "," << total_m << "]"; + if (i != nDevices - 1) + std::cout << "," << std::endl; + } + std::cout << "]" << std::endl; + return 0; + } + """ + with tempfile.NamedTemporaryFile("w", suffix=".cc") as f: + f_path = Path(f.name) + f.write(source) + f.flush() + try: + # TODO: add windows support + cmd = ( + f"g++ {f.name} -o {f_path.stem} -std=c++11" + f" -I{cuda_home / 'include'} -L{cuda_home / 'lib64'} -lcudart" + ) + print(cmd) + subprocess.check_output(cmd, shell=True, cwd=f_path.parent) + cmd = f"./{f_path.stem}" + usages = subprocess.check_output( + cmd, shell=True, cwd=f_path.parent + ).decode() + usages = json.loads(usages) + return usages + except Exception: + return None + return None + + +if __name__ == "__main__": + print(find_cuda_device_arch()) + # fire.Fire() diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/utils/flops_counter.py b/cv/3d_detection/centerpoint/pytorch/det3d/utils/flops_counter.py new file mode 100644 index 0000000000000000000000000000000000000000..4fc092710a9c2724f8f8106a391ac16e8a7e9378 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/utils/flops_counter.py @@ -0,0 +1,446 @@ +# Modified from flops-counter.pytorch by Vladislav Sovrasov +# original repo: https://github.com/sovrasov/flops-counter.pytorch + +# MIT License + +# Copyright (c) 2018 Vladislav Sovrasov + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be 
included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import sys + +import numpy as np +import torch +import torch.nn as nn +from torch.nn.modules.batchnorm import _BatchNorm +from torch.nn.modules.conv import _ConvNd, _ConvTransposeMixin +from torch.nn.modules.pooling import ( + _AdaptiveAvgPoolNd, + _AdaptiveMaxPoolNd, + _AvgPoolNd, + _MaxPoolNd, +) + +CONV_TYPES = (_ConvNd,) +DECONV_TYPES = (_ConvTransposeMixin,) +LINEAR_TYPES = (nn.Linear,) +POOLING_TYPES = (_AvgPoolNd, _MaxPoolNd, _AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd) +RELU_TYPES = (nn.ReLU, nn.PReLU, nn.ELU, nn.LeakyReLU, nn.ReLU6) +BN_TYPES = (_BatchNorm,) +UPSAMPLE_TYPES = (nn.Upsample,) + +SUPPORTED_TYPES = ( + CONV_TYPES + + DECONV_TYPES + + LINEAR_TYPES + + POOLING_TYPES + + RELU_TYPES + + BN_TYPES + + UPSAMPLE_TYPES +) + + +def get_model_complexity_info( + model, + input_res, + print_per_layer_stat=True, + as_strings=True, + input_constructor=None, + ost=sys.stdout, +): + assert type(input_res) is tuple + assert len(input_res) >= 2 + flops_model = add_flops_counting_methods(model) + flops_model.eval().start_flops_count() + if input_constructor: + input = input_constructor(input_res) + _ = flops_model(**input) + else: + batch = torch.ones(()).new_empty( + (1, *input_res), + dtype=next(flops_model.parameters()).dtype, + device=next(flops_model.parameters()).device, + ) + flops_model(batch) + + if print_per_layer_stat: + print_model_with_flops(flops_model, ost=ost) + flops_count = 
def params_to_string(params_num):
    """Format a parameter count as a short human-readable string.

    Values of at least one million are scaled and suffixed with `` M``,
    values of at least one thousand with `` k`` (both rounded to two
    decimals); anything smaller is returned through ``str()`` unchanged.

    :param float params_num: number
    :returns str: number
    >>> params_to_string(1e9)
    '1000.0 M'
    >>> params_to_string(2e5)
    '200.0 k'
    >>> params_to_string(3e-9)
    '3e-09'
    """
    if params_num // 10 ** 6 > 0:
        return str(round(params_num / 10 ** 6, 2)) + " M"
    # Compare against 0 explicitly, like the branch above: floor division of a
    # small negative number yields -1, which is truthy and used to be
    # formatted as "-0.0 k".
    if params_num // 10 ** 3 > 0:
        return str(round(params_num / 10 ** 3, 2)) + " k"
    return str(params_num)
self.original_extra_repr(), + ] + ) + + def add_extra_repr(m): + m.accumulate_flops = accumulate_flops.__get__(m) + flops_extra_repr = flops_repr.__get__(m) + if m.extra_repr != flops_extra_repr: + m.original_extra_repr = m.extra_repr + m.extra_repr = flops_extra_repr + assert m.extra_repr != m.original_extra_repr + + def del_extra_repr(m): + if hasattr(m, "original_extra_repr"): + m.extra_repr = m.original_extra_repr + del m.original_extra_repr + if hasattr(m, "accumulate_flops"): + del m.accumulate_flops + + model.apply(add_extra_repr) + print(model, file=ost) + model.apply(del_extra_repr) + + +def get_model_parameters_number(model): + params_num = sum(p.numel() for p in model.parameters() if p.requires_grad) + return params_num + + +def add_flops_counting_methods(net_main_module): + # adding additional methods to the existing module object, + # this is done this way so that each function has access to self object + net_main_module.start_flops_count = start_flops_count.__get__(net_main_module) + net_main_module.stop_flops_count = stop_flops_count.__get__(net_main_module) + net_main_module.reset_flops_count = reset_flops_count.__get__(net_main_module) + net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( + net_main_module + ) + + net_main_module.reset_flops_count() + + # Adding variables necessary for masked flops computation + net_main_module.apply(add_flops_mask_variable_or_reset) + + return net_main_module + + +def compute_average_flops_cost(self): + """ + A method that will be available after add_flops_counting_methods() is + called on a desired net object. + Returns current mean flops consumption per image. 
+ """ + + batches_count = self.__batch_counter__ + flops_sum = 0 + for module in self.modules(): + if is_supported_instance(module): + flops_sum += module.__flops__ + + return flops_sum / batches_count + + +def start_flops_count(self): + """ + A method that will be available after add_flops_counting_methods() is + called on a desired net object. + Activates the computation of mean flops consumption per image. + Call it before you run the network. + """ + add_batch_counter_hook_function(self) + self.apply(add_flops_counter_hook_function) + + +def stop_flops_count(self): + """ + A method that will be available after add_flops_counting_methods() is + called on a desired net object. + Stops computing the mean flops consumption per image. + Call whenever you want to pause the computation. + """ + remove_batch_counter_hook_function(self) + self.apply(remove_flops_counter_hook_function) + + +def reset_flops_count(self): + """ + A method that will be available after add_flops_counting_methods() is + called on a desired net object. + Resets statistics computed so far. 
+ """ + add_batch_counter_variables_or_reset(self) + self.apply(add_flops_counter_variable_or_reset) + + +def add_flops_mask(module, mask): + def add_flops_mask_func(module): + if isinstance(module, torch.nn.Conv2d): + module.__mask__ = mask + + module.apply(add_flops_mask_func) + + +def remove_flops_mask(module): + module.apply(add_flops_mask_variable_or_reset) + + +def is_supported_instance(module): + if isinstance(module, SUPPORTED_TYPES): + return True + else: + return False + + +def empty_flops_counter_hook(module, input, output): + module.__flops__ += 0 + + +def upsample_flops_counter_hook(module, input, output): + output_size = output[0] + batch_size = output_size.shape[0] + output_elements_count = batch_size + for val in output_size.shape[1:]: + output_elements_count *= val + module.__flops__ += int(output_elements_count) + + +def relu_flops_counter_hook(module, input, output): + active_elements_count = output.numel() + module.__flops__ += int(active_elements_count) + + +def linear_flops_counter_hook(module, input, output): + input = input[0] + batch_size = input.shape[0] + module.__flops__ += int(batch_size * input.shape[1] * output.shape[1]) + + +def pool_flops_counter_hook(module, input, output): + input = input[0] + module.__flops__ += int(np.prod(input.shape)) + + +def bn_flops_counter_hook(module, input, output): + module.affine + input = input[0] + + batch_flops = np.prod(input.shape) + if module.affine: + batch_flops *= 2 + module.__flops__ += int(batch_flops) + + +def deconv_flops_counter_hook(conv_module, input, output): + # Can have multiple inputs, getting the first one + input = input[0] + + batch_size = input.shape[0] + input_height, input_width = input.shape[2:] + + kernel_height, kernel_width = conv_module.kernel_size + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = ( + kernel_height * kernel_width * 
in_channels * filters_per_channel + ) + + active_elements_count = batch_size * input_height * input_width + overall_conv_flops = conv_per_position_flops * active_elements_count + bias_flops = 0 + if conv_module.bias is not None: + output_height, output_width = output.shape[2:] + bias_flops = out_channels * batch_size * output_height * output_height + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def conv_flops_counter_hook(conv_module, input, output): + # Can have multiple inputs, getting the first one + input = input[0] + + batch_size = input.shape[0] + output_dims = list(output.shape[2:]) + + kernel_dims = list(conv_module.kernel_size) + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = np.prod(kernel_dims) * in_channels * filters_per_channel + + active_elements_count = batch_size * np.prod(output_dims) + + if conv_module.__mask__ is not None: + # (b, 1, h, w) + output_height, output_width = output.shape[2:] + flops_mask = conv_module.__mask__.expand( + batch_size, 1, output_height, output_width + ) + active_elements_count = flops_mask.sum() + + overall_conv_flops = conv_per_position_flops * active_elements_count + + bias_flops = 0 + + if conv_module.bias is not None: + + bias_flops = out_channels * active_elements_count + + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def batch_counter_hook(module, input, output): + batch_size = 1 + if len(input) > 0: + # Can have multiple inputs, getting the first one + input = input[0] + batch_size = len(input) + else: + print( + "Warning! No positional inputs found for a module, " + "assuming batch size is 1." 
+ ) + module.__batch_counter__ += batch_size + + +def add_batch_counter_variables_or_reset(module): + + module.__batch_counter__ = 0 + + +def add_batch_counter_hook_function(module): + if hasattr(module, "__batch_counter_handle__"): + return + + handle = module.register_forward_hook(batch_counter_hook) + module.__batch_counter_handle__ = handle + + +def remove_batch_counter_hook_function(module): + if hasattr(module, "__batch_counter_handle__"): + module.__batch_counter_handle__.remove() + del module.__batch_counter_handle__ + + +def add_flops_counter_variable_or_reset(module): + if is_supported_instance(module): + module.__flops__ = 0 + + +def add_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, "__flops_handle__"): + return + + if isinstance(module, CONV_TYPES): + handle = module.register_forward_hook(conv_flops_counter_hook) + elif isinstance(module, RELU_TYPES): + handle = module.register_forward_hook(relu_flops_counter_hook) + elif isinstance(module, LINEAR_TYPES): + handle = module.register_forward_hook(linear_flops_counter_hook) + elif isinstance(module, POOLING_TYPES): + handle = module.register_forward_hook(pool_flops_counter_hook) + elif isinstance(module, BN_TYPES): + handle = module.register_forward_hook(bn_flops_counter_hook) + elif isinstance(module, UPSAMPLE_TYPES): + handle = module.register_forward_hook(upsample_flops_counter_hook) + elif isinstance(module, DECONV_TYPES): + handle = module.register_forward_hook(deconv_flops_counter_hook) + else: + handle = module.register_forward_hook(empty_flops_counter_hook) + module.__flops_handle__ = handle + + +def remove_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, "__flops_handle__"): + module.__flops_handle__.remove() + del module.__flops_handle__ + + +# --- Masked flops counting +# Also being run in the initialization +def add_flops_mask_variable_or_reset(module): + if is_supported_instance(module): + 
module.__mask__ = None diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/utils/imports.py b/cv/3d_detection/centerpoint/pytorch/det3d/utils/imports.py new file mode 100644 index 0000000000000000000000000000000000000000..50dc6373939cfee24cc9ca3c0fbb37816e60ce73 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/utils/imports.py @@ -0,0 +1,24 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +if torch._six.PY3: + import importlib + import importlib.util + import sys + + # from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa + def import_file(module_name, file_path, make_importable=False): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + if make_importable: + sys.modules[module_name] = module + return module + + +else: + import imp + + def import_file(module_name, file_path, make_importable=None): + module = imp.load_source(module_name, file_path) + return module diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/utils/loader.py b/cv/3d_detection/centerpoint/pytorch/det3d/utils/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..4a360bf53dd0a25854c902b55be137d80fb0e885 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/utils/loader.py @@ -0,0 +1,79 @@ +import importlib +import logging +import os +import sys +from pathlib import Path + +logger = logging.getLogger("det3d.utils.loader") + +CUSTOM_LOADED_MODULES = {} + + +def _get_possible_module_path(paths): + ret = [] + for p in paths: + p = Path(p) + for path in p.glob("*"): + if path.suffix in ["py", ".so"] or (path.is_dir()): + if path.stem.isidentifier(): + ret.append(path) + return ret + + +def _get_regular_import_name(path, module_paths): + path = Path(path) + for mp in module_paths: + mp = Path(mp) + 
def import_file(path, name: str = None, add_to_sys=True, disable_warning=False):
    """Import the Python file at *path* and return the resulting module.

    If the file can be expressed as a dotted name relative to an entry found
    under ``PYTHONPATH`` it is imported regularly through ``import_name``.
    Otherwise it is loaded directly from its location, a warning is logged
    (unless *disable_warning*), and the module is optionally registered in
    ``sys.modules``.

    Args:
        path: location of the file to import.
        name: module name to use for a direct file import; defaults to the
            file's stem.
        add_to_sys: when true, a directly imported module is recorded in
            ``CUSTOM_LOADED_MODULES`` and ``sys.modules``.
        disable_warning: suppress the "regular import failed" warning.

    Returns:
        The imported module object.

    Raises:
        ValueError: if *add_to_sys* is set and the module name is already in
            ``sys.modules`` without having been loaded by this function.
    """
    # ``import importlib`` alone does not guarantee that the ``util``
    # submodule is loaded, so import it explicitly before using it.
    import importlib.util

    path = Path(path)
    module_name = path.stem
    python_path = os.environ.get("PYTHONPATH")
    user_paths = [] if python_path is None else python_path.split(os.pathsep)
    possible_paths = _get_possible_module_path(user_paths)
    model_import_name = _get_regular_import_name(path, possible_paths)
    if model_import_name is not None:
        return import_name(model_import_name)
    if name is not None:
        module_name = name

    # avoid replace system modules. Checked *before* executing the file so a
    # name clash is reported without running the file's top-level code.
    if (
        add_to_sys
        and module_name in sys.modules
        and module_name not in CUSTOM_LOADED_MODULES
    ):
        raise ValueError(f"{module_name} exists in system.")

    spec = importlib.util.spec_from_file_location(module_name, path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    if not disable_warning:
        logger.warning(
            (
                f"Failed to perform regular import for file {path}. "
                "this means this file isn't in any folder in PYTHONPATH "
                "or don't have __init__.py in that project. "
                "directly file import may fail and some reflecting features are "
                "disabled even if import succeed. please add your project to PYTHONPATH "
                "or add __init__.py to ensure this file can be regularly imported. "
            )
        )

    if add_to_sys:  # this will enable find objects defined in a file.
        CUSTOM_LOADED_MODULES[module_name] = module
        sys.modules[module_name] = module
    return module
+ """ + flatted = {} + for k, v in json_dict.items(): + if isinstance(v, dict): + _flat_nested_json_dict(v, flatted, sep, str(k)) + else: + flatted[str(k)] = v + return flatted + + +def metric_to_str(metrics, sep="."): + flatted_metrics = flat_nested_json_dict(metrics, sep) + metrics_str_list = [] + for k, v in flatted_metrics.items(): + if isinstance(v, float): + metrics_str_list.append(f"{k}={v:.4}") + elif isinstance(v, (list, tuple)): + if v and isinstance(v[0], float): + v_str = ", ".join([f"{e:.4}" for e in v]) + metrics_str_list.append(f"{k}=[{v_str}]") + else: + metrics_str_list.append(f"{k}={v}") + else: + metrics_str_list.append(f"{k}={v}") + return ", ".join(metrics_str_list) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/utils/registry.py b/cv/3d_detection/centerpoint/pytorch/det3d/utils/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..e51fd3a3e01e7f33e98773969ba13fef1a4ad961 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/utils/registry.py @@ -0,0 +1,78 @@ +import inspect + +from det3d import torchie + + +class Registry(object): + def __init__(self, name): + self._name = name + self._module_dict = dict() + + def __repr__(self): + format_str = self.__class__.__name__ + "(name={}, items={})".format( + self._name, list(self._module_dict.keys()) + ) + return format_str + + @property + def name(self): + return self._name + + @property + def module_dict(self): + return self._module_dict + + def get(self, key): + return self._module_dict.get(key, None) + + def _register_module(self, module_class): + """Register a module. + Args: + module (:obj:`nn.Module`): Module to be registered. 
+ """ + if not inspect.isclass(module_class): + raise TypeError( + "module must be a class, but got {}".format(type(module_class)) + ) + module_name = module_class.__name__ + if module_name in self._module_dict: + raise KeyError( + "{} is already registered in {}".format(module_name, self.name) + ) + self._module_dict[module_name] = module_class + + def register_module(self, cls): + self._register_module(cls) + return cls + + +def build_from_cfg(cfg, registry, default_args=None): + """Build a module from config dict. + Args: + cfg (dict): Config dict. It should at least contain the key "type". + registry (:obj:`Registry`): The registry to search the type from. + default_args (dict, optional): Default initialization arguments. + Returns: + obj: The constructed object. + """ + assert isinstance(cfg, dict) and "type" in cfg + assert isinstance(default_args, dict) or default_args is None + args = cfg.copy() + obj_type = args.pop("type") + if torchie.is_str(obj_type): + obj_cls = registry.get(obj_type) + if obj_cls is None: + raise KeyError( + "{} is not in the {} registry".format(obj_type, registry.name) + ) + elif inspect.isclass(obj_type): + obj_cls = obj_type + else: + raise TypeError( + "type must be a str or valid type, but got {}".format(type(obj_type)) + ) + if default_args is not None: + for name, value in default_args.items(): + args.setdefault(name, value) + + return obj_cls(**args) diff --git a/cv/3d_detection/centerpoint/pytorch/det3d/utils/utils.py b/cv/3d_detection/centerpoint/pytorch/det3d/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9bc0a035a8a79d39857066da2924c1551a5c1b54 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/det3d/utils/utils.py @@ -0,0 +1,42 @@ +import numpy as np +import torch + + +def example_to_device( + example, dtype=torch.float32, device=None, non_blocking=True +) -> dict: + device = device or torch.device("cuda:0") + example_torch = {} + float_names = ["voxels", "bev_map"] + for k, v in 
example.items(): + if k in ["anchors", "reg_targets", "reg_weights", "labels", "anchors_mask"]: + res = [] + for kk, vv in v.items(): + vv = [vvv.unsqueeze_(0) for vvv in vv] + res.append(torch.cat(vv, dim=0).cuda(device, non_blocking=non_blocking)) + example_torch[k] = res + elif k in [ + "voxels", + "bev_map", + "coordinates", + "num_points", + "points", + "num_voxels", + ]: + # slow when directly provide fp32 data with dtype=torch.half + example_torch[k] = v.cuda(device, non_blocking=non_blocking) + elif k == "calib": + calib = {} + for k1, v1 in v.items(): + calib[k1] = v1.cuda(device, non_blocking=non_blocking) + example_torch[k] = calib + else: + example_torch[k] = v + + return example_torch + + +def _worker_init_fn(worker_id): + time_seed = np.array(time.time(), dtype=np.int32) + np.random.seed(time_seed + worker_id) + print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0]) diff --git a/cv/3d_detection/centerpoint/pytorch/docs/DEVELOP.md b/cv/3d_detection/centerpoint/pytorch/docs/DEVELOP.md new file mode 100644 index 0000000000000000000000000000000000000000..84e89cd36a4700723e833d797f70a0b0681fde88 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/docs/DEVELOP.md @@ -0,0 +1,21 @@ +# Develop + +This document provide tutorials to develop CenterPoint. + +## New dataset + +TODO + +## New Task + +- If you interested in developing tracking algorithms based on our detection results, please refer to [NUSC](NUSC.md) and [WAYMO](WAYMO.md). For more advanced tasks like motion prediction, you may need to store the final bev feature map computed [here](https://github.com/tianweiy/CenterPoint/blob/1ecebf980f75cfe7f53cc52032b184192891c9b9/det3d/models/necks/rpn.py#L159). + +- You will also need to add files to [`det3d/datasets/pipelines/preprocess.py`](../det3d/datasets/pipelines/preprocess.py) to specify the data generation during training and training. 
+ +- You may also need to change the collate function in [collate.py](https://github.com/tianweiy/CenterPoint/blob/1ecebf980f75cfe7f53cc52032b184192891c9b9/det3d/torchie/parallel/collate.py#L91) and data_loading function in [trainer.py](https://github.com/tianweiy/CenterPoint/blob/1ecebf980f75cfe7f53cc52032b184192891c9b9/det3d/torchie/trainer/trainer.py#L34) + +## New Architecture + +Please add any 3D backbone in `det3d/models/backbones`, any 2D backbones in `det3d/models/necks`, and any two-stage refinement modules in `det3d/models/second_stage`. + +If you have any suggestions for improving this codebase for development, please open an issue or send us an email. \ No newline at end of file diff --git a/cv/3d_detection/centerpoint/pytorch/docs/GETTING_START.md b/cv/3d_detection/centerpoint/pytorch/docs/GETTING_START.md new file mode 100644 index 0000000000000000000000000000000000000000..b2a02a43ceef49c98c2aff1576499e38f85f4282 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/docs/GETTING_START.md @@ -0,0 +1,7 @@ +# Getting Started with CenterPoint + +## nuScenes +Please refer to [nuScenes](NUSC.md) for details. + +## Waymo +Please refer to [WAYMO](WAYMO.md) for details. \ No newline at end of file diff --git a/cv/3d_detection/centerpoint/pytorch/docs/INSTALL.md b/cv/3d_detection/centerpoint/pytorch/docs/INSTALL.md new file mode 100644 index 0000000000000000000000000000000000000000..ce23c0afa2e194d4db1ff5fd12df483184490495 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/docs/INSTALL.md @@ -0,0 +1,89 @@ +## Installation +Modified from [det3d](https://github.com/poodarchu/Det3D/tree/56402d4761a5b73acd23080f537599b0888cce07)'s original document. + +### Requirements + +- Linux +- Python 3.6+ +- PyTorch 1.1 or higher +- CUDA 10.0 or higher +- CMake 3.13.2 or higher +- [APEX](https://github.com/nvidia/apex) +- [spconv](https://github.com/traveller59/spconv/commit/73427720a539caf9a44ec58abe3af7aa9ddb8e39) + +#### Notes +- Both Spconv 1.x and 2.x work. 
+- Recent pytorch/spconv/cuda versions will be faster and consume less memory. +- If you have problems installing apex, you can change the apex syncbn to torch's native sync bn at https://github.com/tianweiy/CenterPoint/blob/3fd0b8745b77575cb9810035aafc76796613f942/det3d/torchie/apis/train.py#L268. + +We have tested the following versions of OS and software: + +- OS: Ubuntu 16.04/18.04 +- Python: 3.6.5/3.7.10 +- PyTorch: 1.1/1.9/1.10.1 +- spconv: 1.0/1.2.1/master +- CUDA: 10.0/11.1 + +### Basic Installation + +```bash +# basic python libraries +conda create --name centerpoint python=3.6 +conda activate centerpoint +conda install pytorch==1.1.0 torchvision==0.3.0 cudatoolkit=10.0 -c pytorch +git clone https://github.com/tianweiy/CenterPoint.git +cd CenterPoint +pip install -r requirements.txt + +# add CenterPoint to PYTHONPATH by adding the following line to ~/.bashrc (change the path accordingly) +export PYTHONPATH="${PYTHONPATH}:PATH_TO_CENTERPOINT" +``` + +### Advanced Installation + +#### nuScenes dev-kit + +```bash +git clone https://github.com/tianweiy/nuscenes-devkit + +# add the following line to ~/.bashrc and reactivate bash (remember to change the PATH_TO_NUSCENES_DEVKIT value) +export PYTHONPATH="${PYTHONPATH}:PATH_TO_NUSCENES_DEVKIT/python-sdk" +``` + +#### Cuda Extensions + +```bash +# set the cuda path (change the path to your own cuda location) +export PATH=/usr/local/cuda-10.0/bin:$PATH +export CUDA_PATH=/usr/local/cuda-10.0 +export CUDA_HOME=/usr/local/cuda-10.0 +export LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64:$LD_LIBRARY_PATH + +# Rotated NMS +cd ROOT_DIR/det3d/ops/iou3d_nms +python setup.py build_ext --inplace + +# Deformable Convolution (Optional and only works with old torch versions e.g.
1.1) +cd ROOT_DIR/det3d/ops/dcn +python setup.py build_ext --inplace +``` + +#### APEX (Optional) + +```bash +git clone https://github.com/NVIDIA/apex +cd apex +git checkout 5633f6 # recent commit doesn't build in our system +pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ +``` + +#### spconv +```bash +sudo apt-get install libboost-all-dev +git clone https://github.com/traveller59/spconv.git --recursive +cd spconv && git checkout 7342772 +python setup.py bdist_wheel +cd ./dist && pip install * +``` + +#### Check out [GETTING_START](GETTING_START.md) to prepare the data and play with all those pretrained models. diff --git a/cv/3d_detection/centerpoint/pytorch/docs/NOTICE b/cv/3d_detection/centerpoint/pytorch/docs/NOTICE new file mode 100644 index 0000000000000000000000000000000000000000..79545b8503dde462d81ad706fcfff2eee6c5ce5f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/docs/NOTICE @@ -0,0 +1,247 @@ +Portions of this software are derived from det3d(https://github.com/poodarchu/Det3D/tree/56402d4761a5b73acd23080f537599b0888cce07). + +============================================================================== +det3d licence +============================================================================== + +MIT License + +Copyright (c) 2019 朱本金 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +Portions of this software are derived from second. + +============================================================================== +second license +============================================================================== + +MIT License + +Copyright (c) 2018 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +Portions of this software are derived from CenterTrack. 
+ +============================================================================== +CenterTrack license +============================================================================== + +MIT License + +Copyright (c) 2020 Xingyi Zhou + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +Portions of this software are derived from CenterNet. + +MIT License + +============================================================================== +CenterNet license +============================================================================== + +Copyright (c) 2019 Xingyi Zhou +All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +Portions of this software are derived from nuscenes-devkit. + +============================================================================== +nuscenes-devkit licence +============================================================================== + +Copyright 2019 Aptiv + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Portions of this software are derived from mmdetection. 
+ +============================================================================== +mmdetection licence +============================================================================== + +Copyright 2018-2019 Open-MMLab. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Portions of this software are derived from mmcv. + +============================================================================== +mmcv licence +============================================================================== + +Copyright 2018-2020 Open-MMLab. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + + +Portions of this software are derived from PCDet. + +============================================================================== +PCDet licence +============================================================================== + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Portions of this software are derived from maskrcnn-benchmark. + +============================================================================== +maskrcnn-benchmark licence +============================================================================== + +MIT License + +Copyright (c) 2018 Facebook + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +Portions of this software are derived from pillar-od. 
+ +============================================================================== +pillar-od licence +============================================================================== + +MIT License + +Copyright (c) Massachusetts Institute of Technology and its affiliates. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/cv/3d_detection/centerpoint/pytorch/docs/NUSC.md b/cv/3d_detection/centerpoint/pytorch/docs/NUSC.md new file mode 100644 index 0000000000000000000000000000000000000000..8723f693eb6866fbb1fd28649ce634e54cf5e199 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/docs/NUSC.md @@ -0,0 +1,117 @@ +## Getting Started with CenterPoint on nuScenes +Modified from [det3d](https://github.com/poodarchu/Det3D/tree/56402d4761a5b73acd23080f537599b0888cce07)'s original document. 
+ +### Prepare data + +#### Download data and organise as follows + +``` +# For nuScenes Dataset +└── NUSCENES_DATASET_ROOT + ├── samples <-- key frames + ├── sweeps <-- frames without annotation + ├── maps <-- unused + ├── v1.0-trainval <-- metadata +``` + +Create a symlink to the dataset root +```bash +mkdir data && cd data +ln -s DATA_ROOT +mv DATA_ROOT nuScenes # rename to nuScenes +``` +Remember to change the DATA_ROOT to the actual path in your system. + + +#### Create data + +Data creation should be under the gpu environment. + +``` +# nuScenes +python tools/create_data.py nuscenes_data_prep --root_path=NUSCENES_TRAINVAL_DATASET_ROOT --version="v1.0-trainval" --nsweeps=10 +``` + +In the end, the data and info files should be organized as follows + +``` +# For nuScenes Dataset +└── CenterPoint + └── data + └── nuScenes + ├── samples <-- key frames + ├── sweeps <-- frames without annotation + ├── maps <-- unused + |── v1.0-trainval <-- metadata and annotations + |── infos_train_10sweeps_withvelo_filter_True.pkl <-- train annotations + |── infos_val_10sweeps_withvelo_filter_True.pkl <-- val annotations + |── dbinfos_train_10sweeps_withvelo.pkl <-- GT database info files + |── gt_database_10sweeps_withvelo <-- GT database +``` + +### Train & Evaluate in Command Line + +**Now we only support training and evaluation with gpu. Cpu only mode is not supported.** + +Use the following command to start a distributed training using 4 GPUs. 
The models and logs will be saved to ```work_dirs/CONFIG_NAME```. + +```bash +python -m torch.distributed.launch --nproc_per_node=4 ./tools/train.py CONFIG_PATH +``` + +For distributed testing with 4 GPUs, + +```bash +python -m torch.distributed.launch --nproc_per_node=4 ./tools/dist_test.py CONFIG_PATH --work_dir work_dirs/CONFIG_NAME --checkpoint work_dirs/CONFIG_NAME/latest.pth +``` + +To test with one GPU and measure the inference time, + +```bash +python ./tools/dist_test.py CONFIG_PATH --work_dir work_dirs/CONFIG_NAME --checkpoint work_dirs/CONFIG_NAME/latest.pth --speed_test +``` + +The pretrained models and configurations are in [MODEL ZOO](../configs/nusc/README.md). + +### Tracking + +You can find the detection files in the [MODEL ZOO](../configs/nusc/README.md). After downloading the detection files, you can simply run: + +```bash +# val set +python tools/nusc_tracking/pub_test.py --work_dir WORK_DIR_PATH --checkpoint DETECTION_PATH + +# test set +python tools/nusc_tracking/pub_test.py --work_dir WORK_DIR_PATH --checkpoint DETECTION_PATH --version v1.0-test --root data/nuScenes/v1.0-test +``` + +### Test Set + +Organize your dataset as follows: + +``` +# For nuScenes Dataset +└── CenterPoint + └── data + └── nuScenes + ├── samples <-- key frames + ├── sweeps <-- frames without annotation + ├── maps <-- unused + |── v1.0-trainval <-- metadata and annotations + |── infos_train_10sweeps_withvelo_filter_True.pkl <-- train annotations + |── infos_val_10sweeps_withvelo_filter_True.pkl <-- val annotations + |── dbinfos_train_10sweeps_withvelo.pkl <-- GT database info files + |── gt_database_10sweeps_withvelo <-- GT database + └── v1.0-test <-- main test folder + ├── samples <-- key frames + ├── sweeps <-- frames without annotation + ├── maps <-- unused + |── v1.0-test <-- metadata and annotations + |── infos_test_10sweeps_withvelo.pkl <-- test info +``` + +Download the ```centerpoint_voxel_1440_flip``` 
[here](https://mitprod-my.sharepoint.com/:f:/g/personal/tianweiy_mit_edu/EhgzjwV2EghOnHFKyRgSadoBr2kUo7yPu52N-I3dG3c5dA?e=EP9G6L), save it into ```work_dirs/nusc_0075_flip```, then run the following command in the main folder to get the detection predictions: + +```bash +python tools/dist_test.py configs/nusc/voxelnet/nusc_centerpoint_voxelnet_0075voxel_fix_bn_z_flip.py --work_dir work_dirs/nusc_centerpoint_voxelnet_dcn_0075voxel_flip_testset --checkpoint work_dirs/nusc_0075_flip/voxelnet_converted.pth --testset +``` diff --git a/cv/3d_detection/centerpoint/pytorch/docs/WAYMO.md b/cv/3d_detection/centerpoint/pytorch/docs/WAYMO.md new file mode 100644 index 0000000000000000000000000000000000000000..6b3ab3e9d28339b382bf112f768b832cf2864c4e --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/docs/WAYMO.md @@ -0,0 +1,132 @@ +## Getting Started with CenterPoint on Waymo + +### Prerequisite + +- Follow [INSTALL.md](INSTALL.md) to install all required libraries. +- TensorFlow +- Waymo-open-dataset devkit + +```bash +conda activate centerpoint +pip install waymo-open-dataset-tf-1-15-0==1.2.0 +``` + +### Prepare data + +#### Download data and organise as follows: + +``` +# For Waymo Dataset +└── WAYMO_DATASET_ROOT + ├── tfrecord_training + ├── tfrecord_validation + ├── tfrecord_testing +``` + +Convert the tfrecord data to pickle files. 
+ +```bash +# train set +CUDA_VISIBLE_DEVICES=-1 python det3d/datasets/waymo/waymo_converter.py --record_path 'WAYMO_DATASET_ROOT/tfrecord_training/*.tfrecord' --root_path 'WAYMO_DATASET_ROOT/train/' + +# validation set +CUDA_VISIBLE_DEVICES=-1 python det3d/datasets/waymo/waymo_converter.py --record_path 'WAYMO_DATASET_ROOT/tfrecord_validation/*.tfrecord' --root_path 'WAYMO_DATASET_ROOT/val/' + +# testing set +CUDA_VISIBLE_DEVICES=-1 python det3d/datasets/waymo/waymo_converter.py --record_path 'WAYMO_DATASET_ROOT/tfrecord_testing/*.tfrecord' --root_path 'WAYMO_DATASET_ROOT/test/' +``` + +Create a symlink to the dataset root +```bash +mkdir data && cd data +ln -s WAYMO_DATASET_ROOT Waymo +``` +Remember to change the WAYMO_DATASET_ROOT to the actual path in your system. + + +#### Create info files + +```bash +# One Sweep Infos +python tools/create_data.py waymo_data_prep --root_path=data/Waymo --split train --nsweeps=1 + +python tools/create_data.py waymo_data_prep --root_path=data/Waymo --split val --nsweeps=1 + +python tools/create_data.py waymo_data_prep --root_path=data/Waymo --split test --nsweeps=1 + +# Two Sweep Infos (for two sweep detection and tracking models) +python tools/create_data.py waymo_data_prep --root_path=data/Waymo --split train --nsweeps=2 + +python tools/create_data.py waymo_data_prep --root_path=data/Waymo --split val --nsweeps=2 + +python tools/create_data.py waymo_data_prep --root_path=data/Waymo --split test --nsweeps=2 +``` + +In the end, the data and info files should be organized as follows + +``` +└── CenterPoint + └── data + └── Waymo + ├── tfrecord_training + ├── tfrecord_validation + ├── train <-- all training frames and annotations + ├── val <-- all validation frames and annotations + ├── test <-- all testing frames and annotations + ├── infos_train_01sweeps_filter_zero_gt.pkl + ├── infos_train_02sweeps_filter_zero_gt.pkl + ├── infos_val_01sweeps_filter_zero_gt.pkl + ├── infos_val_02sweeps_filter_zero_gt.pkl + ├── 
infos_test_01sweeps_filter_zero_gt.pkl + ├── infos_test_02sweeps_filter_zero_gt.pkl +``` + +### Train & Evaluate in Command Line + +Use the following command to start distributed training using 4 GPUs. The models and logs will be saved to ```work_dirs/CONFIG_NAME```. + +```bash +python -m torch.distributed.launch --nproc_per_node=4 ./tools/train.py CONFIG_PATH +``` + +For distributed testing with 4 GPUs, + +```bash +python -m torch.distributed.launch --nproc_per_node=4 ./tools/dist_test.py CONFIG_PATH --work_dir work_dirs/CONFIG_NAME --checkpoint work_dirs/CONFIG_NAME/latest.pth +``` + +To test with one GPU and measure the inference time, + +```bash +python ./tools/dist_test.py CONFIG_PATH --work_dir work_dirs/CONFIG_NAME --checkpoint work_dirs/CONFIG_NAME/latest.pth --speed_test +``` + +This will generate a `my_preds.bin` file in the work_dir. You can create a submission to the Waymo server using the waymo-open-dataset code by following the instructions [here](https://github.com/waymo-research/waymo-open-dataset/blob/master/docs/quick_start.md). + +If you want to do local evaluation (e.g. for a subset), generate the gt prediction bin files using the script below and follow the Waymo instructions [here](https://github.com/waymo-research/waymo-open-dataset/blob/master/docs/quick_start.md). + +```bash +python det3d/datasets/waymo/waymo_common.py --info_path data/Waymo/infos_val_01sweeps_filter_zero_gt.pkl --result_path data/Waymo/ --gt +``` + +All pretrained models and configurations are in [MODEL ZOO](../configs/waymo/README.md). + +### Second-stage Training + +Our final model follows a two-stage training process. For example, to train the two-stage CenterPoint-Voxel model, you first need to train the one-stage model using [ONE_STAGE](../configs/waymo/voxelnet/waymo_centerpoint_voxelnet_3x.py) and then train the second-stage module using [TWO_STAGE](../configs/waymo/voxelnet/two_stage/waymo_centerpoint_voxelnet_two_stage_bev_5point_ft_6epoch_freeze.py). 
You can also contact us to access the pretrained models, see details [here](../configs/waymo/README.md). + +### Tracking + +Please refer to options in [test.py](../tools/waymo_tracking/test.py). The prediction file is an intermediate file generated using [dist_test.py](../tools/dist_test.py) that stores predictions in KITTI lidar format. + +### Visualization + +Please refer to [visual.py](../tools/visual.py). It will take a prediction file generated by [simple_inference_waymo.py](../tools/simple_inference_waymo.py) and visualize the point cloud and detections. + +### Test Set + +Add the ```--testset``` flag to the end. + +```bash +python ./tools/dist_test.py CONFIG_PATH --work_dir work_dirs/CONFIG_NAME --checkpoint work_dirs/CONFIG_NAME/latest.pth --testset +``` diff --git a/cv/3d_detection/centerpoint/pytorch/docs/demo.gif b/cv/3d_detection/centerpoint/pytorch/docs/demo.gif new file mode 100644 index 0000000000000000000000000000000000000000..f1d290fbe36693bbe181686c0ec8e5494472fd6f Binary files /dev/null and b/cv/3d_detection/centerpoint/pytorch/docs/demo.gif differ diff --git a/cv/3d_detection/centerpoint/pytorch/docs/teaser.png b/cv/3d_detection/centerpoint/pytorch/docs/teaser.png new file mode 100644 index 0000000000000000000000000000000000000000..9c7921110a372157e29d927b2c1417b75f284eab Binary files /dev/null and b/cv/3d_detection/centerpoint/pytorch/docs/teaser.png differ diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.coveragerc b/cv/3d_detection/centerpoint/pytorch/numba/.coveragerc new file mode 100644 index 0000000000000000000000000000000000000000..e9eda9644d98a903c0edd5942085aa42ba7a79d6 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.coveragerc @@ -0,0 +1,24 @@ +# configuration file used by run_coverage.py +[run] +branch = True +source = numba +concurrency = multiprocessing +parallel = True + +[report] + +omit = + */__main__.py + # Vendored packages + numba/misc/appdirs.py + numba/cloudpickle/__init__.py + 
numba/cloudpickle/cloudpickle.py + numba/cloudpickle/cloudpickle_fast.py + numba/cloudpickle/compat.py + numba/_version.py + +exclude_lines = + pragma: no cover + if __name__ == .__main__.: + +[html] diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.flake8 b/cv/3d_detection/centerpoint/pytorch/numba/.flake8 new file mode 100644 index 0000000000000000000000000000000000000000..9d77a1d5e382d48aa526442aa036933832b94a6d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.flake8 @@ -0,0 +1,263 @@ +[flake8] +ignore = + E20, # Extra space in brackets + E231,E241, # Multiple spaces around "," + E26, # Comments + E731, # Assigning lambda expression + E741, # Ambiguous variable names + W503, # line break before binary operator + W504, # line break after binary operator +max-line-length = 80 + +exclude = + __pycache__ + .git + *.pyc + *~ + *.o + *.so + *.cpp + *.c + *.h + __init__.py + # Ignore vendored files + numba/cloudpickle/* + # Grandfather in existing failing files. This list should shrink over time + numba/stencils/stencil.py + numba/core/transforms.py + numba/core/tracing.py + numba/core/withcontexts.py + numba/_version.py + numba/core/inline_closurecall.py + numba/core/ir_utils.py + numba/core/pylowering.py + numba/python_utils.py + numba/parfors/parfor.py + numba/misc/numba_entry.py + numba/stencils/stencilparfor.py + numba/core/ir.py + numba/core/generators.py + numba/misc/appdirs.py + numba/core/caching.py + numba/core/debuginfo.py + numba/core/annotations/pretty_annotate.py + numba/misc/dummyarray.py + numba/core/dataflow.py + numba/core/pythonapi.py + numba/core/decorators.py + numba/core/typeconv/rules.py + numba/core/typeconv/castgraph.py + numba/core/rewrites/registry.py + numba/core/rewrites/macros.py + numba/core/rewrites/static_binop.py + numba/core/rewrites/ir_print.py + numba/core/types/abstract.py + numba/core/types/misc.py + numba/core/types/npytypes.py + numba/core/types/common.py + numba/core/types/iterators.py + 
numba/core/types/scalars.py + numba/core/fastmathpass.py + numba/cpython/setobj.py + numba/core/options.py + numba/cpython/printimpl.py + numba/cpython/cmathimpl.py + numba/cpython/tupleobj.py + numba/cpython/mathimpl.py + numba/core/registry.py + numba/core/imputils.py + numba/cpython/builtins.py + numba/core/cpu.py + numba/misc/quicksort.py + numba/core/callconv.py + numba/cpython/randomimpl.py + numba/np/npyimpl.py + numba/cpython/slicing.py + numba/cpython/numbers.py + numba/cpython/listobj.py + numba/core/removerefctpass.py + numba/core/boxing.py + numba/misc/cffiimpl.py + numba/np/linalg.py + numba/cpython/rangeobj.py + numba/np/npyfuncs.py + numba/cpython/iterators.py + numba/core/codegen.py + numba/np/polynomial.py + numba/misc/mergesort.py + numba/core/base.py + numba/np/npdatetime.py + numba/pycc/cc.py + numba/pycc/compiler.py + numba/pycc/llvm_types.py + numba/pycc/platform.py + numba/pycc/decorators.py + numba/core/runtime/nrtdynmod.py + numba/core/runtime/context.py + numba/tests/test_support.py + numba/tests/test_llvm_version_check.py + numba/tests/test_builtins.py + numba/tests/test_jitmethod.py + numba/tests/test_inlining.py + numba/tests/test_array_manipulation.py + numba/tests/test_dummyarray.py + numba/tests/test_smart_array.py + numba/tests/test_linalg.py + numba/tests/test_threadsafety.py + numba/tests/test_utils.py + numba/tests/cfunc_cache_usecases.py + numba/tests/enum_usecases.py + numba/tests/test_func_lifetime.py + numba/tests/test_typeinfer.py + numba/tests/test_return_values.py + numba/tests/test_npdatetime.py + numba/tests/test_fancy_indexing.py + numba/tests/support.py + numba/tests/test_print.py + numba/tests/test_debug.py + numba/tests/test_interproc.py + numba/tests/test_typeconv.py + numba/tests/test_tracing.py + numba/tests/usecases.py + numba/tests/test_vectorization_type_inference.py + numba/tests/matmul_usecase.py + numba/tests/complex_usecases.py + numba/tests/test_array_exprs.py + numba/tests/test_polynomial.py + 
numba/tests/test_wrapper.py + numba/tests/test_obj_lifetime.py + numba/tests/test_intwidth.py + numba/tests/test_remove_dead.py + numba/tests/serialize_usecases.py + numba/tests/test_del.py + numba/tests/test_gil.py + numba/tests/cffi_usecases.py + numba/tests/test_slices.py + numba/tests/test_mandelbrot.py + numba/tests/compile_with_pycc.py + numba/tests/test_deprecations.py + numba/tests/test_looplifting.py + numba/tests/test_storeslice.py + numba/tests/recursion_usecases.py + numba/tests/dummy_module.py + numba/tests/test_operators.py + numba/tests/test_comprehension.py + numba/tests/ctypes_usecases.py + numba/tests/test_locals.py + numba/tests/test_dicts.py + numba/tests/test_optional.py + numba/tests/test_mathlib.py + numba/tests/test_numberctor.py + numba/tests/test_globals.py + numba/tests/test_typingerror.py + numba/tests/test_copy_propagate.py + numba/tests/test_ctypes.py + numba/tests/test_typeof.py + numba/tests/test_usecases.py + numba/tests/test_auto_constants.py + numba/tests/test_cffi.py + numba/tests/test_sort.py + numba/tests/test_cfunc.py + numba/tests/test_conversion.py + numba/tests/test_indexing.py + numba/tests/test_pycc.py + numba/tests/annotation_usecases.py + numba/tests/test_extended_arg.py + numba/tests/test_alignment.py + numba/tests/test_multi3.py + numba/tests/test_overlap.py + numba/tests/test_array_attr.py + numba/tests/test_array_methods.py + numba/tests/test_enums.py + numba/tests/test_profiler.py + numba/tests/test_numpyadapt.py + numba/tests/test_stencils.py + numba/tests/cache_usecases.py + numba/tests/true_div_usecase.py + numba/tests/test_dataflow.py + numba/tests/test_tuples.py + numba/tests/test_svml.py + numba/tests/test_array_iterators.py + numba/tests/test_buffer_protocol.py + numba/tests/test_casting.py + numba/tests/test_lists.py + numba/tests/test_array_analysis.py + numba/tests/test_serialize.py + numba/tests/test_iteration.py + numba/tests/test_recarray_usecases.py + numba/tests/test_target_overloadselector.py + 
numba/tests/test_compile_cache.py + numba/tests/test_array_reductions.py + numba/tests/test_dyn_func.py + numba/tests/test_unpack_sequence.py + numba/tests/test_cgutils.py + numba/tests/test_complex.py + numba/tests/test_hashing.py + numba/tests/test_sys_stdin_assignment.py + numba/tests/test_ufuncs.py + numba/tests/pdlike_usecase.py + numba/tests/test_range.py + numba/tests/test_nrt_refct.py + numba/misc/timsort.py + numba/tests/test_nested_calls.py + numba/tests/test_chained_assign.py + numba/tests/test_withlifting.py + numba/tests/test_parfors.py + numba/tests/test_sets.py + numba/tests/test_dyn_array.py + numba/tests/test_objects.py + numba/tests/test_random.py + numba/tests/test_nan.py + numba/tests/pycc_distutils_usecase/source_module.py + numba/tests/npyufunc/test_ufuncbuilding.py + numba/tests/npyufunc/test_errors.py + numba/tests/npyufunc/test_vectorize_decor.py + numba/tests/npyufunc/test_parallel_ufunc_issues.py + numba/tests/npyufunc/test_parallel_env_variable.py + numba/tests/npyufunc/test_gufunc.py + numba/core/typing/cmathdecl.py + numba/core/typing/bufproto.py + numba/core/typing/mathdecl.py + numba/core/typing/listdecl.py + numba/core/typing/builtins.py + numba/core/typing/randomdecl.py + numba/core/typing/setdecl.py + numba/core/typing/npydecl.py + numba/core/typing/arraydecl.py + numba/core/typing/collections.py + numba/core/typing/ctypes_utils.py + numba/core/typing/enumdecl.py + numba/core/typing/cffi_utils.py + numba/core/typing/npdatetime.py + numba/core/annotations/type_annotations.py + numba/testing/ddt.py + numba/testing/loader.py + numba/testing/notebook.py + numba/testing/main.py + numba/np/unsafe/ndarray.py + numba/np/ufunc/deviceufunc.py + numba/np/ufunc/sigparse.py + numba/parfors/parfor_lowering.py + numba/np/ufunc/array_exprs.py + numba/np/ufunc/decorators.py + numba/core/datamodel/models.py + numba/core/datamodel/packer.py + numba/core/datamodel/testing.py + numba/core/datamodel/manager.py + +per-file-ignores = + # Ignore star 
imports, unused imports, and "may be defined by star imports" + # errors in device_init because its purpose is to bring together a lot of + # the public API to be star-imported in numba.cuda.__init__ + numba/cuda/device_init.py:F401,F403,F405 + # libdevice.py is an autogenerated file containing stubs for all the device + # functions. Some of the lines in docstrings are a little over-long, as they + # contain the URLs of the reference pages in the online libdevice + # documentation. + numba/cuda/libdevice.py:E501 + # Ignore too-long lines in the CUDA doc examples, prioritising readability + # in the docs over line length in the example source (especially given that + # the test code is already indented by 8 spaces) + numba/cuda/tests/doc_examples/test_random.py:E501 + numba/cuda/tests/doc_examples/test_cg.py:E501 + numba/cuda/tests/doc_examples/test_matmul.py:E501 diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.gitattributes b/cv/3d_detection/centerpoint/pytorch/numba/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..972ba2b7f4f738094bfec52eae9eddd8c8a92995 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.gitattributes @@ -0,0 +1 @@ +numba/_version.py export-subst diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.github/CODEOWNERS b/cv/3d_detection/centerpoint/pytorch/numba/.github/CODEOWNERS new file mode 100644 index 0000000000000000000000000000000000000000..a9d8b42652c484e0cbe78885f178324e6847ead0 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.github/CODEOWNERS @@ -0,0 +1,98 @@ +# Numba's codeowners file is dual purpose, it: +# +# 1. Provides information to github about who should be requested to review a PR +# 2. Provides contributors/czars general information about who to contact +# first about various parts of the code base. 
A lot of concepts in Numba are +# necessarily spread throughout the code base; consequently, some of the +# "code ownership"/first contact is concept based as opposed to file/directory +# based. +# +# ------------------------------------------------------------------------------ +# Information for GitHub +# ------------------------------------------------------------------------------ +# These people are the default "owners" for everything in the repo unless a +# later match is made; they will automatically be requested to review PRs. +* @sklam @stuartarchibald @esc + +# Owners of specific parts of the code, will be requested to review if a PR +# touches code in the matched pattern +/numba/cuda/ @gmarkall +/numba/parfors/ @DrTodd13 +/numba/stencils/ @DrTodd13 + +# ------------------------------------------------------------------------------ +# Information for contributors +# ------------------------------------------------------------------------------ +# This section provides a rough list of who to contact first for help with +# various parts/concepts in the code base; first contact does not imply +# ownership! 
+# +# Parts of the code base: +# +# * Parfors/Parallel Accelerator (@DrTodd13) +# - Array Analysis (@DrTodd13) +# - Parfors transforms (@DrTodd13) +# * Stencils (@DrTodd13) +# * Experimental: +# - Jitclasses (@sklam) +# - StructRef (@sklam) +# * Typed containers: +# - Typed.List (@esc) +# - Typed.Dict (@sklam) +# * Documentation (Needs first contact/owner) +# * NumPy (Needs first contact/owner) +# - ufuncs (Needs first contact/owner) +# - linalg (@stuartarchibald) +# - Implementation of specific functions (Needs first contact/owner) +# - Parallel backends/threading layers (@stuartarchibald) +# * CPython implementation (Needs first contact/owner) +# * Extension API (Needs first contact/owner) +# * AOT (Needs first contact/owner) +# * Compiler: +# - Type inference (@sklam) +# - Bytecode analysis/CFA/DFA (@sklam) +# - Compiler Pipeline infrastructure (@stuartarchibald) +# - Compiler passes: +# - Rewrites (Needs first contact/owner) +# - Branch pruning (@stuartarchibald) +# - Literal unroll (@stuartarchibald) +# - Rewrite Semantic Constants (@stuartarchibald) +# - MakeFunction To Jit function (@stuartarchibald) +# - Overload and function inlining (@stuartarchibald) +# - With Lifting (@sklam) +# - Exception handling (@sklam) +# - Literally (@sklam) +# - SSA (@sklam) +# - lowering.py, codegen.py (@sklam) +# - Datamodels/call conventions (@sklam) +# - Inlining in general (@stuartarchibald) +# +# Additional Concepts: +# +# * Reference counting and NRT (@sklam) +# * Testing (Needs first contact/owner) +# * CI: +# - public CI (azure) (Needs first contact/owner) +# - Numba build farm (@esc) +# * Integration testing (https://github.com/numba/numba-integration-testing) +# (@esc) +# * ASV profiling (@esc) +# * Type Annotations (@luk-f-a and @EPronovost) +# * Ufunc/GUfunc (Needs first contact/owner) +# * Profiling (Needs first contact/owner (and code!)) +# * Debugging: +# - DWARF (@sklam) +# - gdb support (@stuartarchibald) +# * Hardware targets: +# - The CUDA target (@gmarkall) 
+# - The ROCm target (@stuartarchibald) +# - ARM* (@stuartarchibald) +# - POWER (Needs first contact/owner) +# - X86* (Needs first contact/owner) +# * OS: +# - Linux (@stuartarchibald) +# - OSX +# - Windows +# - BSD (@stuartarchibald) +# +# Anything not covered by someone else... ping @sklam and @stuartarchibald diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/Bug_report.md b/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/Bug_report.md new file mode 100644 index 0000000000000000000000000000000000000000..db91bced743433beb4058f82796666c9123cb009 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/Bug_report.md @@ -0,0 +1,34 @@ +--- +name: Bug Report +about: Report a bug. Not for asking general questions - see below. + +--- + + + +## Reporting a bug + + + +- [ ] I have tried using the latest released version of Numba (most recent is + visible in the change log (https://github.com/numba/numba/blob/main/CHANGE_LOG)). +- [ ] I have included a self-contained code sample to reproduce the problem. + i.e. it's possible to run as 'python bug.py'. + + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/Feature_request.md b/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/Feature_request.md new file mode 100644 index 0000000000000000000000000000000000000000..398c277a958f64624585fb7b681ad183f2c48b54 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/Feature_request.md @@ -0,0 +1,23 @@ +--- +name: Feature Request +about: Tell us about something in the Python language/NumPy you'd like Numba to support. Not for asking general questions - see below. 
+ +--- + + + +## Feature request + + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/config.yml b/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..08310d750c6fadc78ffc7c4ae2bc259dfe0545d1 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,11 @@ +blank_issues_enabled: false +contact_links: + - name: General Question + url: https://numba.discourse.group/c/numba/community-support/ + about: "If you have a general question (not a bug report or feature request) then please ask on Numba's discourse instance." + - name: Quick Question/Just want to say Hi! + url: https://gitter.im/numba/numba + about: "If you have a quick question or want to chat with users/developers in real time then please use gitter.im/numba/numba" + - name: Discuss an involved feature + url: https://numba.discourse.group/c/numba/development/ + about: "If you would like to suggest a more involved feature like *Can a new compiler pass be added to do X* then please start a discussion on Numba's discourse instance." diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/first_rc_checklist.md b/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/first_rc_checklist.md new file mode 100644 index 0000000000000000000000000000000000000000..be572dfec711300208d481057ecba0aebf43a3c5 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/first_rc_checklist.md @@ -0,0 +1,41 @@ +--- +name: First Release Candidate Checklist (maintainer only) +about: Checklist template for the first release of every series +title: Numba X.Y.Zrc1 Checklist (FIXME) +labels: task + +--- + + +## Numba X.Y.Z + +* [ ] Merge to main. + - [ ] "remaining Pull-Requests from milestone". +* [ ] Check Numba's version support table documentation. Update via PR if + needed. 
+* [ ] Review deprecation schedule and notices. Make PRs if need be. +* [ ] Merge change log changes. + - [ ] "PR with changelog entries". +* [ ] Create X.Y release branch. +* [ ] Dependency version pinning on release branch + * [ ] Pin llvmlite to `>=0.A.0rc1,<0.A+1.0`. + * [ ] Pin NumPy if needed + * [ ] Pin TBB if needed +* [ ] Annotated tag X.Y.Zrc1 on release branch (no `v` prefix). +* [ ] Build and upload conda packages on buildfarm (check "upload"). +* [ ] Build wheels and sdist on the buildfarm (check "upload"). +* [ ] Verify packages uploaded to Anaconda Cloud and move to `numba/label/main`. +* [ ] Upload wheels and sdist to PyPI (upload from `ci_artifacts`). +* [ ] Verify wheels for all platforms arrived on PyPi. +* [ ] Initialize and verify ReadTheDocs build. +* [ ] Send RC announcement email / post announcement to discourse group. +* [ ] Post link to Twitter. + +### Post Release: + +* [ ] Clean up `ci_artifacts` by moving files to sub-directories +* [ ] Tag X.Y+1.0dev0 to start new development cycle on `main`. +* [ ] Update llvmlite dependency spec to match next version via PR to `main`. +* [ ] Update release checklist template with any additional bullet points that + may have arisen during the release. +* [ ] Close milestone (and then close this release issue). 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/sub_rc_checklist.md b/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/sub_rc_checklist.md new file mode 100644 index 0000000000000000000000000000000000000000..5874922a612c6f2828bbd66a52db2f012aff77af --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.github/ISSUE_TEMPLATE/sub_rc_checklist.md @@ -0,0 +1,37 @@ +--- +name: Subsequent Release Candidate Checklist (maintainer only) +about: Checklist template for all subsequent releases (RC 2-N, FINAL and PATCH) of every series +title: Numba X.Y.Zrc1 Checklist (FIXME) +labels: task + +--- + + +## numba X.Y.Z + +* [ ] Cherry-pick items from the X.Y.Z milestone into a PR. +* [ ] Approve change log modifications and cherry-pick. +* [ ] Merge change log modifications and cherry-picks to X.Y release branch. + * [ ] https://github.com/numba/numba/pull/XXXX +* [ ] Review, merge and check execution of release notebook. (FINAL ONLY) +* [ ] Annotated tag X.Y.Z on release branch (no `v` prefix). +* [ ] Build and upload conda packages on buildfarm (check `upload`). +* [ ] Build wheels and sdist on the buildfarm (check "upload"). +* [ ] Verify packages uploaded to Anaconda Cloud and move to + `numba/label/main`. +* [ ] Upload wheels and sdist to PyPI (upload from `ci_artifacts`). +* [ ] Verify wheels for all platforms arrived on PyPi. +* [ ] Verify ReadTheDocs build. +* [ ] Send RC/FINAL announcement email / post announcement to discourse group. +* [ ] Post link to Twitter. +* [ ] Post link to python-announce-list@python.org. + +### Post release + +* [ ] Snapshot Build Farm config +* [ ] Clean up `ci_artifacts` by moving files to subdirectories +* [ ] Update release checklist template with any additional bullet points that + may have arisen during the release. +* [ ] Ping Anaconda Distro team to trigger a build for `defaults` (FINAL ONLY). +* [ ] Create a release on Github at https://github.com/numba/numba/releases (FINAL ONLY). 
+* [ ] Close milestone (and then close this release issue). diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.github/PULL_REQUEST_TEMPLATE.md b/cv/3d_detection/centerpoint/pytorch/numba/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000000000000000000000000000000000..18d4b105f387d22f2f284e7fbc3808f539a6396b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,39 @@ + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.github/workflows/stale.yml b/cv/3d_detection/centerpoint/pytorch/numba/.github/workflows/stale.yml new file mode 100644 index 0000000000000000000000000000000000000000..6415ada9883f66767432bf61bef5438e5b2ecf98 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.github/workflows/stale.yml @@ -0,0 +1,20 @@ +name: 'Mark stale issues' +on: + schedule: + - cron: '30 1 * * *' + +jobs: + stale: + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v3 + with: + stale-issue-message: > + This issue is marked as stale as it has had no activity in the past + 30 days. Please close this issue if no further response or action is + needed. Otherwise, please respond with any updates and confirm that + this issue still needs to be addressed. 
+ stale-issue-label: 'stale' + any-of-labels: 'question,needtriage,more info needed' + days-before-issue-stale: 30 + days-before-issue-close: -1 diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.gitignore b/cv/3d_detection/centerpoint/pytorch/numba/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f4a687d88747de18c0949074f90a80b1cccb7383 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.gitignore @@ -0,0 +1,28 @@ +*.pyc +*.o +*.so +*.dylib +*.pyd +*.pdb +*.egg-info +*.sw[po] +*.out +*.ll +.coverage +.nfs* +tags +MANIFEST + +build/ +docs/_build/ +docs/gh-pages/ +dist/ +htmlcov/ +.idea/ +.vscode/ +.ycm_extra_conf.py +.mypy_cache/ +.ipynb_checkpoints/ +__pycache__/ + +docs/source/developer/autogen* diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.pre-commit-config.yaml b/cv/3d_detection/centerpoint/pytorch/numba/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf2fb6423c54a817fbfbc372a0c6e1cd6ca44c09 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.pre-commit-config.yaml @@ -0,0 +1,5 @@ +repos: +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.7.8 + hooks: + - id: flake8 diff --git a/cv/3d_detection/centerpoint/pytorch/numba/.readthedocs.yml b/cv/3d_detection/centerpoint/pytorch/numba/.readthedocs.yml new file mode 100644 index 0000000000000000000000000000000000000000..f74302f7e900caff459b0c1c859bcb27a03a291a --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/.readthedocs.yml @@ -0,0 +1,15 @@ +version: 2 +build: + os: ubuntu-20.04 + tools: + python: mambaforge-4.10 +sphinx: + configuration: docs/source/conf.py +python: + install: + - method: setuptools + path: . 
+conda: + environment: docs/environment.yml +formats: +- pdf diff --git a/cv/3d_detection/centerpoint/pytorch/numba/CHANGE_LOG b/cv/3d_detection/centerpoint/pytorch/numba/CHANGE_LOG new file mode 100644 index 0000000000000000000000000000000000000000..e94fa66734855d060da3db103248c6dfac46b5b9 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/CHANGE_LOG @@ -0,0 +1,5717 @@ +Version 0.56.4 (3 November, 2022) +--------------------------------- + +This is a bugfix release to fix a regression in the CUDA target in relation to +the ``.view()`` method on CUDA device arrays that is present when using NumPy +version 1.23.0 or later. + +Pull-Requests: + +* PR `#8537 `_: Make ol_compatible_view accessible on all targets (`gmarkall `_) +* PR `#8552 `_: Update version support table for 0.56.4. (`stuartarchibald `_) +* PR `#8553 `_: Update CHANGE_LOG for 0.56.4 (`stuartarchibald `_) +* PR `#8570 `_: Release 0.56 branch: Fix overloads with ``target="generic"`` for CUDA (`gmarkall `_) +* PR `#8571 `_: Additional update to CHANGE_LOG for 0.56.4 (`stuartarchibald `_) + +Authors: + +* `gmarkall `_ +* `stuartarchibald `_ + +Version 0.56.3 (13 October, 2022) +--------------------------------- + +This is a bugfix release to remove the version restriction applied to the +``setuptools`` package and to fix a bug in the CUDA target in relation to +copying zero length device arrays to zero length host arrays. + +Pull-Requests: + +* PR `#8475 `_: Remove setuptools version pin (`gmarkall `_) +* PR `#8482 `_: Fix #8477: Allow copies with different strides for 0-length data (`gmarkall `_) +* PR `#8486 `_: Restrict the TBB development package to supported version in Azure. 
(`stuartarchibald `_) +* PR `#8503 `_: Update version support table for 0.56.3 (`stuartarchibald `_) +* PR `#8504 `_: Update CHANGE_LOG for 0.56.3 (`stuartarchibald `_) + +Authors: + +* `gmarkall `_ +* `stuartarchibald `_ + +Version 0.56.2 (1 September, 2022) +---------------------------------- + +This is a bugfix release that supports NumPy 1.23 and fixes CUDA function +caching. + +Pull-Requests: + +* PR `#8239 `_: Add decorator to run a test in a subprocess (`stuartarchibald `_) +* PR `#8276 `_: Move Azure to use macos-11 (`stuartarchibald `_) +* PR `#8310 `_: CUDA: Fix Issue #8309 - atomics don't work on complex components (`Graham Markall `_) +* PR `#8342 `_: Upgrade to ubuntu-20.04 for azure pipeline CI (`jamesobutler `_) +* PR `#8356 `_: Update setup.py, buildscripts, CI and docs to require setuptools<60 (`stuartarchibald `_) +* PR `#8374 `_: Don't pickle LLVM IR for CUDA code libraries (`Graham Markall `_) +* PR `#8377 `_: Add support for NumPy 1.23 (`stuartarchibald `_) +* PR `#8384 `_: Move strace() check into tests that actually need it (`stuartarchibald `_) +* PR `#8386 `_: Fix the docs for numba.get_thread_id (`stuartarchibald `_) +* PR `#8407 `_: Pin NumPy version to 1.18-1.24 (`Andre Masella `_) +* PR `#8411 `_: update version support table for 0.56.1 (`esc `_) +* PR `#8412 `_: Create changelog for 0.56.1 (`Andre Masella `_) +* PR `#8413 `_: Fix Azure CI for NumPy 1.23 and use conda-forge scipy (`Siu Kwan Lam `_) +* PR `#8414 `_: Hotfix for 0.56.2 (`Siu Kwan Lam `_) + +Version 0.56.1 (1 September, 2022) +---------------------------------- + +The release was skipped due to issues during the release process. + +Version 0.56.0 (25 July, 2022) +------------------------------ + +This release continues to add new features, bug fixes and stability improvements +to Numba. Please note that this will be the last release that has support for +Python 3.7 as the next release series (Numba 0.57) will support Python 3.11! 
+Also note that this will be the last release to support linux-32 packages +produced by the Numba team. + +Python language support enhancements: + +* Previously missing support for large, in-line dictionaries and internal calls + to functions with large numbers of keyword arguments in Python 3.10 has been + added. +* ``operator.mul`` now works for ``list`` s. +* Literal slices, e.g. ``slice(1, 10, 2)`` can be returned from ``nopython`` + mode functions. +* The ``len`` function now works on ``dict_keys``, ``dict_values`` and + ``dict_items`` . +* Numba's ``set`` implementation now supports reference counted items e.g. + strings. + +Numba specific feature enhancements: + +* The experimental ``jitclass`` feature gains support for a large number of + ``builtin`` methods e.g. declaring ``__hash__`` or ``__getitem__`` for a + ``jitclass`` type. +* It's now possible to use ``@vectorize`` on an already ``@jit`` family + decorated function. +* Name mangling has been updated to emit compiled function names that exactly + match the function name in Python. This means debuggers, like GDB, can be set + to break directly on Python function names. +* A GDB "pretty printing" support module has been added, when loaded into GDB + Numba's internal representations of Python/NumPy types are rendered inside GDB + as they would be in Python. +* An experimental option is added to the ``@jit`` family decorators to entirely + turn off LLVM's optimisation passes for a given function (see + ``_dbg_optnone`` kwarg in the ``@jit`` decorator family). +* A new environment variable is added ``NUMBA_EXTEND_VARIABLE_LIFETIMES``, which + if set will extend the lifetime of variables to the end of their basic block, + this is to permit a debugging experience in GDB similar to that found in compiled + C/C++/Fortran code. 
+ +NumPy features/enhancements: + +* Initial support for passing, using and returning ``numpy.random.Generator`` + instances has been added, this currently includes support for the ``random`` + distribution. +* The broadcasting functions ``np.broadcast_shapes`` and ``np.broadcast_arrays`` + are now supported. +* The ``min`` and ``max`` functions now work with ``np.timedelta64`` and + ``np.datetime64`` types. +* Sorting multi-dimensional arrays along the last axis is now supported in + ``np.sort()``. +* The ``np.clip`` function is updated to accept NumPy arrays for the ``a_min`` + and ``a_max`` arguments. +* The NumPy allocation routines (``np.empty`` , ``np.ones`` etc.) support shape + arguments specified using members of ``enum.IntEnum`` s. +* The function ``np.random.noncentral_chisquare`` is now supported. +* The performance of functions ``np.full`` and ``np.ones`` has been improved. + +Parallel Accelerator enhancements: + +* The ``parallel=True`` functionality is enhanced through the addition of the + functions ``numba.set_parallel_chunksize`` and + ``numba.get_parallel_chunksize`` to permit a more fine grained scheduling of + work defined in a parallel region. There is also support for adjusting the + ``chunksize`` via a context manager. +* The ``ID`` of a thread is now defined to be predictable and within a known + range, it is available through calling the function ``numba.get_thread_id``. +* The performance of ``@stencil`` s has been improved in both serial and + parallel execution. + +CUDA enhancements: + +* New functionality: + + * Self-recursive device functions. + * Vector type support (``float4``, ``int2``, etc.). + * Shared / local arrays of extension types can now be created. + * Support for linking CUDA C / C++ device functions into Python kernels. + * PTX generation for Compute Capabilities 8.6 and 8.7 - e.g. RTX A series, + GTX 3000 series. + * Comparison operations for ``float16`` types. 
+ +* Performance improvements: + + * Context queries are no longer made during launch configuration. + * Launch configurations are now LRU cached. + * On-disk caching of CUDA kernels is now supported. + +* Documentation: many new examples added. + +Docs: + +* Numba now has an official "mission statement". +* There's now a "version support table" in the documentation to act as an easy + to use, single reference point, for looking up information about Numba + releases and their required/supported dependencies. + +General Enhancements: + +* Numba imports more quickly in environments with large numbers of packages as + it now uses ``importlib-metadata`` for querying other packages. +* Emission of chrome tracing output is now supported for the internal + compilation event handling system. +* This release is tested and known to work when using the + `Pyston `_ Python interpreter. + +Pull-Requests: + +* PR `#5209 `_: Use importlib to load numba extensions (`Stepan Rakitin `_ `Graham Markall `_ `stuartarchibald `_) +* PR `#5877 `_: Jitclass builtin methods (`Ethan Pronovost `_ `Graham Markall `_) +* PR `#6490 `_: Stencil output allocated with np.empty now and new code to initialize the borders. (`Todd A. Anderson `_) +* PR `#7005 `_: Make `numpy.searchsorted` match NumPy when first argument is unsorted (`Brandon T. Willard `_) +* PR `#7363 `_: Update cuda.local.array to clarify "simple constant expression" (e.g. no NumPy ints) (`Sterling Baird `_) +* PR `#7364 `_: Removes an instance of signed integer overflow undefined behaviour. 
(`Tobias Sargeant `_) +* PR `#7537 `_: Add chrome tracing (`Hadia Ahmed `_ `Siu Kwan Lam `_) +* PR `#7556 `_: Testhound/fp16 comparison (`Michael Collison `_ `Graham Markall `_) +* PR `#7586 `_: Support for len on dict.keys, dict.values, and dict.items (`Nick Riasanovsky `_) +* PR `#7617 `_: Numba gdb-python extension for printing (`stuartarchibald `_) +* PR `#7619 `_: CUDA: Fix linking with PTX when compiling lazily (`Graham Markall `_) +* PR `#7621 `_: Add support for linking CUDA C / C++ with `@cuda.jit` kernels (`Graham Markall `_) +* PR `#7625 `_: Combined parfor chunking and caching PRs. (`stuartarchibald `_ `Todd A. Anderson `_ `Siu Kwan Lam `_) +* PR `#7651 `_: DOC: pypi and conda-forge badges (`Ray Bell `_) +* PR `#7660 `_: Add support for np.broadcast_arrays (`Guilherme Leobas `_) +* PR `#7664 `_: Flatten mangling dicts into a single dict (`Graham Markall `_) +* PR `#7680 `_: CUDA Docs: include example calling slow matmul (`Graham Markall `_) +* PR `#7682 `_: performance improvements to np.full and np.ones (`Rishi Kulkarni `_) +* PR `#7684 `_: DOC: remove incorrect warning in np.random reference (`Rishi Kulkarni `_) +* PR `#7685 `_: Don't convert setitems that have dimension mismatches to parfors. (`Todd A. Anderson `_) +* PR `#7690 `_: Implemented np.random.noncentral_chisquare for all size arguments (`Rishi Kulkarni `_) +* PR `#7695 `_: `IntEnumMember` support for `np.empty`, `np.zeros`, and `np.ones` (`Benjamin Graham `_) +* PR `#7699 `_: CUDA: Provide helpful error if the return type is missing for `declare_device` (`Graham Markall `_) +* PR `#7700 `_: Support for scalar arguments in Np.ascontiguousarray (`Dhruv Patel `_) +* PR `#7703 `_: Ignore unsupported types in `ShapeEquivSet._getnames()` (`Benjamin Graham `_) +* PR `#7704 `_: Move the type annotation pass to post legalization. 
(`stuartarchibald `_) +* PR `#7709 `_: CUDA: Fixes missing type annotation pass following #7704 (`stuartarchibald `_) +* PR `#7712 `_: Fixing issue 7693 (`stuartarchibald `_ `Graham Markall `_ `luk-f-a `_) +* PR `#7714 `_: Support for boxing SliceLiteral type (`Nick Riasanovsky `_) +* PR `#7718 `_: Bump llvmlite dependency to 0.39.0dev0 for Numba 0.56.0dev0 (`stuartarchibald `_) +* PR `#7724 `_: Update URLs in error messages to refer to RTD docs. (`stuartarchibald `_) +* PR `#7728 `_: Document that AOT-compiled functions do not check arg types (`Graham Markall `_) +* PR `#7729 `_: Handle Omitted/OmittedArgDataModel in DI generation. (`stuartarchibald `_) +* PR `#7732 `_: update release checklist following 0.55.0 RC1 (`esc `_) +* PR `#7736 `_: Update CHANGE_LOG for 0.55.0 final. (`stuartarchibald `_) +* PR `#7740 `_: CUDA Python 11.6 support (`Graham Markall `_) +* PR `#7744 `_: Fix issues with locating/parsing source during DebugInfo emission. (`stuartarchibald `_) +* PR `#7745 `_: Fix the release year for Numba 0.55 change log entry. (`stuartarchibald `_) +* PR `#7748 `_: Fix #7713: Ensure _prng_random_hash return has correct bitwidth (`Graham Markall `_) +* PR `#7749 `_: Refactor threading layer priority tests to not use stdout/stderr (`stuartarchibald `_) +* PR `#7752 `_: Fix #7751: Use original filename for array exprs (`Graham Markall `_) +* PR `#7755 `_: CUDA: Deprecate support for CC < 5.3 and CTK < 10.2 (`Graham Markall `_) +* PR `#7763 `_: Update Read the Docs configuration (automatic) (`readthedocs-assistant `_) +* PR `#7764 `_: Add dbg_optnone and dbg_extend_lifetimes flags (`Siu Kwan Lam `_) +* PR `#7771 `_: Move function unique ID to abi-tags (`stuartarchibald `_ `Siu Kwan Lam `_) +* PR `#7772 `_: CUDA: Add Support to Creating `StructModel` Array (`Michael Wang `_) +* PR `#7776 `_: Updates coverage.py config (`stuartarchibald `_) +* PR `#7777 `_: Remove reference existing issue from GH template. 
(`stuartarchibald `_) +* PR `#7778 `_: Remove long deprecated flags from the CLI. (`stuartarchibald `_) +* PR `#7780 `_: Fix sets with reference counted items (`Benjamin Graham `_) +* PR `#7782 `_: adding reminder to check on deprecations (`esc `_) +* PR `#7783 `_: remove upper limit on Python version (`esc `_) +* PR `#7786 `_: Remove dependency on intel-openmp for OSX (`stuartarchibald `_) +* PR `#7788 `_: Avoid issue with DI gen for arrayexprs. (`stuartarchibald `_) +* PR `#7796 `_: update change-log for 0.55.1 (`esc `_) +* PR `#7797 `_: prune README (`esc `_) +* PR `#7799 `_: update the release checklist post 0.55.1 (`esc `_) +* PR `#7801 `_: add sdist command and umask reminder (`esc `_) +* PR `#7804 `_: update local references from master -> main (`esc `_) +* PR `#7805 `_: Enhance source line finding logic for debuginfo (`Siu Kwan Lam `_) +* PR `#7809 `_: Updates the gdb configuration to accept a binary name or a path. (`stuartarchibald `_) +* PR `#7813 `_: Extend parfors test timeout for aarch64. (`stuartarchibald `_) +* PR `#7814 `_: CUDA Dispatcher refactor (`Graham Markall `_) +* PR `#7815 `_: CUDA Dispatcher refactor 2: inherit from `dispatcher.Dispatcher` (`Graham Markall `_) +* PR `#7817 `_: Update intersphinx URLs for NumPy and llvmlite. (`stuartarchibald `_) +* PR `#7823 `_: Add renamed vars to callee scope such that it is self consistent. (`stuartarchibald `_) +* PR `#7829 `_: CUDA: Support `Enum/IntEnum` in Kernel (`Michael Wang `_) +* PR `#7833 `_: Add version support information table to docs. 
(`stuartarchibald `_) +* PR `#7835 `_: Fix pickling error when module cannot be imported (`idorrington `_) +* PR `#7836 `_: min() and max() support for np.datetime and np.timedelta (`Benjamin Graham `_) +* PR `#7837 `_: Initial refactoring of parfor reduction lowering (`Siu Kwan Lam `_) +* PR `#7845 `_: change time.time() to time.perf_counter() in docs (`Nopileos2 `_) +* PR `#7846 `_: Fix CUDA enum vectorize test on Windows (`Graham Markall `_) +* PR `#7848 `_: Support for int * list (`Nick Riasanovsky `_) +* PR `#7850 `_: CUDA: Pass `fastmath` compiler flag down to `compile_ptx` and `compile_device`; Improve `fastmath` tests (`Michael Wang `_) +* PR `#7855 `_: Ensure np.argmin/no.argmax return type is intp (`stuartarchibald `_) +* PR `#7858 `_: CUDA: Deprecate `ptx` Attribute and Update Tests (`Graham Markall `_ `Michael Wang `_) +* PR `#7861 `_: Fix a spelling mistake in README (`Zizheng Guo `_) +* PR `#7864 `_: Fix cross_iter_dep check. (`Todd A. Anderson `_) +* PR `#7865 `_: Remove add_user_function (`Graham Markall `_) +* PR `#7866 `_: Support for large numbers of args/kws with Python 3.10 (`Nick Riasanovsky `_) +* PR `#7878 `_: CUDA: Remove some deprecated support, add CC 8.6 and 8.7 (`Graham Markall `_) +* PR `#7893 `_: Use uuid.uuid4() as the key in serialization. (`stuartarchibald `_) +* PR `#7895 `_: Remove use of `llvmlite.llvmpy` (`Andre Masella `_) +* PR `#7898 `_: Skip test_ptds under cuda-memcheck (`Graham Markall `_) +* PR `#7901 `_: Pyston compatibility for the test suite (`Kevin Modzelewski `_) +* PR `#7904 `_: Support m1 (`esc `_) +* PR `#7911 `_: added sys import (`Nightfurex `_) +* PR `#7915 `_: CUDA: Fix test checking debug info rendering. (`stuartarchibald `_) +* PR `#7918 `_: Add JIT examples to CUDA docs (`brandon-b-miller `_ `Graham Markall `_) +* PR `#7919 `_: Disallow //= reductions in pranges. (`Todd A. Anderson `_) +* PR `#7924 `_: Retain non-modified index tuple components. (`Todd A. 
Anderson `_) +* PR `#7939 `_: Fix rendering in feature request template. (`stuartarchibald `_) +* PR `#7940 `_: Implemented `np.allclose` in `numba/np/arraymath.py` (`Gagandeep Singh `_) +* PR `#7941 `_: Remove debug dump output from closure inlining pass. (`stuartarchibald `_) +* PR `#7946 `_: instructions for creating a build environment were outdated (`esc `_) +* PR `#7949 `_: Add Cuda Vector Types (`Michael Wang `_) +* PR `#7950 `_: mission statement (`esc `_) +* PR `#7956 `_: Stop using pip for 3.10 on public ci (Revert "start testing Python 3.10 on public CI") (`esc `_) +* PR `#7957 `_: Use cloudpickle for disk caches (`Siu Kwan Lam `_) +* PR `#7958 `_: `numpy.clip` accept `numpy.array` for `a_min`, `a_max` (`Gagandeep Singh `_) +* PR `#7959 `_: Permit a new array model to have a super set of array model fields. (`stuartarchibald `_) +* PR `#7961 `_: `numba.typed.typeddict.Dict.get` uses `castedkey` to avoid returning default value even if the key is present (`Gagandeep Singh `_) +* PR `#7963 `_: remove the roadmap from the sphinx based docs (`esc `_) +* PR `#7964 `_: Support for large constant dictionaries in Python 3.10 (`Nick Riasanovsky `_) +* PR `#7965 `_: Use uuid4 instead of PID in cache temp name to prevent collisions. (`stuartarchibald `_) +* PR `#7971 `_: lru cache for configure call (`Tingkai Liu `_) +* PR `#7972 `_: Fix fp16 support for cuda shared array (`Michael Collison `_ `Graham Markall `_) +* PR `#7986 `_: Small caching refactor to support target cache implementations (`Graham Markall `_) +* PR `#7994 `_: Supporting multidimensional arrays in quick sort (`Gagandeep Singh `_ `Siu Kwan Lam `_) +* PR `#7996 `_: Fix binding logic in `@overload_glue`. (`stuartarchibald `_) +* PR `#7999 `_: Remove `@overload_glue` for NumPy allocators. (`stuartarchibald `_) +* PR `#8003 `_: Add np.broadcast_shapes (`Guilherme Leobas `_) +* PR `#8004 `_: CUDA fixes for Windows (`Graham Markall `_) +* PR `#8014 `_: Fix support for {real,imag} array attrs in Parfors. 
(`stuartarchibald `_) +* PR `#8016 `_: [Docs] [Very Minor] Make `numba.jit` boundscheck doc line consistent (`Kyle Martin `_) +* PR `#8017 `_: Update FAQ to include details about using debug-only option (`Guilherme Leobas `_) +* PR `#8027 `_: Support for NumPy 1.22 (`stuartarchibald `_) +* PR `#8031 `_: Support for Numpy BitGenerators PR#1 - Core Generator Support (`Kaustubh `_) +* PR `#8035 `_: Fix a couple of typos RE implementation (`stuartarchibald `_) +* PR `#8037 `_: CUDA self-recursion tests (`Graham Markall `_) +* PR `#8044 `_: Make Python 3.10 kwarg peephole less restrictive (`Nick Riasanovsky `_) +* PR `#8046 `_: Fix caching test failures (`Siu Kwan Lam `_) +* PR `#8049 `_: support str(bool) syntax (`LI Da `_) +* PR `#8052 `_: Ensure pthread is linked in when building for ppc64le. (`Siu Kwan Lam `_) +* PR `#8056 `_: Move caching tests from test_dispatcher to test_caching (`Graham Markall `_) +* PR `#8057 `_: Fix coverage checking (`Graham Markall `_) +* PR `#8064 `_: Rename "nb:run_pass" to "numba:run_pass" and document it. (`Siu Kwan Lam `_) +* PR `#8065 `_: Fix PyLowering mishandling starargs (`Siu Kwan Lam `_) +* PR `#8068 `_: update changelog for 0.55.2 (`esc `_) +* PR `#8077 `_: change return type of np.broadcast_shapes to a tuple (`Guilherme Leobas `_) +* PR `#8080 `_: Fix windows test failure due to timeout when the machine is slow poss… (`Siu Kwan Lam `_) +* PR `#8081 `_: Fix erroneous array count in parallel gufunc kernel generation. (`stuartarchibald `_) +* PR `#8089 `_: Support on-disk caching in the CUDA target (`Graham Markall `_) +* PR `#8097 `_: Exclude libopenblas 0.3.20 on osx-arm64 (`esc `_) +* PR `#8099 `_: Fix Py_DECREF use in case of error state (for devicearray). (`stuartarchibald `_) +* PR `#8102 `_: Combine numpy run_constrained in meta.yaml to the run requirements (`Siu Kwan Lam `_) +* PR `#8109 `_: Pin TBB support with respect to incompatible 2021.6 API. 
(`stuartarchibald `_) +* PR `#8118 `_: Update release checklists post 0.55.2 (`esc `_) +* PR `#8123 `_: Fix CUDA print tests on Windows (`Graham Markall `_) +* PR `#8124 `_: Add explicit checks to all allocators in the NRT. (`stuartarchibald `_) +* PR `#8126 `_: Mark gufuncs as having mutable outputs (`Andre Masella `_) +* PR `#8133 `_: Fix #8132. Regression in Record.make_c_struct for handling nestedarray (`Siu Kwan Lam `_) +* PR `#8137 `_: CUDA: Fix #7806, Division by zero stops the kernel (`Graham Markall `_) +* PR `#8142 `_: CUDA: Fix some missed changes from dropping 9.2 (`Graham Markall `_) +* PR `#8144 `_: Fix NumPy capitalisation in docs. (`stuartarchibald `_) +* PR `#8145 `_: Allow ufunc builder to use previously JITed function (`Andre Masella `_) +* PR `#8151 `_: pin NumPy to build 0 of 1.19.2 on public CI (`esc `_) +* PR `#8163 `_: CUDA: Remove context query in launch config (`Graham Markall `_) +* PR `#8165 `_: Restrict strace based tests to be linux only via support feature. (`stuartarchibald `_) +* PR `#8170 `_: CUDA: Fix missing space in low occupancy warning (`Graham Markall `_) +* PR `#8175 `_: make build and upload order consistent (`esc `_) +* PR `#8181 `_: Fix various typos (`luzpaz `_) +* PR `#8187 `_: Update CHANGE_LOG for 0.55.2 (`stuartarchibald `_ `esc `_) +* PR `#8189 `_: updated version support information for 0.55.2/0.57 (`esc `_) +* PR `#8191 `_: CUDA: Update deprecation notes for 0.56. (`Graham Markall `_) +* PR `#8192 `_: Update CHANGE_LOG for 0.56.0 (`stuartarchibald `_ `esc `_ `Siu Kwan Lam `_) +* PR `#8195 `_: Make the workqueue threading backend once again fork safe. (`stuartarchibald `_) +* PR `#8196 `_: Fix numerical tolerance in parfors caching test. (`stuartarchibald `_) +* PR `#8197 `_: Fix `isinstance` warning check test. 
(`stuartarchibald `_) +* PR `#8203 `_: pin llvmlite 0.39 for public CI builds (`esc `_) +* PR `#8205 `_: Pin llvmlite and NumPy on release branch.(`esc `_ `Siu Kwan Lam `_) +* PR `#8255 `_: CUDA: Make numba.cuda.tests.doc_examples.ffi a module to fix #8252 (`Graham Markall `_) +* PR `#8274 `_: Update version support table doc for 0.56. (`stuartarchibald `_) +* PR `#8275 `_: Update CHANGE_LOG for 0.56.0 final (`stuartarchibald `_) + +Authors: + +* `Andre Masella `_ +* `Benjamin Graham `_ +* `brandon-b-miller `_ +* `Brandon T. Willard `_ +* `Gagandeep Singh `_ +* `Dhruv Patel `_ +* `LI Da `_ +* `Todd A. Anderson `_ +* `Ethan Pronovost `_ +* `esc `_ +* `Tobias Sargeant `_ +* `Graham Markall `_ +* `Guilherme Leobas `_ +* `Zizheng Guo `_ +* `Hadia Ahmed `_ +* `idorrington `_ +* `Michael Wang `_ +* `Kaustubh `_ +* `Kevin Modzelewski `_ +* `luk-f-a `_ +* `luzpaz `_ +* `Kyle Martin `_ +* `Nightfurex `_ +* `Nick Riasanovsky `_ +* `Nopileos2 `_ +* `Ray Bell `_ +* `readthedocs-assistant `_ +* `Rishi Kulkarni `_ +* `Sterling Baird `_ +* `Siu Kwan Lam `_ +* `stuartarchibald `_ +* `Stepan Rakitin `_ +* `Michael Collison `_ +* `Tingkai Liu `_ + +Version 0.55.2 (25 May, 2022) +----------------------------- + +This is a maintenance release to support NumPy 1.22 and Apple M1. + +Pull-Requests: + +* PR `#8067 `_: Backport #8027: Support for NumPy 1.22 (`stuartarchibald `_) +* PR `#8069 `_: Install llvmlite 0.38 for Numba 0.55.* (`esc `_) +* PR `#8075 `_: update max NumPy for 0.55.2 (`esc `_) +* PR `#8078 `_: Backport #7804: update local references from master -> main (`esc `_) +* PR `#8082 `_: Backport #8080: fix windows failure due to timeout (`Siu Kwan Lam `_) +* PR `#8084 `_: Pin meta.yaml to llvmlite 0.38 series (`Siu Kwan Lam `_) +* PR `#8093 `_: Backport #7904: Support m1 (`esc `_) +* PR `#8094 `_: Backport #8052 Ensure pthread is linked in when building for ppc64le. 
(`Siu Kwan Lam `_) +* PR `#8098 `_: Backport #8097: Exclude libopenblas 0.3.20 on osx-arm64 (`esc `_) +* PR `#8100 `_: Backport #7786 for 0.55.2: Remove dependency on intel-openmp for OSX (`stuartarchibald `_) +* PR `#8103 `_: Backport #8102 to fix numpy requirements (`Siu Kwan Lam `_) +* PR `#8114 `_: Backport #8109 Pin TBB support with respect to incompatible 2021.6 API. (`stuartarchibald `_) + +Total PRs: 12 + +Authors: + +* `esc `_ +* `Siu Kwan Lam `_ +* `stuartarchibald `_ + +Total authors: 3 + +Version 0.55.1 (27 January, 2022) +--------------------------------- + +This is a bugfix release that closes all the remaining issues from the +accelerated release of 0.55.0 and also any release critical regressions +discovered since then. + +CUDA target deprecation notices: + +* Support for CUDA toolkits < 10.2 is deprecated and will be removed in Numba + 0.56. +* Support for devices with Compute Capability < 5.3 is deprecated and will be + removed in Numba 0.56. + + +Pull-Requests: + +* PR `#7755 `_: CUDA: Deprecate support for CC < 5.3 and CTK < 10.2 (`Graham Markall `_) +* PR `#7749 `_: Refactor threading layer priority tests to not use stdout/stderr (`stuartarchibald `_) +* PR `#7744 `_: Fix issues with locating/parsing source during DebugInfo emission. (`stuartarchibald `_) +* PR `#7712 `_: Fixing issue 7693 (`Graham Markall `_ `luk-f-a `_ `stuartarchibald `_) +* PR `#7729 `_: Handle Omitted/OmittedArgDataModel in DI generation. (`stuartarchibald `_) +* PR `#7788 `_: Avoid issue with DI gen for arrayexprs. (`stuartarchibald `_) +* PR `#7752 `_: Fix #7751: Use original filename for array exprs (`Graham Markall `_) +* PR `#7748 `_: Fix #7713: Ensure _prng_random_hash return has correct bitwidth (`Graham Markall `_) +* PR `#7745 `_: Fix the release year for Numba 0.55 change log entry. (`stuartarchibald `_) +* PR `#7740 `_: CUDA Python 11.6 support (`Graham Markall `_) +* PR `#7724 `_: Update URLs in error messages to refer to RTD docs. 
(`stuartarchibald `_) +* PR `#7709 `_: CUDA: Fixes missing type annotation pass following #7704 (`stuartarchibald `_) +* PR `#7704 `_: Move the type annotation pass to post legalization. (`stuartarchibald `_) +* PR `#7619 `_: CUDA: Fix linking with PTX when compiling lazily (`Graham Markall `_) + +Authors: + +* `Graham Markall `_ +* `luk-f-a `_ +* `stuartarchibald `_ + +Version 0.55.0 (13 January, 2022) +--------------------------------- + +This release includes a significant number of important dependency upgrades along +with a number of new features and bug fixes. + +NOTE: Due to NumPy CVE-2021-33430 this release has bypassed the usual release +process so as to promptly provide a Numba release that supports NumPy 1.21. A +single release candidate (RC1) was made and a few issues were reported, these +are summarised as follows and will be fixed in a subsequent 0.55.1 release. + +Known issues with this release: + +* Incorrect result copying array-typed field of structured array (`#7693 `_) +* Two issues in DebugInfo generation (`#7726 `_, `#7730 `_) +* Compilation failure for ``hash`` of floating point values on 32 bit Windows + when using Python 3.10 (`#7713 `_). + +Highlights of core dependency upgrades: + +* Support for Python 3.10 +* Support for NumPy 1.21 + +Python language support enhancements: + +* Experimental support for ``isinstance``. + +NumPy features/enhancements: + +The following functions are now supported: + +* ``np.broadcast_to`` +* ``np.float_power`` +* ``np.cbrt`` +* ``np.logspace`` +* ``np.take_along_axis`` +* ``np.average`` +* ``np.argmin`` gains support for the ``axis`` kwarg. +* ``np.ndarray.astype`` gains support for types expressed as literal strings. + +Highlights of core changes: + +* For users of the Numba extension API, Numba now has a new error handling mode + whereby it will treat all exceptions that do not inherit from + ``numba.errors.NumbaException`` as a "hard error" and immediately unwind the + stack.
This makes it much easier to debug when writing ``@overload``\s etc + from the extension API as there's now no confusion between Python errors and + Numba errors. This feature can be enabled by setting the environment + variable: ``NUMBA_CAPTURED_ERRORS='new_style'``. +* The threading layer selection priority can now be changed via the environment + variable ``NUMBA_THREADING_LAYER_PRIORITY``. + +Highlights of changes for the CUDA target: + +* Support for NVIDIA's CUDA Python bindings. +* Support for 16-bit floating point numbers and their basic operations via + intrinsics. +* Streams are provided in the ``Stream.async_done`` result, making it easier to + implement asynchronous work queues. +* Support for structured types in device arrays, character sequences in NumPy + arrays, and some array operations on nested arrays. +* Much underlying refactoring to align the CUDA target more closely with the + CPU target, which lays the groundwork for supporting the high level extension + API in CUDA in future releases. + +Intel also kindly sponsored research and development into native debug (DWARF) +support and handling per-function compilation flags: + +* Line number/location tracking is much improved. +* Numba's internal representation of containers (e.g. tuples, arrays) are now + encoded as structures. +* Numba's per-function compilation flags are encoded into the ABI field of the + mangled name of the function such that it's possible to compile and + differentiate between versions of the same function with different flags set. + +General deprecation notices: + +* There are no new general deprecations. + +CUDA target deprecation notices: + +* There are no new CUDA target deprecations. + +Version support/dependency changes: + +* Python 3.10 is supported. +* NumPy version 1.21 is supported. +* The minimum supported NumPy version is raised to 1.18 for runtime (compilation + however remains compatible with NumPy 1.11).
+ + +Pull-Requests: + +* PR `#6075 `_: add np.float_power and np.cbrt (`Guilherme Leobas `_) +* PR `#7047 `_: Support __hash__ for numpy.datetime64 (`Guilherme Leobas `_ `stuartarchibald `_) +* PR `#7057 `_: Fix #7041: Add charseq registry to CUDA target (`Graham Markall `_ `stuartarchibald `_) +* PR `#7082 `_: Added Add/Sub between datetime64 array and timedelta64 scalar (`Nick Riasanovsky `_ `stuartarchibald `_) +* PR `#7119 `_: Add support for `np.broadcast_to` (`Guilherme Leobas `_) +* PR `#7129 `_: Add support for axis keyword argument to np.argmin() (`Itamar Turner-Trauring `_) +* PR `#7132 `_: gh #7131 Support for astype with literal strings (`Nick Riasanovsky `_) +* PR `#7177 `_: Add debug infomation support based on datamodel. (`stuartarchibald `_) +* PR `#7185 `_: Add get_impl_key as abstract method to types.Callable (`Alexey Kozlov `_) +* PR `#7186 `_: Add support for np.logspace. (`Guoqiang QI `_) +* PR `#7189 `_: CUDA: Skip IPC tests on ARM (`Graham Markall `_) +* PR `#7190 `_: CUDA: Fix test_pinned on Jetson (`Graham Markall `_) +* PR `#7192 `_: Fix missing import in array.argsort impl and add more tests. 
(`stuartarchibald `_) +* PR `#7196 `_: Fixes for lineinfo emission (`stuartarchibald `_) +* PR `#7197 `_: don't post to python announce on the first RC (`esc `_) +* PR `#7202 `_: Initial implementation of np.take_along_axis (`Itamar Turner-Trauring `_) +* PR `#7203 `_: remove duplicate changelog entries (`esc `_) +* PR `#7216 `_: Update CHANGE_LOG for 0.54.0rc2 (`stuartarchibald `_) +* PR `#7219 `_: bump llvmlite dependency to 0.38.0dev0 for Numba 0.55.0dev0 (`esc `_) +* PR `#7220 `_: update release checklist post 0.54rc1+2 (`esc `_) +* PR `#7221 `_: Show GPU UUIDs in cuda.detect() output (`Graham Markall `_) +* PR `#7222 `_: CUDA: Warn when debug=True and opt=True (`Graham Markall `_) +* PR `#7223 `_: Replace assertion errors on IR assumption violation (`Siu Kwan Lam `_) +* PR `#7226 `_: Add support for structured types in Device Arrays (`Michael Collison `_) +* PR `#7227 `_: FIX: Typo (`Srinath Kailasa `_) +* PR `#7230 `_: PR #7171 bugfix only (`stuartarchibald `_ `Todd A. Anderson `_) +* PR `#7234 `_: add THREADING_LAYER_PRIORITY & NUMBA_THREADING_LAYER_PRIORITY (`Kolen Cheung `_) +* PR `#7235 `_: replace wordings of WIP by draft PR (`Kolen Cheung `_) +* PR `#7236 `_: CUDA: Skip managed alloc tests on ARM (`Graham Markall `_) +* PR `#7237 `_: fix a typo in a string (`Kolen Cheung `_) +* PR `#7241 `_: Set aliasing information for inplace_binops.. (`Todd A. 
Anderson `_) +* PR `#7242 `_: FIX: typo (`Srinath Kailasa `_) +* PR `#7244 `_: Implement partial literal propagation pass (support 'isinstance') (`Guilherme Leobas `_ `stuartarchibald `_) +* PR `#7247 `_: Solve memory leak to fix issue #7210 (`Siu Kwan Lam `_ `Graham Markall `_ `ysheffer `_) +* PR `#7251 `_: Fix #6001: typed.List ignores ctor arguments with JIT disabled (`Graham Markall `_) +* PR `#7256 `_: Fix link to the discourse forum in README (`Kenichi Maehashi `_) +* PR `#7257 `_: Use normal list constructor in List.__new__() (`Graham Markall `_) +* PR `#7260 `_: Support typed lists in `heapq` (`Graham Markall `_) +* PR `#7263 `_: Updated issue URL for error messages #7261 (`DeviousLab `_) +* PR `#7265 `_: Fix linspace to use np.divide and clamp to stop. (`stuartarchibald `_) +* PR `#7266 `_: CUDA: Skip multi-GPU copy test with peer access disabled (`Graham Markall `_) +* PR `#7267 `_: Fix #7258. Bug in SROA optimization (`Siu Kwan Lam `_) +* PR `#7271 `_: Update 3rd party license text. (`stuartarchibald `_) +* PR `#7272 `_: Allow annotations in njit-ed functions (`LunarLanding `_) +* PR `#7273 `_: Update CHANGE_LOG for 0.54.0rc3. (`stuartarchibald `_) +* PR `#7283 `_: Added NPM to Glossary and linked to mentions (`Nihal Shetty `_) +* PR `#7285 `_: CUDA: Fix OOB in test_kernel_arg (`Graham Markall `_) +* PR `#7288 `_: Handle cval as a np attr in stencil generation. (`stuartarchibald `_) +* PR `#7294 `_: Continuation of PR #7280, fixing lifetime of TBB task_scheduler_handle (`Sergey Pokhodenko `_ `stuartarchibald `_) +* PR `#7296 `_: Fix generator lowering not casting to the actual yielded type (`Siu Kwan Lam `_) +* PR `#7298 `_: Use CBC to pin GCC to 7 on most linux and 9 on aarch64. (`stuartarchibald `_) +* PR `#7304 `_: Continue PR#3655: add support for np.average (`Hadia Ahmed `_ `slnguyen `_) +* PR `#7307 `_: Prevent mutation of arrays in global tuples. 
(`stuartarchibald `_) +* PR `#7309 `_: Update MapConstraint to handle type coercion for typed.Dict correctly. (`stuartarchibald `_) +* PR `#7312 `_: Fix #7302. Workaround missing pthread problem on ppc64le (`Siu Kwan Lam `_) +* PR `#7315 `_: Link ELF obj as DSO for radare2 disassembly CFG (`stuartarchibald `_) +* PR `#7316 `_: Use float64 for consistent typing in heapq tests. (`stuartarchibald `_) +* PR `#7317 `_: In TBB tsh test switch os.fork for mp fork ctx (`stuartarchibald `_) +* PR `#7319 `_: Update CHANGE_LOG for 0.54.0 final. (`stuartarchibald `_) +* PR `#7329 `_: Improve documentation in reference to CUDA local memory (`Sterling Baird `_) +* PR `#7330 `_: Cuda matmul docs (`Sterling Baird `_) +* PR `#7340 `_: Add size_t and ssize_t types (`Bruce Merry `_) +* PR `#7345 `_: Add check for ipykernel file in IPython cache locator (`Sahil Gupta `_) +* PR `#7347 `_: fix:updated url for error report and feature rquest using issue template (`DEBARGHA SAHA `_) +* PR `#7349 `_: Allow arbitrary walk-back in reduction nodes to find inplace_binop. (`Todd A. Anderson `_) +* PR `#7359 `_: Extend support for nested arrays inside numpy records (`Graham Markall `_ `luk-f-a `_) +* PR `#7375 `_: CUDA: Run doctests as part of numba.cuda.tests and fix test_cg (`Graham Markall `_) +* PR `#7395 `_: Fix #7394 and #6550 & Added test & improved error message (`MegaIng `_) +* PR `#7397 `_: Add option to catch only Numba `numba.core.errors` derived exceptions. (`stuartarchibald `_) +* PR `#7398 `_: Add support for arrayanalysis of tuple args. (`Todd A. 
Anderson `_) +* PR `#7403 `_: Fix for issue 7402: implement missing numpy ufunc interface (`Guilherme Leobas `_) +* PR `#7404 `_: fix typo in literal_unroll docs (`esc `_) +* PR `#7419 `_: insert missing backtick in comment (`esc `_) +* PR `#7422 `_: Update Omitted Type to use Hashable Values as Keys for Caching (`Nick Riasanovsky `_) +* PR `#7429 `_: Update CHANGE_LOG for 0.54.1 (`stuartarchibald `_) +* PR `#7432 `_: add github release task to checklist (`esc `_) +* PR `#7440 `_: Refactor TargetConfig naming. (`stuartarchibald `_) +* PR `#7441 `_: Permit any string as a key in literalstrkeydict type. (`stuartarchibald `_) +* PR `#7442 `_: Add some diagnostics to SVML test failures. (`stuartarchibald `_) +* PR `#7443 `_: Refactor template selection logic for targets. (`stuartarchibald `_) +* PR `#7444 `_: use correct variable name in closure (`esc `_) +* PR `#7447 `_: cleanup Numba metadata (`esc `_) +* PR `#7453 `_: CUDA: Provide stream in async_done result (`Graham Markall `_) +* PR `#7456 `_: Fix invalid codegen for #7451. (`stuartarchibald `_) +* PR `#7457 `_: Factor out target registry selection logic (`stuartarchibald `_) +* PR `#7459 `_: Include compiler flags in symbol mangling (`Siu Kwan Lam `_) +* PR `#7460 `_: Add FP16 support for CUDA (`Michael Collison `_ `Graham Markall `_) +* PR `#7461 `_: Support NVIDIA's CUDA Python bindings (`Graham Markall `_) +* PR `#7465 `_: Update changelog for 0.54.1 release (`Siu Kwan Lam `_) +* PR `#7477 `_: Fix unicode operator.eq handling of Optional types. (`stuartarchibald `_) +* PR `#7479 `_: CUDA: Print format string and warn for > 32 print() args (`Graham Markall `_) +* PR `#7483 `_: NumPy 1.21 support (`Sebastian Berg `_ `stuartarchibald `_) +* PR `#7484 `_: Fixed outgoing link to nvidia documentation. 
(`Dhruv Patel `_) +* PR `#7493 `_: Consolidate TLS stacks in target configuration (`Siu Kwan Lam `_) +* PR `#7496 `_: CUDA: Use a single dispatcher class for all kinds of functions (`Graham Markall `_) +* PR `#7498 `_: refactor with-detection logic (`stuartarchibald `_ `esc `_) +* PR `#7499 `_: Add build scripts for CUDA testing on gpuCI (`Charles Blackmon-Luca `_ `Graham Markall `_) +* PR `#7500 `_: Update parallel.rst (`Julius Bier Kirkegaard `_) +* PR `#7506 `_: Enhance Flags mangling/demangling (`Siu Kwan Lam `_) +* PR `#7514 `_: Fixup cuda debuginfo emission for 7177 (`Siu Kwan Lam `_) +* PR `#7525 `_: Make sure` demangle()` returns `str` type. (`Siu Kwan Lam `_) +* PR `#7538 `_: Fix `@overload_glue` performance regression. (`stuartarchibald `_) +* PR `#7539 `_: Fix str decode issue from merge #7525/#7506 (`stuartarchibald `_) +* PR `#7546 `_: Fix handling of missing const key in LiteralStrKeyDict (`Siu Kwan Lam `_ `stuartarchibald `_) +* PR `#7547 `_: Remove 32bit linux scipy installation. (`stuartarchibald `_) +* PR `#7548 `_: Correct evaluation order in assert statement (`Graham Markall `_) +* PR `#7552 `_: Prepend the inlined function name to inlined variables. (`stuartarchibald `_) +* PR `#7557 `_: Python3.10 v2 (`stuartarchibald `_ `esc `_) +* PR `#7560 `_: Refactor with detection py310 (`Siu Kwan Lam `_ `esc `_) +* PR `#7561 `_: fix a typo (`Kolen Cheung `_) +* PR `#7567 `_: Update docs to note meetings are public. (`stuartarchibald `_) +* PR `#7570 `_: Update the docs and error message for errors when importing Numba. (`stuartarchibald `_) +* PR `#7580 `_: Fix #7507. catch `NotImplementedError` in `.get_function()` (`Siu Kwan Lam `_) +* PR `#7581 `_: Add support for casting from int enums (`Michael Collison `_) +* PR `#7583 `_: Make numba.types.Optional __str__ less verbose. 
(`stuartarchibald `_) +* PR `#7588 `_: Fix casting of start/stop in linspace (`stuartarchibald `_) +* PR `#7591 `_: Remove deprecations (`Graham Markall `_) +* PR `#7596 `_: Fix max symbol match length for r2 (`stuartarchibald `_) +* PR `#7597 `_: Update gdb docs for new DWARF enhancements. (`stuartarchibald `_) +* PR `#7603 `_: Fix list.insert() for refcounted values (`Ehsan Totoni `_) +* PR `#7605 `_: Fix TBB 2021 DSO names on OSX/Win and make TBB reporting consistent (`stuartarchibald `_) +* PR `#7606 `_: Ensure a prescribed threading layer can load in CI. (`stuartarchibald `_) +* PR `#7610 `_: Fix #7609. Type should not be mutated. (`Siu Kwan Lam `_) +* PR `#7618 `_: Fix the doc build: docutils 0.18 not compatible with pinned sphinx (`stuartarchibald `_) +* PR `#7626 `_: Fix issues with package dependencies. (`stuartarchibald `_ `esc `_) +* PR `#7627 `_: PR 7321 continued (`stuartarchibald `_ `Eric Wieser `_) +* PR `#7628 `_: Move to using windows-2019 images in Azure (`stuartarchibald `_) +* PR `#7632 `_: Capture output in CUDA matmul doctest (`Graham Markall `_) +* PR `#7636 `_: Copy prange loop header to after the parfor. (`Todd A. Anderson `_) +* PR `#7637 `_: Increase the timeout on the SVML tests for loaded machines. (`stuartarchibald `_) +* PR `#7645 `_: In debuginfo, do not add noinline to functions marked alwaysinline (`stuartarchibald `_) +* PR `#7650 `_: Move Azure builds to OSX 10.15 (`stuartarchibald `_ `esc `_ `Siu Kwan Lam `_) + +Authors: + +* `Bruce Merry `_ +* `Charles Blackmon-Luca `_ +* `DeviousLab `_ +* `Dhruv Patel `_ +* `Todd A. 
Anderson `_ +* `Ehsan Totoni `_ +* `Eric Wieser `_ +* `esc `_ +* `Graham Markall `_ +* `Guilherme Leobas `_ +* `Guoqiang QI `_ +* `Hadia Ahmed `_ +* `Kolen Cheung `_ +* `Itamar Turner-Trauring `_ +* `Julius Bier Kirkegaard `_ +* `Kenichi Maehashi `_ +* `Alexey Kozlov `_ +* `luk-f-a `_ +* `LunarLanding `_ +* `MegaIng `_ +* `Nihal Shetty `_ +* `Nick Riasanovsky `_ +* `Sergey Pokhodenko `_ +* `Sahil Gupta `_ +* `Sebastian Berg `_ +* `Sterling Baird `_ +* `Srinath Kailasa `_ +* `Siu Kwan Lam `_ +* `slnguyen `_ +* `DEBARGHA SAHA `_ +* `stuartarchibald `_ +* `Michael Collison `_ +* `ysheffer `_ + +Version 0.54.1 (7 October, 2021) +-------------------------------- + +This is a bugfix release for 0.54.0. It fixes a regression in structured array +type handling, a potential leak on initialization failure in the CUDA target, a +regression caused by Numba's vendored cloudpickle module resetting dynamic +classes and a few minor testing/infrastructure related problems. + +* PR `#7348 `_: test_inspect_cli: Decode exception with default (utf-8) codec (`Graham Markall `_) +* PR `#7360 `_: CUDA: Fix potential leaks when initialization fails (`Graham Markall `_) +* PR `#7386 `_: Ensure the NRT is initialized prior to use in external NRT tests. (`stuartarchibald `_) +* PR `#7388 `_: Patch cloudpickle to not reset dynamic class each time it is unpickled (`Siu Kwan Lam `_) +* PR `#7393 `_: skip azure pipeline test if file not present (`esc `_) +* PR `#7428 `_: Fix regression #7355: cannot set items in structured array data types (`Siu Kwan Lam `_) + +Authors: + +* `esc `_ +* `Graham Markall `_ +* `Siu Kwan Lam `_ +* `stuartarchibald `_ + + +Version 0.54.0 (19 August, 2021) +-------------------------------- + +This release includes a significant number of new features, important +refactoring, critical bug fixes and a number of dependency upgrades. + +Python language support enhancements: + +* Basic support for ``f-strings``. +* ``dict`` comprehensions are now supported. 
+* The ``sum`` built-in function is implemented. + +NumPy features/enhancements: + +The following functions are now supported: + + * ``np.clip`` + * ``np.iscomplex`` + * ``np.iscomplexobj`` + * ``np.isneginf`` + * ``np.isposinf`` + * ``np.isreal`` + * ``np.isrealobj`` + * ``np.isscalar`` + * ``np.random.dirichlet`` + * ``np.rot90`` + * ``np.swapaxes`` + +Also ``np.argmax`` has gained support for the ``axis`` keyword argument and it's +now possible to use ``0d`` NumPy arrays as scalars in ``__setitem__`` calls. + +Internal changes: + +* Debugging support through DWARF has been fixed and enhanced. +* Numba now optimises the way in which locals are emitted to help reduce time + spent in LLVM's SROA passes. + +CUDA target changes: + +* Support for emitting ``lineinfo`` to be consumed by profiling tools such as + Nsight Compute +* Improved fastmath code generation for various trig, division, and other + functions +* Faster compilation using lazy addition of libdevice to compiled units +* Support for IPC on Windows +* Support for passing tuples to CUDA ufuncs +* Performance warnings: + + * When making implicit copies by calling a kernel on arrays in host memory + * When occupancy is poor due to kernel or ufunc/gufunc configuration + +* Support for implementing warp-aggregated intrinsics: + + * Using support for more CUDA functions: ``activemask()``, ``lanemask_lt()`` + * The ``ffs()`` function now works correctly! + +* Support for ``@overload`` in the CUDA target + +Intel kindly sponsored research and development that led to a number of new +features and internal support changes: + +* Dispatchers can now be retargeted to a new target via a user defined context + manager. +* Support for custom NumPy array subclasses has been added (including an + overloadable memory allocator). +* An inheritance based model for targets that permits targets to share + ``@overload`` implementations. +* Per function compiler flags with inheritance behaviours.
+* The extension API now has support for overloading class methods via the + ``@overload_classmethod`` decorator. + +Deprecations: + +* The ``ROCm`` target (for AMD ROC GPUs) has been moved to an "unmaintained" + status and a separate repository stub has been created for it at: + https://github.com/numba/numba-rocm + +CUDA target deprecations and breaking changes: + +* Relaxed strides checking is now the default when computing the contiguity of + device arrays. +* The ``inspect_ptx()`` method is deprecated. For use cases that obtain PTX for + further compilation outside of Numba, use ``compile_ptx()`` instead. +* Eager compilation of device functions (the case when ``device=True`` and a + signature is provided) is deprecated. + +Version support/dependency changes: + +* LLVM 11 is now supported on all platforms via llvmlite. +* The minimum supported Python version is raised to 3.7. +* NumPy version 1.20 is supported. +* The minimum supported NumPy version is raised to 1.17 for runtime (compilation + however remains compatible with NumPy 1.11). +* Vendor `cloudpickle `_ `v1.6.0` -- + now used for all ``pickle`` operations. +* TBB >= 2021 is now supported and all prior versions are unsupported (not + easily possible to maintain the ABI breaking changes). + +Pull-Requests: + +* PR `#4516 `_: Make setitem accept 0d np-arrays (`Guilherme Leobas `_) +* PR `#4610 `_: Implement np.is* functions (`Guilherme Leobas `_) +* PR `#5984 `_: Handle idx and size unification in wrap_index manually. (`Todd A. Anderson `_) +* PR `#6468 `_: Access ``replace_functions_map`` via PreParforPass instance (`Sergey Pokhodenko `_ `Reazul Hoque `_) +* PR `#6469 `_: Add address space in pointer type (`Sergey Pokhodenko `_ `Reazul Hoque `_) +* PR `#6608 `_: Support f-strings for common cases (`Ehsan Totoni `_) +* PR `#6619 `_: Improved fastmath code generation for trig, log, and exp/pow. (`Graham Markall `_ `Michael Collison `_) +* PR `#6681 `_: Explicitly catch ``with..as`` and raise error.
(`stuartarchibald `_) +* PR `#6689 `_: Fix setup.py build command detection (`Hannes Pahl `_) +* PR `#6695 `_: Enable negative indexing for cuda atomic operations (`Ashutosh Varma `_) +* PR `#6696 `_: flake8: made more files flake8 compliant (`Ashutosh Varma `_) +* PR `#6698 `_: Fix #6697: Wrong dtype when using np.asarray on DeviceNDArray (`Ashutosh Varma `_) +* PR `#6700 `_: Add UUID to CUDA devices (`Graham Markall `_) +* PR `#6709 `_: Block matplotlib in test examples (`Graham Markall `_) +* PR `#6718 `_: doc: fix typo in rewrites.rst (extra iterates) (`Alexander-Makaryev `_) +* PR `#6720 `_: Faster compile (`Siu Kwan Lam `_) +* PR `#6730 `_: Fix Typeguard error (`Graham Markall `_) +* PR `#6731 `_: Add CUDA-specific pipeline (`Graham Markall `_) +* PR `#6735 `_: CUDA: Don't parse IR for modules with llvmlite (`Graham Markall `_) +* PR `#6736 `_: Support for dict comprehension (`stuartarchibald `_) +* PR `#6742 `_: Do not add overload function definitions to index. (`stuartarchibald `_) +* PR `#6750 `_: Bump to llvmlite 0.37 series (`Siu Kwan Lam `_) +* PR `#6751 `_: Suppress typeguard warnings that affect testing. (`Siu Kwan Lam `_) +* PR `#6753 `_: The check for internal types in RewriteArrayExprs (`Alexander-Makaryev `_) +* PR `#6755 `_: install llvmlite from numba/label/dev (`esc `_) +* PR `#6758 `_: patch to compile _devicearray.cpp with c++11 (`esc `_) +* PR `#6760 `_: Fix scheduler bug where it rounds to 0 divisions for a chunk. (`Todd A. Anderson `_) +* PR `#6762 `_: Glue wrappers to create @overload from split typing and lowering. (`stuartarchibald `_ `Siu Kwan Lam `_) +* PR `#6766 `_: Fix DeviceNDArray null shape issue (`Michael Collison `_) +* PR `#6769 `_: CUDA: Replace ``CachedPTX`` and ``CachedCUFunction`` with ``CUDACodeLibrary`` functionality (`Graham Markall `_) +* PR `#6776 `_: Fix issue with TBB interface causing warnings and parfors counting them (`stuartarchibald `_) +* PR `#6779 `_: Fix wrap_index type unification. (`Todd A. 
Anderson `_) +* PR `#6786 `_: Fix gufunc kwargs support (`Siu Kwan Lam `_) +* PR `#6788 `_: Add support for fastmath 32-bit floating point divide (`Michael Collison `_) +* PR `#6789 `_: Fix warnings struct ref typeguard (`stuartarchibald `_ `Siu Kwan Lam `_ `esc `_) +* PR `#6794 `_: refactor and move create_temp_module into numba.tests.support (`Alexander-Makaryev `_) +* PR `#6795 `_: CUDA: Lazily add libdevice to compilation units (`Graham Markall `_) +* PR `#6798 `_: CUDA: Add optional Driver API argument logging (`Graham Markall `_) +* PR `#6799 `_: Print Numba and llvmlite versions in sysinfo (`Graham Markall `_) +* PR `#6800 `_: Make a common standard API for querying ufunc impl (`Sergey Pokhodenko `_ `Siu Kwan Lam `_) +* PR `#6801 `_: ParallelAccelerator no long will convert StaticSetItem to SetItem because record arrays require StaticSetItems. (`Todd A. Anderson `_) +* PR `#6802 `_: Add lineinfo flag to PTX and SASS compilation (`Graham Markall `_ `Max Katz `_) +* PR `#6804 `_: added runtime version to ``numba -s`` (`Kalyan `_) +* PR `#6808 `_: #3468 continued: Add support for ``np.clip`` (`Graham Markall `_ `Aaron Russell Voelker `_) +* PR `#6809 `_: #3203 additional info in cuda detect (`Kalyan `_) +* PR `#6810 `_: Fix tiny formatting error in ROC kernel docs (`Felix Divo `_) +* PR `#6811 `_: CUDA: Remove test of runtime being a supported version (`Graham Markall `_) +* PR `#6813 `_: Mostly CUDA: Replace llvmpy API usage with llvmlite APIs (`Graham Markall `_) +* PR `#6814 `_: Improving context stack (`stuartarchibald `_ `Siu Kwan Lam `_) +* PR `#6818 `_: CUDA: Support IPC on Windows (`Graham Markall `_) +* PR `#6822 `_: Add support for np.rot90 (`stuartarchibald `_ `Daniel Nagel `_) +* PR `#6829 `_: Fix accuracy of np.arange and np.linspace (`stuartarchibald `_) +* PR `#6830 `_: CUDA: Use relaxed strides checking to compute contiguity (`Graham Markall `_) +* PR `#6833 `_: Raise TypeError exception if numpy array is cast to scalar (`Michael Collison `_) +* 
PR `#6834 `_: Remove illegal "debug" kw argument (`Shaun Cutts `_) +* PR `#6836 `_: CUDA: Documentation updates (`Graham Markall `_) +* PR `#6840 `_: CUDA: Remove items deprecated in 0.53 + simulator test fixes (`Graham Markall `_) +* PR `#6841 `_: CUDA: Fix source location on kernel entry and enable breakpoints to be set on kernels by mangled name (`Graham Markall `_) +* PR `#6843 `_: cross-referenced Array type in docs (`Kalyan `_) +* PR `#6844 `_: CUDA: Remove NUMBAPRO env var warnings, envvars.py + other small tidy-ups (`Graham Markall `_) +* PR `#6848 `_: Ignore .ycm_extra_conf.py (`Graham Markall `_) +* PR `#6849 `_: Add __hash__ for IntEnum (`Hannes Pahl `_) +* PR `#6850 `_: Fix up more internal warnings (`stuartarchibald `_) +* PR `#6854 `_: PR 6096 continued (`stuartarchibald `_ `Ivan Butygin `_) +* PR `#6861 `_: updated reference to hsa with roc (`Kalyan `_) +* PR `#6867 `_: Update changelog for 0.53.1 (`esc `_) +* PR `#6869 `_: Implement builtin sum() (`stuartarchibald `_) +* PR `#6870 `_: Add support for dispatcher retargeting using with-context (`stuartarchibald `_ `Siu Kwan Lam `_) +* PR `#6871 `_: Force text-align:left when using Annotate (`Guilherme Leobas `_) +* PR `#6873 `_: docs: Update reference to @jitclass location (`David Nadlinger `_) +* PR `#6876 `_: Add trailing slashes to dir paths in CODEOWNERS (`Graham Markall `_) +* PR `#6877 `_: Add doc for recent target extension features (`Siu Kwan Lam `_) +* PR `#6878 `_: CUDA: Support passing tuples to ufuncs (`Graham Markall `_) +* PR `#6879 `_: CUDA: NumPy and string dtypes for local and shared arrays (`Graham Markall `_) +* PR `#6880 `_: Add attribute lower_extension to CPUContext (`Reazul Hoque `_) +* PR `#6883 `_: Add support of np.swapaxes #4074 (`Daniel Nagel `_) +* PR `#6885 `_: CUDA: Explicitly specify objmode + looplifting for jit functions in cuda.random (`Graham Markall `_) +* PR `#6886 `_: CUDA: Fix parallel testing for all testsuite submodules (`Graham Markall `_) +* PR `#6888 `_: 
Get overload to consider compiler flags in cache lookup (`Siu Kwan Lam `_) +* PR `#6889 `_: Address guvectorize too slow for cuda target (`Michael Collison `_) +* PR `#6890 `_: fixes #6884 (`Kalyan `_) +* PR `#6898 `_: Work on overloading by hardware target. (`stuartarchibald `_) +* PR `#6911 `_: CUDA: Add support for activemask(), lanemask_lt(), and nanosleep() (`Graham Markall `_) +* PR `#6912 `_: Prevent use of varargs in closure calls. (`stuartarchibald `_) +* PR `#6913 `_: Add runtests option to gitdiff on the common ancestor (`Siu Kwan Lam `_) +* PR `#6915 `_: Update _Intrinsic for sphinx to capture the inner docstring (`Guilherme Leobas `_) +* PR `#6917 `_: Add type conversion for StringLiteral to unicode_type and test. (`stuartarchibald `_) +* PR `#6918 `_: Start section on commonly encounted unsupported parfors code. (`stuartarchibald `_) +* PR `#6924 `_: CUDA: Fix ``ffs`` (`Graham Markall `_) +* PR `#6928 `_: Add support for axis keyword arg to numpy.argmax() (`stuartarchibald `_ `Itamar Turner-Trauring `_) +* PR `#6929 `_: Fix CI failure when gitpython is missing. (`Siu Kwan Lam `_) +* PR `#6935 `_: fixes broken link in numba-runtime.rst (`Kalyan `_) +* PR `#6936 `_: CUDA: Implement support for PTDS globally (`Graham Markall `_) +* PR `#6937 `_: Fix memory leak in bytes boxing (`stuartarchibald `_) +* PR `#6940 `_: Fix function resolution for intrinsics across hardware. (`stuartarchibald `_) +* PR `#6941 `_: ABC the target descriptor and make consistent throughout. (`stuartarchibald `_) +* PR `#6944 `_: CUDA: Support for ``@overload`` (`Graham Markall `_) +* PR `#6945 `_: Fix issue with array analysis tests needing scipy. (`stuartarchibald `_) +* PR `#6948 `_: Refactor registry init. 
(`stuartarchibald `_ `Graham Markall `_ `Siu Kwan Lam `_) +* PR `#6953 `_: CUDA: Fix and deprecate ``inspect_ptx()``, fix NVVM option setup for device functions (`Graham Markall `_) +* PR `#6958 `_: Inconsistent behavior of reshape between numpy and numba/cuda device array (`Lauren Arnett `_) +* PR `#6961 `_: Update overload glue to deal with typing_key (`stuartarchibald `_) +* PR `#6964 `_: Move minimum supported Python version to 3.7 (`stuartarchibald `_) +* PR `#6966 `_: Fix issue with TBB test detecting forks from incorrect state. (`stuartarchibald `_) +* PR `#6971 `_: Fix CUDA ``@intrinsic`` use (`stuartarchibald `_) +* PR `#6977 `_: Vendor cloudpickle (`Siu Kwan Lam `_) +* PR `#6978 `_: Implement operator.contains for empty Tuples (`Brandon T. Willard `_) +* PR `#6981 `_: Fix LLVM IR parsing error on use of ``np.bool_`` in globals (`stuartarchibald `_) +* PR `#6983 `_: Support Optional types in ufuncs. (`stuartarchibald `_) +* PR `#6985 `_: Implement static set/get items on records with integer index (`stuartarchibald `_) +* PR `#6986 `_: document release checklist (`esc `_) +* PR `#6989 `_: update threading docs for function loading (`esc `_) +* PR `#6990 `_: Refactor hardware extension API to refer to "target" instead. (`stuartarchibald `_) +* PR `#6991 `_: Move ROCm target status to "unmaintained". (`stuartarchibald `_) +* PR `#6995 `_: Resolve issue where nan was being assigned to int type numpy array (`Michael Collison `_) +* PR `#6996 `_: Add constant lowering support for `SliceType`s (`Brandon T. Willard `_) +* PR `#6997 `_: CUDA: Remove catch of NotImplementedError in target.py (`Graham Markall `_) +* PR `#6999 `_: Fix errors introduced by the cloudpickle patch (`Siu Kwan Lam `_) +* PR `#7003 `_: More mainline fixes (`stuartarchibald `_ `Graham Markall `_ `Siu Kwan Lam `_) +* PR `#7004 `_: Test extending the CUDA target (`Graham Markall `_) +* PR `#7007 `_: Made stencil compilation not fail for arrays of conflicting types. 
(`MegaIng `_) +* PR `#7008 `_: Added support for np.random.dirichlet with all size arguments (`Rishi Kulkarni `_) +* PR `#7016 `_: Docs: Add DALI to list of CAI-supporting libraries (`Graham Markall `_) +* PR `#7018 `_: Remove cu{blas,sparse,rand,fft} from library checks (`Graham Markall `_) +* PR `#7019 `_: Support NumPy 1.20 (`stuartarchibald `_) +* PR `#7020 `_: Fix #7017. Adds util class PickleCallableByPath (`Siu Kwan Lam `_) +* PR `#7024 `_: fixed llvmir usage in create_module method (`stuartarchibald `_ `Kalyan `_) +* PR `#7027 `_: Fix nrt debug print (`MegaIng `_) +* PR `#7031 `_: Fix inliner to use a single scope for all blocks (`Alexey Kozlov `_ `Siu Kwan Lam `_) +* PR `#7040 `_: Add Github action to mark issues as stale (`Graham Markall `_) +* PR `#7044 `_: Fixes for LLVM 11 (`stuartarchibald `_) +* PR `#7049 `_: Make NumPy random module use @overload_glue (`stuartarchibald `_) +* PR `#7050 `_: Add overload_classmethod (`Siu Kwan Lam `_) +* PR `#7052 `_: Fix string support in CUDA target (`Graham Markall `_) +* PR `#7056 `_: Change prange conversion approach to reuse header block. (`Todd A. Anderson `_) +* PR `#7061 `_: Add ndarray allocator classmethod (`stuartarchibald `_ `Siu Kwan Lam `_) +* PR `#7064 `_: Testhound/host array performance warning (`Michael Collison `_) +* PR `#7066 `_: Fix #7065: Add expected exception messages for NumPy 1.20 to tests (`Graham Markall `_) +* PR `#7068 `_: Enhancing docs about PRNG seeding (`Jérome Eertmans `_) +* PR `#7070 `_: Improve the issue templates and pull request template. (`Guoqiang QI `_) +* PR `#7080 `_: Fix ``__eq__`` for Flags and cpu_options classes (`Siu Kwan Lam `_) +* PR `#7087 `_: Add note to docs about zero-initialization of variables. 
(`stuartarchibald `_) +* PR `#7088 `_: Initialize NUMBA_DEFAULT_NUM_THREADS with a batch scheduler aware value (`Thomas VINCENT `_) +* PR `#7100 `_: Replace deprecated call to cuDeviceComputeCapability (`Graham Markall `_) +* PR `#7113 `_: Temporarily disable debug env export. (`stuartarchibald `_) +* PR `#7114 `_: CUDA: Deprecate eager compilation of device functions (`Graham Markall `_) +* PR `#7116 `_: Fix various issues with dwarf emission: (`stuartarchibald `_ `vlad-perevezentsev `_) +* PR `#7118 `_: Remove print to stdout (`stuartarchibald `_) +* PR `#7121 `_: Continue work on numpy subclasses (`Todd A. Anderson `_ `Siu Kwan Lam `_) +* PR `#7122 `_: Rtd/sphinx compat (`esc `_) +* PR `#7134 `_: Move minimum LLVM version to 11. (`stuartarchibald `_) +* PR `#7137 `_: skip pycc test on Python 3.7 + macOS because of distutils issue (`esc `_) +* PR `#7138 `_: Update the Azure default linux image to Ubuntu 18.04 (`stuartarchibald `_) +* PR `#7141 `_: Require llvmlite 0.37 as minimum supported. (`stuartarchibald `_) +* PR `#7143 `_: Update version checks in __init__ for np 1.17 (`stuartarchibald `_) +* PR `#7145 `_: Fix mainline (`stuartarchibald `_) +* PR `#7146 `_: Fix ``inline_closurecall`` may not be imported (`Siu Kwan Lam `_) +* PR `#7147 `_: Revert "Workaround gitpython 3.1.18 dependency issue" (`stuartarchibald `_) +* PR `#7149 `_: Fix issue in bytecode analysis where target and next are same. (`stuartarchibald `_) +* PR `#7152 `_: Fix iterators in CUDA (`Graham Markall `_) +* PR `#7156 `_: Fix ``ir_utils._max_label`` being updated incorrectly (`Siu Kwan Lam `_) +* PR `#7160 `_: Split parfors tests (`stuartarchibald `_) +* PR `#7161 `_: Update README for 0.54 (`stuartarchibald `_) +* PR `#7162 `_: CUDA: Fix linkage of device functions when compiling for debug (`Graham Markall `_) +* PR `#7163 `_: Split legalization pass to consider IR and features separately. (`stuartarchibald `_) +* PR `#7165 `_: Fix use of np.clip where out is not provided. 
(`stuartarchibald `_) +* PR `#7189 `_: CUDA: Skip IPC tests on ARM (`Graham Markall `_) +* PR `#7190 `_: CUDA: Fix test_pinned on Jetson (`Graham Markall `_) +* PR `#7192 `_: Fix missing import in array.argsort impl and add more tests. (`stuartarchibald `_) +* PR `#7196 `_: Fixes for lineinfo emission. (`stuartarchibald `_) +* PR `#7203 `_: remove duplicate changelog entries (`esc `_) +* PR `#7209 `_: Clamp numpy (`esc `_) +* PR `#7216 `_: Update CHANGE_LOG for 0.54.0rc2. (`stuartarchibald `_) +* PR `#7223 `_: Replace assertion errors on IR assumption violation (`Siu Kwan Lam `_) +* PR `#7230 `_: PR #7171 bugfix only (`Todd A. Anderson `_ `stuartarchibald `_) +* PR `#7236 `_: CUDA: Skip managed alloc tests on ARM (`Graham Markall `_) +* PR `#7267 `_: Fix #7258. Bug in SROA optimization (`Siu Kwan Lam `_) +* PR `#7271 `_: Update 3rd party license text. (`stuartarchibald `_) +* PR `#7272 `_: Allow annotations in njit-ed functions (`LunarLanding `_) +* PR `#7273 `_: Update CHANGE_LOG for 0.54.0rc3. (`stuartarchibald `_) +* PR `#7285 `_: CUDA: Fix OOB in test_kernel_arg (`Graham Markall `_) +* PR `#7294 `_: Continuation of PR #7280, fixing lifetime of TBB task_scheduler_handle (`Sergey Pokhodenko `_ `stuartarchibald `_) +* PR `#7298 `_: Use CBC to pin GCC to 7 on most linux and 9 on aarch64. (`stuartarchibald `_) +* PR `#7312 `_: Fix #7302. Workaround missing pthread problem on ppc64le (`Siu Kwan Lam `_) +* PR `#7317 `_: In TBB tsh test switch os.fork for mp fork ctx (`stuartarchibald `_) +* PR `#7319 `_: Update CHANGE_LOG for 0.54.0 final. (`stuartarchibald `_) + +Authors: + +* `Alexander-Makaryev `_ +* `Todd A. Anderson `_ +* `Hannes Pahl `_ +* `Ivan Butygin `_ +* `MegaIng `_ +* `Sergey Pokhodenko `_ +* `Aaron Russell Voelker `_ +* `Ashutosh Varma `_ +* `Ben Greiner `_ +* `Brandon T. 
Willard `_ +* `Daniel Nagel `_ +* `David Nadlinger `_ +* `Ehsan Totoni `_ +* `esc `_ +* `Felix Divo `_ +* `Graham Markall `_ +* `Guilherme Leobas `_ +* `Guoqiang QI `_ +* `Itamar Turner-Trauring `_ +* `Jérome Eertmans `_ +* `Alexey Kozlov `_ +* `Lauren Arnett `_ +* `LunarLanding `_ +* `Max Katz `_ +* `Kalyan `_ +* `Reazul Hoque `_ +* `Rishi Kulkarni `_ +* `Shaun Cutts `_ +* `Siu Kwan Lam `_ +* `stuartarchibald `_ +* `Thomas VINCENT `_ +* `Michael Collison `_ +* `vlad-perevezentsev `_ + + +Version 0.53.1 (25 March, 2021) +------------------------------- + +This is a bugfix release for 0.53.0. It contains the following four +pull-requests which fix two critical regressions and two build failures +reported by the openSUSE team: + +* PR #6826 Fix regression on gufunc serialization +* PR #6828 Fix regression in CUDA: Set stream in mapped and managed array + device_setup +* PR #6837 Ignore warnings from packaging module when testing import behaviour. +* PR #6851 set non-reported llvm timing values to 0.0 + +Authors: + +* Ben Greiner +* Graham Markall +* Siu Kwan Lam +* Stuart Archibald + +Version 0.53.0 (11 March, 2021) +------------------------------- + +This release continues to add new features, bug fixes and stability improvements +to Numba. + +Highlights of core changes: + +* Support for Python 3.9 (Stuart Archibald). +* Function sub-typing (Lucio Fernandez-Arjona). +* Initial support for dynamic ``gufuncs`` (i.e. from ``@guvectorize``) + (Guilherme Leobas). +* Parallel Accelerator (``@njit(parallel=True)``) now supports Fortran ordered + arrays (Todd A. Anderson and Siu Kwan Lam). + +Intel also kindly sponsored research and development that led to two new +features: + + * Exposing LLVM compilation pass timings for diagnostic purposes (Siu Kwan + Lam). + * An event system for broadcasting compiler events (Siu Kwan Lam). 
+ +Highlights of changes for the CUDA target: + +* CUDA 11.2 onwards (versions of the toolkit using NVVM IR 1.6 / LLVM IR 7.0.1) + are now supported (Graham Markall). +* A fast cube root function is added (Michael Collison). +* Support for atomic ``xor``, increment, decrement, and exchange is added, and + compare-and-swap is extended to support 64-bit integers (Michael Collison). +* Addition of ``cuda.is_supported_version()`` to check if the CUDA runtime + version is supported (Graham Markall). +* The CUDA dispatcher now shares infrastructure with the CPU dispatcher, + improving launch times for lazily-compiled kernels (Graham Markall). +* The CUDA Array Interface is updated to version 3, with support for streams + added (Graham Markall). +* Tuples and ``namedtuples`` can now be passed to kernels (Graham Markall). +* Initial support for Cooperative Groups is added, with support for Grid Groups + and Grid Sync (Graham Markall and Nick White). +* Support for ``math.log2`` and ``math.remainder`` is added (Guilherme Leobas). + +General deprecation notices: + +* There are no new general deprecations. + +CUDA target deprecation notices: + +* CUDA support on macOS is deprecated with this release (it still works, it is + just unsupported). +* The ``argtypes``, ``restypes``, and ``bind`` keyword arguments to the + ``cuda.jit`` decorator, deprecated since 0.51.0, are removed. +* The ``Device.COMPUTE_CAPABILITY`` property, deprecated since 2014, has been + removed (use ``compute_capability`` instead). +* The ``to_host`` method of device arrays is removed (use ``copy_to_host`` + instead). + +General Enhancements: + +* PR #4769: objmode complex type spelling (Siu Kwan Lam) +* PR #5579: Function subtyping (Lucio Fernandez-Arjona) +* PR #5659: Add support for parfors creating 'F'ortran layout Numpy arrays. + (Todd A. Anderson) +* PR #5936: Improve array analysis for user-defined data types. (Todd A. 
+ Anderson) +* PR #5938: Initial support for dynamic gufuncs (Guilherme Leobas) +* PR #5958: Making typed.List a typing Generic (Lucio Fernandez-Arjona) +* PR #6334: Support attribute access from other modules (Farah Hariri) +* PR #6373: Allow Dispatchers to be cached (Eric Wieser) +* PR #6519: Avoid unnecessary ir.Del generation and removal (Ehsan Totoni) +* PR #6545: Refactoring ParforDiagnostics (Elena Totmenina) +* PR #6560: Add LLVM pass timer (Siu Kwan Lam) +* PR #6573: Improve ``__str__`` for typed.List when invoked from IPython shell + (Amin Sadeghi) +* PR #6575: Avoid temp variable assignments (Ehsan Totoni) +* PR #6578: Add support for numpy ``intersect1d`` and basic test cases + (``@caljrobe``) +* PR #6579: Python 3.9 support. (Stuart Archibald) +* PR #6580: Store partial typing errors in compiler state (Ehsan Totoni) +* PR #6626: A simple event system to broadcast compiler events (Siu Kwan Lam) +* PR #6635: Try to resolve dynamic getitems as static post unroll transform. + (Stuart Archibald) +* PR #6636: Adds llvm_lock event (Siu Kwan Lam) +* PR #6664: Adds tests for PR 5659 (Siu Kwan Lam) +* PR #6680: Allow getattr to work in objmode output type spec (Siu Kwan Lam) + +Fixes: + +* PR #6176: Remove references to deprecated numpy globals (Eric Wieser) +* PR #6374: Use Python 3 style OSError handling (Eric Wieser) +* PR #6402: Fix ``typed.Dict`` and ``typed.List`` crashing on parametrized types + (Andreas Sodeur) +* PR #6403: Add ``types.ListType.key`` (Andreas Sodeur) +* PR #6410: Fixes issue #6386 (Danny Weitekamp) +* PR #6425: Fix unicode join for issue #6405 (Teugea Ioan-Teodor) +* PR #6437: Don't pass reduction variables known in an outer parfor to inner + parfors when analyzing reductions. (Todd A. 
Anderson) +* PR #6453: Keep original variable names in metadata to improve diagnostics + (Ehsan Totoni) +* PR #6454: FIX: Fixes for literals (Eric Larson) +* PR #6463: Bump llvmlite to 0.36 series (Stuart Archibald) +* PR #6466: Remove the misspelling of finalize_dynamic_globals (Sergey + Pokhodenko) +* PR #6489: Improve the error message for unsupported Buffer in Buffer + situation. (Stuart Archibald) +* PR #6503: Add test to ensure Numba imports without warnings. (Stuart + Archibald) +* PR #6508: Defer requirements to setup.py (Siu Kwan Lam) +* PR #6521: Skip annotated jitclass test if typeguard is running. (Stuart + Archibald) +* PR #6524: Fix typed.List return value (Lucio Fernandez-Arjona) +* PR #6562: Correcting typo in numba sysinfo output (Nick Sutcliffe) +* PR #6574: Run parfor fusion if 2 or more parfors (Ehsan Totoni) +* PR #6582: Fix typed dict error with uninitialized padding bytes (Siu Kwan + Lam) +* PR #6584: Remove jitclass from ``__init__`` ``__all__``. (Stuart Archibald) +* PR #6586: Run closure inlining ahead of branch pruning in case of nonlocal + (Stuart Archibald) +* PR #6591: Fix inlineasm test failure. (Siu Kwan Lam) +* PR #6622: Fix 6534, handle unpack of assign-like tuples. (Stuart Archibald) +* PR #6652: Simplify PR-6334 (Siu Kwan Lam) +* PR #6653: Fix get_numba_envvar (Siu Kwan Lam) +* PR #6654: Fix #6632 support alternative dtype string spellings (Stuart + Archibald) +* PR #6685: Add Python 3.9 to classifiers. 
(Stuart Archibald) +* PR #6693: patch to compile _devicearray.cpp with c++11 (Valentin Haenel) +* PR #6716: Consider assignment lhs live if used in rhs (Fixes #6715) (Ehsan + Totoni) +* PR #6727: Avoid errors in array analysis for global tuples with non-int + (Ehsan Totoni) +* PR #6733: Fix segfault and errors in #6668 (Siu Kwan Lam) +* PR #6741: Enable SSA in IR inliner (Ehsan Totoni) +* PR #6763: use an alternative constraint for the conda packages (Valentin + Haenel) +* PR #6786: Fix gufunc kwargs support (Siu Kwan Lam) + +CUDA Enhancements/Fixes: + +* PR #5162: Specify synchronization semantics of CUDA Array Interface (Graham + Markall) +* PR #6245: CUDA Cooperative grid groups (Graham Markall and Nick White) +* PR #6333: Remove dead ``_Kernel.__call__`` (Graham Markall) +* PR #6343: CUDA: Add support for passing tuples and namedtuples to kernels + (Graham Markall) +* PR #6349: Refactor Dispatcher to remove unnecessary indirection (Graham + Markall) +* PR #6358: Add log2 and remainder implementations for cuda (Guilherme Leobas) +* PR #6376: Added a fixed seed in test_atomics.py for issue #6370 (Teugea + Ioan-Teodor) +* PR #6377: CUDA: Fix various issues in test suite (Graham Markall) +* PR #6409: Implement cuda atomic xor (Michael Collison) +* PR #6422: CUDA: Remove deprecated items, expect CUDA 11.1 (Graham Markall) +* PR #6427: Remove duplicate repeated definition of gufunc (Amit Kumar) +* PR #6432: CUDA: Use ``_dispatcher.Dispatcher`` as base Dispatcher class + (Graham Markall) +* PR #6447: CUDA: Add get_regs_per_thread method to Dispatcher (Graham Markall) +* PR #6499: CUDA atomic increment, decrement, exchange and compare and swap + (Michael Collison) +* PR #6510: CUDA: Make device array assignment synchronous where necessary + (Graham Markall) +* PR #6517: CUDA: Add NVVM test of all 8-bit characters (Graham Markall) +* PR #6567: Refactor llvm replacement code into separate function (Michael + Collison) +* PR #6642: Testhound/cuda cuberoot (Michael 
Collison) +* PR #6661: CUDA: Support NVVM70 / CUDA 11.2 (Graham Markall) +* PR #6663: Fix error caused by missing "-static" libraries defined for some + platforms (Siu Kwan Lam) +* PR #6666: CUDA: Add a function to query whether the runtime version is + supported. (Graham Markall) +* PR #6725: CUDA: Fix compile to PTX with debug for CUDA 11.2 (Graham Markall) + +Documentation Updates: + +* PR #5740: Add FAQ entry on how to create a MWR. (Stuart Archibald) +* PR #6346: DOC: add where to get dev builds from to FAQ (Eyal Trabelsi) +* PR #6418: docs: use https for homepage (``@imba-tjd``) +* PR #6430: CUDA docs: Add RNG example with 3D grid and strided loops (Graham + Markall) +* PR #6436: docs: remove typo in Deprecation Notices (Thibault Ballier) +* PR #6440: Add note about performance of typed containers from the interpreter. + (Stuart Archibald) +* PR #6457: Link to read the docs instead of numba homepage (Hannes Pahl) +* PR #6470: Adding PyCon Sweden 2020 talk on numba (Ankit Mahato) +* PR #6472: Document ``numba.extending.is_jitted`` (Stuart Archibald) +* PR #6495: Fix typo in literal list docs. (Stuart Archibald) +* PR #6501: Add doc entry on Numba's limited resources and how to help. (Stuart + Archibald) +* PR #6502: Add CODEOWNERS file. (Stuart Archibald) +* PR #6531: Update canonical URL. (Stuart Archibald) +* PR #6544: Minor typo / grammar fixes to 5 minute guide (Ollin Boer Bohan) +* PR #6599: docs: fix simple typo, consevatively -> conservatively (Tim Gates) +* PR #6609: Recommend miniforge instead of c4aarch64 (Isuru Fernando) +* PR #6671: Update environment creation example to python 3.8 (Lucio + Fernandez-Arjona) +* PR #6676: Update hardware and software versions in various docs. (Stuart + Archibald) +* PR #6682: Update deprecation notices for 0.53 (Stuart Archibald) + +CI/Infrastructure Updates: + +* PR #6458: Enable typeguard in CI (Siu Kwan Lam) +* PR #6500: Update bug and feature request templates. 
(Stuart Archibald) +* PR #6516: Fix RTD build by using conda. (Stuart Archibald) +* PR #6587: Add zenodo badge (Siu Kwan Lam) + +Authors: + +* Amin Sadeghi +* Amit Kumar +* Andreas Sodeur +* Ankit Mahato +* Chris Barnes +* Danny Weitekamp +* Ehsan Totoni (core dev) +* Eric Larson +* Eric Wieser +* Eyal Trabelsi +* Farah Hariri +* Graham Markall +* Guilherme Leobas +* Hannes Pahl +* Isuru Fernando +* Lucio Fernandez-Arjona +* Michael Collison +* Nick Sutcliffe +* Nick White +* Ollin Boer Bohan +* Sergey Pokhodenko +* Siu Kwan Lam (core dev) +* Stuart Archibald (core dev) +* Teugea Ioan-Teodor +* Thibault Ballier +* Tim Gates +* Todd A. Anderson (core dev) +* Valentin Haenel (core dev) +* ``@caljrobe`` +* ``@imba-tjd`` + + +Version 0.52.0 (30 November, 2020) +---------------------------------- + +This release focuses on performance improvements, but also adds some new +features and contains numerous bug fixes and stability improvements. + +Highlights of core performance improvements include: + +* Intel kindly sponsored research and development into producing a new reference + count pruning pass. This pass operates at the LLVM level and can prune a + number of common reference counting patterns. This will improve performance + for two primary reasons: + + * There will be less pressure on the atomic locks used to do the reference + counting. + * Removal of reference counting operations permits more inlining and the + optimisation passes can in general do more with what is present. + + (Siu Kwan Lam). +* Intel also sponsored work to improve the performance of the + ``numba.typed.List`` container, particularly in the case of ``__getitem__`` + and iteration (Stuart Archibald). +* Superword-level parallelism vectorization is now switched on and the + optimisation pipeline has been lightly analysed and tuned so as to be able to + vectorize more and more often (Stuart Archibald). 
+ +Highlights of core feature changes include: + +* The ``inspect_cfg`` method on the JIT dispatcher object has been + significantly enhanced and now includes highlighted output and interleaved + line markers and Python source (Stuart Archibald). +* The BSD operating system is now unofficially supported (Stuart Archibald). +* Numerous features/functionality improvements to NumPy support, including + support for: + + * ``np.asfarray`` (Guilherme Leobas) + * "subtyping" in record arrays (Lucio Fernandez-Arjona) + * ``np.split`` and ``np.array_split`` (Isaac Virshup) + * ``operator.contains`` with ``ndarray`` (``@mugoh``). + * ``np.asarray_chkfinite`` (Rishabh Varshney). + * NumPy 1.19 (Stuart Archibald). + * the ``ndarray`` allocators, ``empty``, ``ones`` and ``zeros``, accepting a + ``dtype`` specified as a string literal (Stuart Archibald). + +* Booleans are now supported as literal types (Alexey Kozlov). +* On the CUDA target: + + * CUDA 9.0 is now the minimum supported version (Graham Markall). + * Support for Unified Memory has been added (Max Katz). + * Kernel launch overhead is reduced (Graham Markall). + * Cudasim support for mapped array, memcopies and memset has been added (Mike + Williams). + * Access has been wired in to all libdevice functions (Graham Markall). + * Additional CUDA atomic operations have been added (Michael Collison). + * Additional math library functions (``frexp``, ``ldexp``, ``isfinite``) + (Zhihao Yuan). + * Support for ``power`` on complex numbers (Graham Markall). + +Deprecations to note: + +There are no new deprecations. However, note that "compatibility" mode, which +was added some 40 releases ago to help transition from 0.11 to 0.12+, has been +removed! Also, the shim to permit the import of ``jitclass`` from Numba's top +level namespace has now been removed as per the deprecation schedule. 
+ +General Enhancements: + +* PR #5418: Add np.asfarray impl (Guilherme Leobas) +* PR #5560: Record subtyping (Lucio Fernandez-Arjona) +* PR #5609: Jitclass Infer Spec from Type Annotations (Ethan Pronovost) +* PR #5699: Implement np.split and np.array_split (Isaac Virshup) +* PR #6015: Adding BooleanLiteral type (Alexey Kozlov) +* PR #6027: Support operators inlining in InlineOverloads (Alexey Kozlov) +* PR #6038: Closes #6037, fixing FreeBSD compilation (László Károlyi) +* PR #6086: Add more accessible version information (Stuart Archibald) +* PR #6157: Add pipeline_class argument to @cfunc as supported by @jit. (Arthur + Peters) +* PR #6262: Support dtype from str literal. (Stuart Archibald) +* PR #6271: Support ``ndarray`` contains (``@mugoh``) +* PR #6295: Enhance inspect_cfg (Stuart Archibald) +* PR #6304: Support NumPy 1.19 (Stuart Archibald) +* PR #6309: Add suitable file search path for BSDs. (Stuart Archibald) +* PR #6341: Re roll 6279 (Rishabh Varshney and Valentin Haenel) + +Performance Enhancements: + +* PR #6145: Patch to fingerprint namedtuples. (Stuart Archibald) +* PR #6202: Speed up str(int) (Stuart Archibald) +* PR #6261: Add np.ndarray.ptp() support. (Stuart Archibald) +* PR #6266: Use custom LLVM refcount pruning pass (Siu Kwan Lam) +* PR #6275: Switch on SLP vectorize. (Stuart Archibald) +* PR #6278: Improve typed list performance. (Stuart Archibald) +* PR #6335: Split optimisation passes. (Stuart Archibald) +* PR #6455: Fix refprune on obfuscated refs and stabilize optimisation WRT + wrappers. (Stuart Archibald) + +Fixes: + +* PR #5639: Make UnicodeType inherit from Hashable (Stuart Archibald) +* PR #6006: Resolves incorrectly hoisted list in parfor. (Todd A. 
Anderson) +* PR #6126: fix version_info if version can not be determined (Valentin Haenel) +* PR #6137: Remove references to Python 2's long (Eric Wieser) +* PR #6139: Use direct syntax instead of the ``add_metaclass`` decorator (Eric + Wieser) +* PR #6140: Replace calls to utils.iteritems(d) with d.items() (Eric Wieser) +* PR #6141: Fix #6130 objmode cache segfault (Siu Kwan Lam) +* PR #6156: Remove callers of ``reraise`` in favor of using ``with_traceback`` + directly (Eric Wieser) +* PR #6162: Move charseq support out of init (Stuart Archibald) +* PR #6165: #5425 continued (Amos Bird and Stuart Archibald) +* PR #6166: Remove Python 2 compatibility from numba.core.utils (Eric Wieser) +* PR #6185: Better error message on NotDefinedError (Luiz Almeida) +* PR #6194: Remove recursion from traverse_types (Radu Popovici) +* PR #6200: Workaround #5973 (Stuart Archibald) +* PR #6203: Make find_callname only lookup functions that are likely part of + NumPy. (Stuart Archibald) +* PR #6204: Fix unicode kind selection for getitem. (Stuart Archibald) +* PR #6206: Build all extension modules with -g -Wall -Werror on Linux x86, + provide -O0 flag option (Graham Markall) +* PR #6212: Fix for objmode recompilation issue (Alexey Kozlov) +* PR #6213: Fix #6177. Remove AOT dependency on the Numba package (Siu Kwan Lam) +* PR #6224: Add support for tuple concatenation to array analysis. (#5396 + continued) (Todd A. Anderson) +* PR #6231: Remove compatibility mode (Graham Markall) +* PR #6254: Fix win-32 hashing bug (from Stuart Archibald) (Ray Donnelly) +* PR #6265: Fix #6260 (Stuart Archibald) +* PR #6267: speed up a couple of really slow unittests (Stuart Archibald) +* PR #6281: Remove numba.jitclass shim as per deprecation schedule. (Stuart + Archibald) +* PR #6294: Make return type propagate to all return variables (Andreas Sodeur) +* PR #6300: Un-skip tests that were skipped because of #4026. 
(Owen Anderson) +* PR #6307: Remove restrictions on SVML version due to bug in LLVM SVML CC + (Stuart Archibald) +* PR #6316: Make IR inliner tests not self mutating. (Stuart Archibald) +* PR #6318: PR #5892 continued (Todd A. Anderson, via Stuart Archibald) +* PR #6319: Permit switching off boundschecking when debug is on. (Stuart + Archibald) +* PR #6324: PR 6208 continued (Ivan Butygin and Stuart Archibald) +* PR #6337: Implements ``key`` on ``types.TypeRef`` (Andreas Sodeur) +* PR #6354: Bump llvmlite to 0.35. series. (Stuart Archibald) +* PR #6357: Fix enumerate invalid decref (Siu Kwan Lam) +* PR #6359: Fixes typed list indexing on 32bit (Stuart Archibald) +* PR #6378: Fix incorrect CPU override in vectorization test. (Stuart Archibald) +* PR #6379: Use O0 to enable inline and not affect loop-vectorization by later + O3... (Siu Kwan Lam) +* PR #6384: Fix failing tests to match on platform invariant int spelling. + (Stuart Archibald) +* PR #6390: Updates inspect_cfg (Stuart Archibald) +* PR #6396: Remove hard dependency on tbb package. (Stuart Archibald) +* PR #6408: Don't do array analysis for tuples that contain arrays. (Todd A. + Anderson) +* PR #6441: Fix ASCII flag in Unicode slicing (0.52.0rc2 regression) (Ehsan + Totoni) +* PR #6442: Fix array analysis regression in 0.52 RC2 for tuple of 1D arrays + (Ehsan Totoni) +* PR #6446: Fix #6444: pruner issues with reference stealing functions (Siu + Kwan Lam) +* PR #6450: Fix asfarray kwarg default handling. 
(Stuart Archibald) +* PR #6486: fix abstract base class import (Valentin Haenel) +* PR #6487: Restrict maximum version of python (Siu Kwan Lam) +* PR #6527: setup.py: fix py version guard (Chris Barnes) + +CUDA Enhancements/Fixes: + +* PR #5465: Remove macro expansion and replace uses with FE typing + BE lowering + (Graham Markall) +* PR #5741: CUDA: Add two-argument implementation of round() (Graham Markall) +* PR #5900: Enable CUDA Unified Memory (Max Katz) +* PR #6042: CUDA: Lower launch overhead by launching kernel directly (Graham + Markall) +* PR #6064: Lower math.frexp and math.ldexp in numba.cuda (Zhihao Yuan) +* PR #6066: Lower math.isfinite in numba.cuda (Zhihao Yuan) +* PR #6092: CUDA: Add mapped_array_like and pinned_array_like (Graham Markall) +* PR #6127: Fix race in reduction kernels on Volta, require CUDA 9, add syncwarp + with default mask (Graham Markall) +* PR #6129: Extend Cudasim to support most of the memory functionality. (Mike + Williams) +* PR #6150: CUDA: Turn on flake8 for cudadrv and fix errors (Graham Markall) +* PR #6152: CUDA: Provide wrappers for all libdevice functions, and fix typing + of math function (#4618) (Graham Markall) +* PR #6227: Raise exception when no supported architectures are found (Jacob + Tomlinson) +* PR #6244: CUDA Docs: Make workflow using simulator more explicit (Graham + Markall) +* PR #6248: Add support for CUDA atomic subtract operations (Michael Collison) +* PR #6289: Refactor atomic test cases to reduce code duplication (Michael + Collison) +* PR #6290: CUDA: Add support for complex power (Graham Markall) +* PR #6296: Fix flake8 violations in numba.cuda module (Graham Markall) +* PR #6297: Fix flake8 violations in numba.cuda.tests.cudapy module (Graham + Markall) +* PR #6298: Fix flake8 violations in numba.cuda.tests.cudadrv (Graham Markall) +* PR #6299: Fix flake8 violations in numba.cuda.simulator (Graham Markall) +* PR #6306: Fix flake8 in cuda atomic test from merge. 
(Stuart Archibald) +* PR #6325: Refactor code for atomic operations (Michael Collison) +* PR #6329: Flake8 fix for a CUDA test (Stuart Archibald) +* PR #6331: Explicitly state that NUMBA_ENABLE_CUDASIM needs to be set before + import (Graham Markall) +* PR #6340: CUDA: Fix #6339, performance regression launching specialized + kernels (Graham Markall) +* PR #6380: Only test managed allocations on Linux (Graham Markall) + +Documentation Updates: + +* PR #6090: doc: Add doc on direct creation of Numba typed-list (``@rht``) +* PR #6110: Update CONTRIBUTING.md (Stuart Archibald) +* PR #6128: CUDA Docs: Restore Dispatcher.forall() docs (Graham Markall) +* PR #6277: fix: cross2d wrong doc. reference (issue #6276) (``@jeertmans``) +* PR #6282: Remove docs on Python 2(.7) EOL. (Stuart Archibald) +* PR #6283: Add note on how public CI is impl and what users can do to help. + (Stuart Archibald) +* PR #6292: Document support for structured array attribute access + (Graham Markall) +* PR #6310: Declare unofficial \*BSD support (Stuart Archibald) +* PR #6342: Fix docs on literally usage. (Stuart Archibald) +* PR #6348: doc: fix typo in jitclass.rst ("initilising" -> "initialising") + (``@muxator``) +* PR #6362: Move llvmlite support in README to 0.35 (Stuart Archibald) +* PR #6363: Note that reference counted types are not permitted in set(). + (Stuart Archibald) +* PR #6364: Move deprecation schedules for 0.52 (Stuart Archibald) + +CI/Infrastructure Updates: + +* PR #6252: Show channel URLs (Siu Kwan Lam) +* PR #6338: Direct user questions to Discourse instead of the Google Group. + (Stan Seibert) +* PR #6474: Add skip on PPC64LE for tests causing SIGABRT in LLVM. 
(Stuart + Archibald) + +Authors: + +* Alexey Kozlov +* Amos Bird +* Andreas Sodeur +* Arthur Peters +* Chris Barnes +* Ehsan Totoni (core dev) +* Eric Wieser +* Ethan Pronovost +* Graham Markall +* Guilherme Leobas +* Isaac Virshup +* Ivan Butygin +* Jacob Tomlinson +* Luiz Almeida +* László Károlyi +* Lucio Fernandez-Arjona +* Max Katz +* Michael Collison +* Mike Williams +* Owen Anderson +* Radu Popovici +* Ray Donnelly +* Rishabh Varshney +* Siu Kwan Lam (core dev) +* Stan Seibert (core dev) +* Stuart Archibald (core dev) +* Todd A. Anderson (core dev) +* Valentin Haenel (core dev) +* Zhihao Yuan +* ``@jeertmans`` +* ``@mugoh`` +* ``@muxator`` +* ``@rht`` + + + +Version 0.51.2 (September 2, 2020) +---------------------------------- + +This is a bugfix release for 0.51.1. It fixes a critical performance bug in the +CFG back edge computation algorithm that leads to exponential time complexity +arising in compilation for use cases with certain pathological properties. + +* PR #6195: PR 6187 Continue. Don't visit already checked successors + +Authors: + +* Graham Markall +* Siu Kwan Lam (core dev) + + +Version 0.51.1 (August 26, 2020) +-------------------------------- + +This is a bugfix release for 0.51.0, it fixes a critical bug in caching, another +critical bug in the CUDA target initialisation sequence and also fixes some +compile time performance regressions: + +* PR #6141: Fix #6130 objmode cache segfault +* PR #6146: Fix compilation slowdown due to controlflow analysis +* PR #6147: CUDA: Don't make a runtime call on import +* PR #6153: Fix for #6151. Make UnicodeCharSeq into str for comparison. +* PR #6168: Fix Issue #6167: Failure in test_cuda_submodules + +Authors: + +* Graham Markall +* Siu Kwan Lam (core dev) +* Stuart Archibald (core dev) + + +Version 0.51.0 (August 12, 2020) +-------------------------------- + +This release continues to add new features to Numba and also contains a +significant number of bug fixes and stability improvements. 
+ +Highlights of core feature changes include: + +* The compilation chain is now based on LLVM 10 (Valentin Haenel). +* Numba has internally switched to prefer non-literal types over literal ones so + as to reduce function over-specialisation, with a view to speeding up + compile times (Siu Kwan Lam). +* On the CUDA target: Support for CUDA Toolkit 11, Ampere, and Compute + Capability 8.0; Printing of ``SASS`` code for kernels; Callbacks to Python + functions can be inserted into CUDA streams, and streams are async awaitable; + Atomic ``nanmin`` and ``nanmax`` functions are added; Fixes for various + miscompilations and segfaults. (mostly Graham Markall; callbacks on + streams by Peter Würtz). + +Intel also kindly sponsored research and development that led to some exciting +new features: + +* Support for heterogeneous immutable lists and heterogeneous immutable string + key dictionaries. Also optional initial/construction value capturing for all + lists and dictionaries containing literal values (Stuart Archibald). +* A new pass-by-reference mutable structure extension type ``StructRef`` (Siu + Kwan Lam). +* Object mode blocks are now cacheable, with the side effect of numerous bug + fixes and performance improvements in caching. This also permits caching of + functions defined in closures (Siu Kwan Lam). + +Deprecations to note: + +To align with other targets, the ``argtypes`` and ``restypes`` kwargs to +``@cuda.jit`` are now deprecated; the ``bind`` kwarg is also deprecated. +Further, the ``target`` kwarg to the ``numba.jit`` decorator family is +deprecated. + +General Enhancements: + +* PR #5463: Add str(int) impl +* PR #5526: Impl. np.asarray(literal) +* PR #5619: Add support for multi-output ufuncs +* PR #5711: Division with timedelta input +* PR #5763: Support minlength argument to np.bincount +* PR #5779: Return zero array from np.dot when the arguments are empty. 
+* PR #5796: Add implementation for np.positive +* PR #5849: Setitem for records when index is StringLiteral, including literal + unroll +* PR #5856: Add support for conversion of inplace_binop to parfor. +* PR #5893: Allocate 1D iteration space one at a time for more even + distribution. +* PR #5922: Reduce objmode and unpickling overhead +* PR #5944: re-enable OpenMP in wheels +* PR #5946: Implement literal dictionaries and lists. +* PR #5956: Update numba_sysinfo.py +* PR #5978: Add structref as a mutable struct that is pass-by-ref +* PR #5980: Deprecate target kwarg for numba.jit. +* PR #6058: Add prefer_literal option to overload API + +Fixes: + +* PR #5674: Fix #3955. Allow `with objmode` to be cached +* PR #5724: Initialize process lock lazily to prevent multiprocessing issue +* PR #5783: Make np.divide and np.remainder code more similar +* PR #5808: Fix 5665 Block jit(nopython=True, forceobj=True) and suppress + njit(forceobj=True) +* PR #5834: Fix the is operator on Ellipsis +* PR #5838: Ensure ``Dispatcher.__eq__`` always returns a bool +* PR #5841: cleanup: Use PythonAPI.bool_from_bool in more places +* PR #5862: Do not leak loop iteration variables into the numba.np.npyimpl + namespace +* PR #5869: Update repomap +* PR #5879: Fix erroneous input mutation in linalg routines +* PR #5882: Type check function in jit decorator +* PR #5925: Use np.inf and -np.inf for max and min float values respectively. +* PR #5935: Fix default arguments with multiprocessing +* PR #5952: Fix "Internal error ... 
local variable 'errstr' referenced before + assignment during BoundFunction(...)" +* PR #5962: Fix SVML tests with LLVM 10 and AVX512 +* PR #5972: fix flake8 for numba/runtests.py +* PR #5995: Update setup.py with new llvmlite versions +* PR #5996: Set lower bound for llvmlite to 0.33 +* PR #6004: Fix problem in branch pruning with LiteralStrKeyDict +* PR #6017: Fixing up numba_do_raise +* PR #6028: Fix #6023 +* PR #6031: Continue 5821 +* PR #6035: Fix overspecialize of literal +* PR #6046: Fixes statement reordering bug in maximize fusion step. +* PR #6056: Fix issue on invalid inlining of non-empty build_list by + inline_arraycall +* PR #6057: fix aarch64/python_3.8 failure on master +* PR #6070: Fix overspecialized containers +* PR #6071: Remove f-strings in setup.py +* PR #6072: Fix for #6005 +* PR #6073: Fixes invalid C prototype in helper function. +* PR #6078: Duplicate NumPy's PyArray_DescrCheck macro +* PR #6081: Fix issue with cross drive use and relpath. +* PR #6083: Fix bug in initial value unify. +* PR #6087: remove invalid sanity check from randrange tests +* PR #6089: Fix invalid reference to TypingError +* PR #6097: Add function code and closure bytes into cache key +* PR #6099: Restrict upper limit of TBB version due to ABI changes. +* PR #6101: Restrict lower limit of icc_rt version due to assumed SVML bug. +* PR #6107: Fix and test #6095 +* PR #6109: Fixes an issue reported in #6094 +* PR #6111: Decouple LiteralList and LiteralStrKeyDict from tuple +* PR #6116: Fix #6102. Problem with non-unique label. 
+ +CUDA Enhancements/Fixes: + +* PR #5359: Remove special-casing of 0d arrays +* PR #5709: CUDA: Refactoring of cuda.jit and kernel / dispatcher abstractions +* PR #5732: CUDA Docs: document ``forall`` method of kernels +* PR #5745: CUDA stream callbacks and async awaitable streams +* PR #5761: Add implmentation for int types for isnan and isinf for CUDA +* PR #5819: Add support for CUDA 11 and Ampere / CC 8.0 +* PR #5826: CUDA: Add function to get SASS for kernels +* PR #5846: CUDA: Allow disabling NVVM optimizations, and fix debug issues +* PR #5851: CUDA EMM enhancements - add default get_ipc_handle implementation, + skip a test conditionally +* PR #5852: CUDA: Fix ``cuda.test()`` +* PR #5857: CUDA docs: Add notes on resetting the EMM plugin +* PR #5859: CUDA: Fix reduce docs and style improvements +* PR #6016: Fixes change of list spelling in a cuda test. +* PR #6020: CUDA: Fix #5820, adding atomic nanmin / nanmax +* PR #6030: CUDA: Don't optimize IR before sending it to NVVM +* PR #6052: Fix dtype for atomic_add_double testsuite +* PR #6080: CUDA: Prevent auto-upgrade of atomic intrinsics +* PR #6123: Fix #6121 + +Documentation Updates: + +* PR #5782: Host docs on Read the Docs +* PR #5830: doc: Mention that caching uses pickle +* PR #5963: Fix broken link to numpy ufunc signature docs +* PR #5975: restructure communication section +* PR #5981: Document bounds-checking behavior in python deviations page +* PR #5993: Docs for structref +* PR #6008: Small fix so bullet points are rendered by sphinx +* PR #6013: emphasize cuda kernel functions are asynchronous +* PR #6036: Update deprecation doc from numba.errors to numba.core.errors +* PR #6062: Change references to numba.pydata.org to https + +CI updates: + +* PR #5850: Updates the "New Issue" behaviour to better redirect users. 
+* PR #5940: Add discourse badge +* PR #5960: Setting mypy on CI + +Enhancements from user contributed PRs (with thanks!): + +* Aisha Tammy added the ability to switch off TBB support at compile time in + #5821 (continued in #6031 by Stuart Archibald). +* Alexander Stiebing fixed a reference before assignment bug in #5952. +* Alexey Kozlov fixed a bug in tuple getitem for literals in #6028. +* Andrew Eckart updated the repomap in #5869, added support for Read the Docs + in #5782, fixed a bug in the ``np.dot`` implementation to correctly handle + empty arrays in #5779 and added support for ``minlength`` to ``np.bincount`` + in #5763. +* ``@bitsisbits`` updated ``numba_sysinfo.py`` to handle HSA agents correctly in + #5956. +* Daichi Suzuo Fixed a bug in the threading backend initialisation sequence such + that it is now correctly a lazy lock in #5724. +* Eric Wieser contributed a number of patches, particularly in enhancing and + improving the ``ufunc`` capabilities: + + * #5359: Remove special-casing of 0d arrays + * #5834: Fix the is operator on Ellipsis + * #5619: Add support for multi-output ufuncs + * #5841: cleanup: Use PythonAPI.bool_from_bool in more places + * #5862: Do not leak loop iteration variables into the numba.np.npyimpl + namespace + * #5838: Ensure ``Dispatcher.__eq__`` always returns a bool + * #5830: doc: Mention that caching uses pickle + * #5783: Make np.divide and np.remainder code more similar + +* Ethan Pronovost added a guard to prevent the common mistake of applying a jit + decorator to the same function twice in #5881. 
+* Graham Markall contributed many patches to the CUDA target, as follows: + + * #6052: Fix dtype for atomic_add_double tests + * #6030: CUDA: Don't optimize IR before sending it to NVVM + * #5846: CUDA: Allow disabling NVVM optimizations, and fix debug issues + * #5826: CUDA: Add function to get SASS for kernels + * #5851: CUDA EMM enhancements - add default get_ipc_handle implementation, + skip a test conditionally + * #5709: CUDA: Refactoring of cuda.jit and kernel / dispatcher abstractions + * #5819: Add support for CUDA 11 and Ampere / CC 8.0 + * #6020: CUDA: Fix #5820, adding atomic nanmin / nanmax + * #5857: CUDA docs: Add notes on resetting the EMM plugin + * #5859: CUDA: Fix reduce docs and style improvements + * #5852: CUDA: Fix ``cuda.test()`` + * #5732: CUDA Docs: document ``forall`` method of kernels + +* Guilherme Leobas added support for ``str(int)`` in #5463 and + ``np.asarray(literal value)`` in #5526. +* Hameer Abbasi deprecated the ``target`` kwarg for ``numba.jit`` in #5980. +* Hannes Pahl added a badge to the Numba github page linking to the new + discourse forum in #5940 and also fixed a bug that permitted illegal + combinations of flags to be passed into ``@jit`` in #5808. +* Kayran Schmidt emphasized that CUDA kernel functions are asynchronous in the + documentation in #6013. +* Leonardo Uieda fixed a broken link to the NumPy ufunc signature docs in #5963. +* Lucio Fernandez-Arjona added mypy to CI and started adding type annotations to + the code base in #5960, also fixed a (de)serialization problem on the + dispatcher in #5935, improved the undefined variable error message in #5876, + added support for division with timedelta input in #5711 and implemented + ``setitem`` for records when the index is a ``StringLiteral`` in #5849. +* Ludovic Tiako documented Numba's bounds-checking behavior in the python + deviations page in #5981. +* Matt Roeschke changed all ``http`` references to ``https`` in #6062. 
+* ``@niteya-shah`` implemented ``isnan`` and ``isinf`` for integer types on the + CUDA target in #5761 and implemented ``np.positive`` in #5796. +* Peter Würtz added CUDA stream callbacks and async awaitable streams in #5745. +* ``@rht`` fixed an invalid import referred to in the deprecation documentation + in #6036. +* Sergey Pokhodenko updated the SVML tests for LLVM 10 in #5962. +* Shyam Saladi fixed a Sphinx rendering bug in #6008. + +Authors: + +* Aisha Tammy +* Alexander Stiebing +* Alexey Kozlov +* Andrew Eckart +* ``@bitsisbits`` +* Daichi Suzuo +* Eric Wieser +* Ethan Pronovost +* Graham Markall +* Guilherme Leobas +* Hameer Abbasi +* Hannes Pahl +* Kayran Schmidt +* Kozlov, Alexey +* Leonardo Uieda +* Lucio Fernandez-Arjona +* Ludovic Tiako +* Matt Roeschke +* ``@niteya-shah`` +* Peter Würtz +* Sergey Pokhodenko +* Shyam Saladi +* ``@rht`` +* Siu Kwan Lam (core dev) +* Stuart Archibald (core dev) +* Todd A. Anderson (core dev) +* Valentin Haenel (core dev) + + +Version 0.50.1 (Jun 24, 2020) +----------------------------- + +This is a bugfix release for 0.50.0, it fixes a critical bug in error reporting +and a number of other smaller issues: + +* PR #5861: Added except for possible Windows get_terminal_size exception +* PR #5876: Improve undefined variable error message +* PR #5884: Update the deprecation notices for 0.50.1 +* PR #5889: Fixes literally not forcing re-dispatch for inline='always' +* PR #5912: Fix bad attr access on certain typing templates breaking exceptions. +* PR #5918: Fix cuda test due to #5876 + +Authors: + +* ``@pepping_dore`` +* Lucio Fernandez-Arjona +* Siu Kwan Lam (core dev) +* Stuart Archibald (core dev) + + +Version 0.50.0 (Jun 10, 2020) +----------------------------- + +This is a more usual release in comparison to the others that have been made in +the last six months. It comprises the result of a number of maintenance tasks +along with some new features and a lot of bug fixes. 
+ +Highlights of core feature changes include: + +* The compilation chain is now based on LLVM 9. +* The error handling and reporting system has been improved to reduce the size + of error messages, and also improve quality and specificity. +* The CUDA target has more stream constructors available and a new function for + compiling to PTX without linking and loading the code to a device. Further, + the macro-based system for describing CUDA threads and blocks has been + replaced with standard typing and lowering implementations, for improved + debugging and extensibility. + +IMPORTANT: The backwards compatibility shim, that was present in 0.49.x to +accommodate the refactoring of Numba's internals, has been removed. If a module +is imported from a moved location an ``ImportError`` will occur. + +General Enhancements: + +* PR #5060: Enables np.sum for timedelta64 +* PR #5225: Adjust interpreter to make conditionals predicates via bool() call. +* PR #5506: Jitclass static methods +* PR #5580: Revert shim +* PR #5591: Fix #5525 Add figure for total memory to ``numba -s`` output. +* PR #5616: Simplify the ufunc kernel registration +* PR #5617: Remove /examples from the Numba repo. +* PR #5673: Fix inliners to run all passes on IR and clean up correctly. +* PR #5700: Make it easier to understand type inference: add SSA dump, use for + ``DEBUG_TYPEINFER`` +* PR #5702: Fixes for LLVM 9 +* PR #5722: Improve error messages. +* PR #5758: Support NumPy 1.18 + +Fixes: + +* PR #5390: add error handling for lookup_module +* PR #5464: Jitclass drops annotations to avoid error +* PR #5478: Fix #5471. Issue with omitted type not recognized as literal value. 
+* PR #5517: Fix numba.typed.List extend for singleton and empty iterable +* PR #5549: Check type getitem +* PR #5568: Add skip to entrypoint test on windows +* PR #5581: Revert #5568 +* PR #5602: Fix segfault caused by pop from numba.typed.List +* PR #5645: Fix SSA redundant CFG computation +* PR #5686: Fix issue with SSA not minimal +* PR #5689: Fix bug in unified_function_type (issue 5685) +* PR #5694: Skip part of slice array analysis if any part is not analyzable. +* PR #5697: Fix usedef issue with parfor loopnest variables. +* PR #5705: A fix for cases where SSA looks like a reduction variable. +* PR #5714: Fix bug in test +* PR #5717: Initialise Numba extensions ahead of any compilation starting. +* PR #5721: Fix array iterator layout. +* PR #5738: Unbreak master on buildfarm +* PR #5757: Force LLVM to use ZMM registers for vectorization. +* PR #5764: fix flake8 errors +* PR #5768: Interval example: fix import +* PR #5781: Moving record array examples to a test module +* PR #5791: Fix up no cgroups problem +* PR #5795: Restore refct removal pass and make it strict +* PR #5807: Skip failing test on POWER8 due to PPC CTR Loop problem. +* PR #5812: Fix side issue from #5792, @overload inliner cached IR being + mutated. +* PR #5815: Pin llvmlite to 0.33 +* PR #5833: Fixes the source location appearing incorrectly in error messages. 
+ +CUDA Enhancements/Fixes: + +* PR #5347: CUDA: Provide more stream constructors +* PR #5388: CUDA: Fix OOB write in test_round{f4,f8} +* PR #5437: Fix #5429: Exception using ``.get_ipc_handle(...)`` on array from + ``as_cuda_array(...)`` +* PR #5481: CUDA: Replace macros with typing and lowering implementations +* PR #5556: CUDA: Make atomic semantics match Python / NumPy, and fix #5458 +* PR #5558: CUDA: Only release primary ctx if retained +* PR #5561: CUDA: Add function for compiling to PTX (+ other small fixes) +* PR #5573: CUDA: Skip tests under cuda-memcheck that hang it +* PR #5578: Implement math.modf for CUDA target +* PR #5704: CUDA Eager compilation: Fix max_registers kwarg +* PR #5718: CUDA lib path tests: unset CUDA_PATH when CUDA_HOME unset +* PR #5800: Fix LLVM 9 IR for NVVM +* PR #5803: CUDA Update expected error messages to fix #5797 + +Documentation Updates: + +* PR #5546: DOC: Add documentation about cost model to inlining notes. +* PR #5653: Update doc with respect to try-finally case + +Enhancements from user contributed PRs (with thanks!): + +* Elias Kuthe fixed an issue with imports in the Interval example in #5768 +* Eric Wieser simplified the ufunc kernel registration mechanism in #5616 +* Ethan Pronovost patched a problem with ``__annotations__`` in ``jitclass`` in + #5464, fixed a bug that led to infinite loops in Numba's ``Type.__getitem__`` + in #5549, fixed a bug in ``np.arange`` testing in #5714 and added support for + ``@staticmethod`` to ``jitclass`` in #5506. 
+* Gabriele Gemmi implemented ``math.modf`` for the CUDA target in #5578 +* Graham Markall contributed many patches, largely to the CUDA target, as + follows: + + * #5347: CUDA: Provide more stream constructors + * #5388: CUDA: Fix OOB write in test_round{f4,f8} + * #5437: Fix #5429: Exception using ``.get_ipc_handle(...)`` on array from + ``as_cuda_array(...)`` + * #5481: CUDA: Replace macros with typing and lowering implementations + * #5556: CUDA: Make atomic semantics match Python / NumPy, and fix #5458 + * #5558: CUDA: Only release primary ctx if retained + * #5561: CUDA: Add function for compiling to PTX (+ other small fixes) + * #5573: CUDA: Skip tests under cuda-memcheck that hang it + * #5648: Unset the memory manager after EMM Plugin tests + * #5700: Make it easier to understand type inference: add SSA dump, use for + ``DEBUG_TYPEINFER`` + * #5704: CUDA Eager compilation: Fix max_registers kwarg + * #5718: CUDA lib path tests: unset CUDA_PATH when CUDA_HOME unset + * #5800: Fix LLVM 9 IR for NVVM + * #5803: CUDA Update expected error messages to fix #5797 + +* Guilherme Leobas updated the documentation surrounding try-finally in #5653 +* Hameer Abbasi added documentation about the cost model to the notes on + inlining in #5546 +* Jacques Gaudin rewrote ``numba -s`` to produce and consume a dictionary of + output about the current system in #5591 +* James Bourbeau updated min/argmin and max/argmax to handle non-leading nans + (via #5758) +* Lucio Fernandez-Arjona moved the record array examples to a test module in + #5781 and added ``np.timedelta64`` handling to ``np.sum`` in #5060 +* Pearu Peterson fixed a bug in unified_function_type in #5689 +* Sergey Pokhodenko fixed an issue impacting LLVM 10 regarding vectorization + widths on Intel SkyLake processors in #5757 +* Shan Sikdar added error handling for ``lookup_module`` in #5390 +* ``@toddrme2178`` added CI testing for NumPy 1.18 (via #5758) + +Authors: + +* Elias Kuthe +* Eric Wieser +* Ethan Pronovost +* 
Gabriele Gemmi +* Graham Markall +* Guilherme Leobas +* Hameer Abbasi +* Jacques Gaudin +* James Bourbeau +* Lucio Fernandez-Arjona +* Pearu Peterson +* Sergey Pokhodenko +* Shan Sikdar +* Siu Kwan Lam (core dev) +* Stuart Archibald (core dev) +* Todd A. Anderson (core dev) +* ``@toddrme2178`` +* Valentin Haenel (core dev) + + +Version 0.49.1 (May 7, 2020) +---------------------------- + +This is a bugfix release for 0.49.0, it fixes some residual issues with SSA +form, a critical bug in the branch pruning logic and a number of other smaller +issues: + +* PR #5587: Fixed #5586 Threading Implementation Typos +* PR #5592: Fixes #5583 Remove references to cffi_support from docs and examples +* PR #5614: Fix invalid type in resolve for comparison expr in parfors. +* PR #5624: Fix erroneous rewrite of predicate to bit const on prune. +* PR #5627: Fixes #5623, SSA local def scan based on invalid equality + assumption. +* PR #5629: Fixes naming error in array_exprs +* PR #5630: Fix #5570. Incorrect race variable detection due to SSA naming. +* PR #5638: Make literal_unroll function work as a freevar. +* PR #5648: Unset the memory manager after EMM Plugin tests +* PR #5651: Fix some SSA issues +* PR #5652: Pin to sphinx=2.4.4 to avoid problem with C declaration +* PR #5658: Fix unifying undefined first class function types issue +* PR #5669: Update example in 5m guide WRT SSA type stability. +* PR #5676: Restore ``numba.types`` as public API + +Authors: + +* Graham Markall +* Juan Manuel Cruz Martinez +* Pearu Peterson +* Sean Law +* Stuart Archibald (core dev) +* Siu Kwan Lam (core dev) + + +Version 0.49.0 (Apr 16, 2020) +----------------------------- + +This release is very large in terms of code changes. Large scale removal of +unsupported Python and NumPy versions has taken place along with a significant +amount of refactoring to simplify the Numba code base to make it easier for +contributors. 
Numba's intermediate representation has also undergone some +important changes to solve a number of long standing issues. In addition some +new features have been added and a large number of bugs have been fixed! + +IMPORTANT: In this release Numba's internals have moved about a lot. A backwards +compatibility "shim" is provided for this release so as to not immediately break +projects using Numba's internals. If a module is imported from a moved location +the shim will issue a deprecation warning and suggest how to update the import +statement for the new location. The shim will be removed in 0.50.0! + +Highlights of core feature changes include: + +* Removal of all Python 2 related code and also updating the minimum supported + Python version to 3.6, the minimum supported NumPy version to 1.15 and the + minimum supported SciPy version to 1.0. (Stuart Archibald). +* Refactoring of the Numba code base. The code is now organised into submodules + by functionality. This cleans up Numba's top level namespace. + (Stuart Archibald). +* Introduction of an ``ir.Del`` free static single assignment form for Numba's + intermediate representation (Siu Kwan Lam and Stuart Archibald). +* An OpenMP-like thread masking API has been added for use with code using the + parallel CPU backends (Aaron Meurer and Stuart Archibald). +* For the CUDA target, all kernel launches now require a configuration, this + preventing accidental launches of kernels with the old default of a single + thread in a single block. The hard-coded autotuner is also now removed, such + tuning is deferred to CUDA API calls that provide the same functionality + (Graham Markall). +* The CUDA target also gained an External Memory Management plugin interface to + allow Numba to use another CUDA-aware library for all memory allocations and + deallocations (Graham Markall). +* The Numba Typed List container gained support for construction from iterables + (Valentin Haenel). 
+* Experimental support was added for first-class function types + (Pearu Peterson). + +Enhancements from user contributed PRs (with thanks!): + +* Aaron Meurer added support for thread masking at runtime in #4615. +* Andreas Sodeur fixed a long standing bug that was preventing ``cProfile`` from + working with Numba JIT compiled functions in #4476. +* Arik Funke fixed error messages in ``test_array_reductions`` (#5278), fixed + an issue with test discovery (#5239), made it so the documentation would build + again on windows (#5453) and fixed a nested list problem in the docs in #5489. +* Antonio Russo fixed a SyntaxWarning in #5252. +* Eric Wieser added support for inferring the types of object arrays (#5348) and + iterating over 2D arrays (#5115), also fixed some compiler warnings due to + missing (void) in #5222. Also helped improve the "shim" and associated + warnings in #5485, #5488, #5498 and partly #5532. +* Ethan Pronovost fixed a problem with the shim erroneously warning for jitclass + use in #5454 and also prevented illegal return values in jitclass ``__init__`` + in #5505. +* Gabriel Majeri added SciPy 2019 talks to the docs in #5106. +* Graham Markall changed the Numba HTML documentation theme to resolve a number + of long standing issues in #5346. 
Also contributed were a large number of CUDA + enhancements and fixes, namely: + + * #5519: CUDA: Silence the test suite - Fix #4809, remove autojit, delete + prints + * #5443: Fix #5196: Docs: assert in CUDA only enabled for debug + * #5436: Fix #5408: test_set_registers_57 fails on Maxwell + * #5423: Fix #5421: Add notes on printing in CUDA kernels + * #5400: Fix #4954, and some other small CUDA testsuite fixes + * #5328: NBEP 7: External Memory Management Plugin Interface + * #5144: Fix #4875: Make #2655 test with debug expect to pass + * #5323: Document lifetime semantics of CUDA Array Interface + * #5061: Prevent kernel launch with no configuration, remove autotuner + * #5099: Fix #5073: Slices of dynamic shared memory all alias + * #5136: CUDA: Enable asynchronous operations on the default stream + * #5085: Support other itemsizes with view + * #5059: Docs: Explain how to use Memcheck with Numba, fixups in CUDA + documentation + * #4957: Add notes on overwriting gufunc inputs to docs + +* Greg Jennings fixed an issue with ``np.random.choice`` not acknowledging the + RNG seed correctly in #3897/#5310. +* Guilherme Leobas added support for ``np.isnat`` in #5293. +* Henry Schreiner made the llvmlite requirements more explicit in + requirements.txt in #5150. +* Ivan Butygin helped fix an issue with parfors sequential lowering in + #5114/#5250. +* Jacques Gaudin fixed a bug for Python >= 3.8 in ``numba -s`` in #5548. +* Jim Pivarski added some hints for debugging entry points in #5280. +* John Kirkham added ``numpy.dtype`` coercion for the ``dtype`` argument to CUDA + device arrays in #5253. +* Leo Fang added a list of libraries that support ``__cuda_array_interface__`` + in #5104. +* Lucio Fernandez-Arjona added ``getitem`` for the NumPy record type when the + index is a ``StringLiteral`` type in #5182 and improved the documentation + rendering via additions to the TOC and removal of numbering in #5450. +* Mads R. B. 
Kristensen fixed an issue with ``__cuda_array_interface__`` not + requiring the context in #5189. +* Marcin Tolysz added support for nested modules in AOT compilation in #5174. +* Mike Williams fixed some issues with NumPy records and ``getitem`` in the CUDA + simulator in #5343. +* Pearu Peterson added experimental support for first-class function types in + #5287 (and fixes in #5459, #5473/#5429, and #5557). +* Ravi Teja Gutta added support for ``np.flip`` in #4376/#5313. +* Rohit Sanjay fixed an issue with type refinement for unicode input supplied to + typed-list ``extend()`` (#5295) and fixed unicode ``.strip()`` to strip all + whitespace characters in #5213. +* Vladimir Lukyanov fixed an awkward bug in ``typed.dict`` in #5361, added a fix + to ensure the LLVM and assembly dumps are highlighted correctly in #5357 and + implemented a Numba IR Lexer and added highlighting to Numba IR dumps in + #5333. +* hdf fixed an issue with the ``boundscheck`` flag in the CUDA jit target in + #5257. + +General Enhancements: + +* PR #4615: Allow masking threads out at runtime +* PR #4798: Add branch pruning based on raw predicates. +* PR #5115: Add support for iterating over 2D arrays +* PR #5117: Implement ord()/chr() +* PR #5122: Remove Python 2. +* PR #5127: Calling convention adaptor for boxer/unboxer to call jitcode +* PR #5151: implement None-typed typed-list +* PR #5174: Nested modules https://github.com/numba/numba/issues/4739 +* PR #5182: Add getitem for Record type when index is StringLiteral +* PR #5185: extract code-gen utilities from closures +* PR #5197: Refactor Numba, part I +* PR #5210: Remove more unsupported Python versions from build tooling. +* PR #5212: Adds support for viewing the CFG of the ELF disassembly. 
+* PR #5227: Immutable typed-list +* PR #5231: Added support for ``np.asarray`` to be used with + ``numba.typed.List`` +* PR #5235: Added property ``dtype`` to ``numba.typed.List`` +* PR #5272: Refactor parfor: split up ParforPass +* PR #5281: Make IR ir.Del free until legalized. +* PR #5287: First-class function type +* PR #5293: np.isnat +* PR #5294: Create typed-list from iterable +* PR #5295: refine typed-list on unicode input to extend +* PR #5296: Refactor parfor: better exception from passes +* PR #5308: Provide ``numba.extending.is_jitted`` +* PR #5320: refactor array_analysis +* PR #5325: Let literal_unroll accept types.Named*Tuple +* PR #5330: refactor common operation in parfor lowering into a new util +* PR #5333: Add: highlight Numba IR dump +* PR #5342: Support for tuples passed to parfors. +* PR #5348: Add support for inferring the types of object arrays +* PR #5351: SSA again +* PR #5352: Add shim to accommodate refactoring. +* PR #5356: implement allocated parameter in njit +* PR #5369: Make test ordering more consistent across feature availability +* PR #5428: Wip/deprecate jitclass location +* PR #5441: Additional changes to first class function +* PR #5455: Move to llvmlite 0.32.* +* PR #5457: implement repr for untyped lists + +Fixes: + +* PR #4476: Another attempt at fixing frame injection in the dispatcher tracing + path +* PR #4942: Prevent some parfor aliasing. Rename copied function var to prevent + recursive type locking. +* PR #5092: Fix 5087 +* PR #5150: More explicit llvmlite requirement in requirements.txt +* PR #5172: fix version spec for llvmlite +* PR #5176: Normalize kws going into fold_arguments. 
+* PR #5183: pass 'inline' explicitly to overload +* PR #5193: Fix CI failure due to missing files when installed +* PR #5213: Fix ``.strip()`` to strip all whitespace characters +* PR #5216: Fix namedtuple mistreated by dispatcher as simple tuple +* PR #5222: Fix compiler warnings due to missing (void) +* PR #5232: Fixes a bad import that breaks master +* PR #5239: fix test discovery for unittest +* PR #5247: Continue PR #5126 +* PR #5250: Part fix/5098 +* PR #5252: Trivially fix SyntaxWarning +* PR #5276: Add prange variant to has_no_side_effect. +* PR #5278: fix error messages in test_array_reductions +* PR #5310: PR #3897 continued +* PR #5313: Continues PR #4376 +* PR #5318: Remove AUTHORS file reference from MANIFEST.in +* PR #5327: Add warning if FNV hashing is found as the default for CPython. +* PR #5338: Remove refcount pruning pass +* PR #5345: Disable test failing due to removed pass. +* PR #5357: Small fix to have llvm and asm highlighted properly +* PR #5361: 5081 typed.dict +* PR #5431: Add tolerance to numba extension module entrypoints. +* PR #5432: Fix code causing compiler warnings. 
+* PR #5445: Remove undefined variable +* PR #5454: Don't warn for numba.experimental.jitclass +* PR #5459: Fixes issue 5448 +* PR #5480: Fix for #5477, literal_unroll KeyError searching for getitems +* PR #5485: Show the offending module in "no direct replacement" error message +* PR #5488: Add missing ``numba.config`` shim +* PR #5495: Fix missing null initializer for variable after phi strip +* PR #5498: Make the shim deprecation warnings work on python 3.6 too +* PR #5505: Better error message if __init__ returns value +* PR #5527: Attempt to fix #5518 +* PR #5529: PR #5473 continued +* PR #5532: Make ``numba.`` available without an import +* PR #5542: Fixes RC2 module shim bug +* PR #5548: Fix #5537 Removed reference to ``platform.linux_distribution`` +* PR #5555: Fix #5515 by reverting changes to ArrayAnalysis +* PR #5557: First-class function call cannot use keyword arguments +* PR #5569: Fix RewriteConstGetitems not registering calltype for new expr +* PR #5571: Pin down llvmlite requirement + +CUDA Enhancements/Fixes: + +* PR #5061: Prevent kernel launch with no configuration, remove autotuner +* PR #5085: Support other itemsizes with view +* PR #5099: Fix #5073: Slices of dynamic shared memory all alias +* PR #5104: Add a list of libraries that support __cuda_array_interface__ +* PR #5136: CUDA: Enable asynchronous operations on the default stream +* PR #5144: Fix #4875: Make #2655 test with debug expect to pass +* PR #5189: __cuda_array_interface__ not requiring context +* PR #5253: Coerce ``dtype`` to ``numpy.dtype`` +* PR #5257: boundscheck fix +* PR #5319: Make user facing error string use abs path not rel. 
+* PR #5323: Document lifetime semantics of CUDA Array Interface +* PR #5328: NBEP 7: External Memory Management Plugin Interface +* PR #5343: Fix cuda spoof +* PR #5400: Fix #4954, and some other small CUDA testsuite fixes +* PR #5436: Fix #5408: test_set_registers_57 fails on Maxwell +* PR #5519: CUDA: Silence the test suite - Fix #4809, remove autojit, delete + prints + +Documentation Updates: + +* PR #4957: Add notes on overwriting gufunc inputs to docs +* PR #5059: Docs: Explain how to use Memcheck with Numba, fixups in CUDA + documentation +* PR #5106: Add SciPy 2019 talks to docs +* PR #5147: Update master for 0.48.0 updates +* PR #5155: Explain what inlining at Numba IR level will do +* PR #5161: Fix README.rst formatting +* PR #5207: Remove AUTHORS list +* PR #5249: fix target path for See also +* PR #5262: fix typo in inlining docs +* PR #5270: fix 'see also' in typeddict docs +* PR #5280: Added some hints for debugging entry points. +* PR #5297: Update docs with intro to {g,}ufuncs. +* PR #5326: Update installation docs with OpenMP requirements. +* PR #5346: Docs: use sphinx_rtd_theme +* PR #5366: Remove reference to Python 2.7 in install check output +* PR #5423: Fix #5421: Add notes on printing in CUDA kernels +* PR #5438: Update package deps for doc building. +* PR #5440: Bump deprecation notices. +* PR #5443: Fix #5196: Docs: assert in CUDA only enabled for debug +* PR #5450: Docs: remove numbers and add titles to TOC +* PR #5453: fix building docs on windows +* PR #5489: docs: fix rendering of nested bulleted list + +CI updates: + +* PR #5314: Update the image used in Azure CI for OSX. +* PR #5360: Remove Travis CI badge. + +Authors: + +* Aaron Meurer +* Andreas Sodeur +* Antonio Russo +* Arik Funke +* Eric Wieser +* Ethan Pronovost +* Gabriel Majeri +* Graham Markall +* Greg Jennings +* Guilherme Leobas +* hdf +* Henry Schreiner +* Ivan Butygin +* Jacques Gaudin +* Jim Pivarski +* John Kirkham +* Leo Fang +* Lucio Fernandez-Arjona +* Mads R. B. 
Kristensen +* Marcin Tolysz +* Mike Williams +* Pearu Peterson +* Ravi Teja Gutta +* Rohit Sanjay +* Siu Kwan Lam (core dev) +* Stan Seibert (core dev) +* Stuart Archibald (core dev) +* Todd A. Anderson (core dev) +* Valentin Haenel (core dev) +* Vladimir Lukyanov + + +Version 0.48.0 (Jan 27, 2020) +----------------------------- + +This release is particularly small as it was present to catch anything that +missed the 0.47.0 deadline (the deadline deliberately coincided with the end of +support for Python 2.7). The next release will be considerably larger. + +The core changes in this release are dominated by the start of the clean up +needed for the end of Python 2.7 support, improvements to the CUDA target and +support for numerous additional unicode string methods. + +Enhancements from user contributed PRs (with thanks!): + +* Brian Wignall fixed more spelling typos in #4998. +* Denis Smirnov added support for string methods ``capitalize`` (#4823), + ``casefold`` (#4824), ``swapcase`` (#4825), ``rsplit`` (#4834), ``partition`` + (#4845) and ``splitlines`` (#4849). +* Elena Totmenina extended support for string methods ``startswith`` (#4867) and + added ``endswith`` (#4868). 
+* Eric Wieser made ``type_callable`` return the decorated function itself in + #4760 +* Ethan Pronovost added support for ``np.argwhere`` in #4617 +* Graham Markall contributed a large number of CUDA enhancements and fixes, + namely: + + * #5068: Remove Python 3.4 backports from utils + * #4975: Make ``device_array_like`` create contiguous arrays (Fixes #4832) + * #5023: Don't launch ForAll kernels with 0 elements (Fixes #5017) + * #5016: Fix various issues in CUDA library search (Fixes #4979) + * #5014: Enable use of records and bools for shared memory, remove ddt, add + additional transpose tests + * #4964: Fix #4628: Add more appropriate typing for CUDA device arrays + * #5007: test_consuming_strides: Keep dev array alive + * #4997: State that CUDA Toolkit 8.0 required in docs + +* James Bourbeau added the Python 3.8 classifier to setup.py in #5027. +* John Kirkham added a clarification to the ``__cuda_array_interface__`` + documentation in #5049. +* Leo Fang Fixed an indexing problem in ``dummyarray`` in #5012. +* Marcel Bargull fixed a build and test issue for Python 3.8 in #5029. +* Maria Rubtsov added support for string methods ``isdecimal`` (#4842), + ``isdigit`` (#4843), ``isnumeric`` (#4844) and ``replace`` (#4865). 
+ +General Enhancements: + +* PR #4760: Make type_callable return the decorated function +* PR #5010: merge string prs + + This merge PR included the following: + + * PR #4823: Implement str.capitalize() based on CPython + * PR #4824: Implement str.casefold() based on CPython + * PR #4825: Implement str.swapcase() based on CPython + * PR #4834: Implement str.rsplit() based on CPython + * PR #4842: Implement str.isdecimal + * PR #4843: Implement str.isdigit + * PR #4844: Implement str.isnumeric + * PR #4845: Implement str.partition() based on CPython + * PR #4849: Implement str.splitlines() based on CPython + * PR #4865: Implement str.replace + * PR #4867: Functionality extension str.startswith() based on CPython + * PR #4868: Add functionality for str.endswith() + +* PR #5039: Disable help messages. +* PR #4617: Add coverage for ``np.argwhere`` + +Fixes: + +* PR #4724: Only use lives (and not aliases) to create post parfor live set. +* PR #4998: Fix more spelling typos +* PR #5024: Propagate semantic constants ahead of static rewrites. +* PR #5027: Add Python 3.8 classifier to setup.py +* PR #5046: Update setup.py and buildscripts for dependency requirements +* PR #5053: Convert from arrays to names in define() and don't invalidate for + multiple consistent defines. +* PR #5058: Permit mixed int types in wrap_index +* PR #5078: Catch the use of global typed-list in JITed functions +* PR #5092: Fix #5087, bug in bytecode analysis. 
+ +CUDA Enhancements/Fixes: + +* PR #4964: Fix #4628: Add more appropriate typing for CUDA device arrays +* PR #4975: Make ``device_array_like`` create contiguous arrays (Fixes #4832) +* PR #4997: State that CUDA Toolkit 8.0 required in docs +* PR #5007: test_consuming_strides: Keep dev array alive +* PR #5012: Fix IndexError when accessing the "-1" element of dummyarray +* PR #5014: Enable use of records and bools for shared memory, remove ddt, add + additional transpose tests +* PR #5016: Fix various issues in CUDA library search (Fixes #4979) +* PR #5023: Don't launch ForAll kernels with 0 elements (Fixes #5017) +* PR #5068: Remove Python 3.4 backports from utils + +Documentation Updates: + +* PR #5049: Clarify what dictionary means +* PR #5062: Update docs for updated version requirements +* PR #5090: Update deprecation notices for 0.48.0 + +CI updates: + +* PR #5029: Install optional dependencies for Python 3.8 tests +* PR #5040: Drop Py2.7 and Py3.5 from public CI +* PR #5048: Fix CI py38 + +Authors: + +* Brian Wignall +* Denis Smirnov +* Elena Totmenina +* Eric Wieser +* Ethan Pronovost +* Graham Markall +* James Bourbeau +* John Kirkham +* Leo Fang +* Marcel Bargull +* Maria Rubtsov +* Siu Kwan Lam (core dev) +* Stan Seibert (core dev) +* Stuart Archibald (core dev) +* Todd A. Anderson (core dev) +* Valentin Haenel (core dev) + + +Version 0.47.0 (Jan 2, 2020) +----------------------------- + +This release expands the capability of Numba in a number of important areas and +is also significant as it is the last major point release with support for +Python 2 and Python 3.5 included. The next release (0.48.0) will be for Python +3.6+ only! (This follows NumPy's deprecation schedule as specified in +`NEP 29 <https://numpy.org/neps/nep-0029-deprecation_policy.html>`_.) 
+ +Highlights of core feature changes include: + +* Full support for Python 3.8 (Siu Kwan Lam) +* Opt-in bounds checking (Aaron Meurer) +* Support for ``map``, ``filter`` and ``reduce`` (Stuart Archibald) + +Intel also kindly sponsored research and development that led to some exciting +new features: + +* Initial support for basic ``try``/``except`` use (Siu Kwan Lam) +* The ability to pass functions created from closures/lambdas as arguments + (Stuart Archibald) +* ``sorted`` and ``list.sort()`` now accept the ``key`` argument (Stuart + Archibald and Siu Kwan Lam) +* A new compiler pass triggered through the use of the function + ``numba.literal_unroll`` which permits iteration over heterogeneous tuples + and constant lists of constants. (Stuart Archibald) + +Enhancements from user contributed PRs (with thanks!): + +* Ankit Mahato added a reference to a new talk on Numba at PyCon India 2019 in + #4862 +* Brian Wignall kindly fixed some spelling mistakes and typos in #4909 +* Denis Smirnov wrote numerous methods to considerably enhance string support + including: + + * ``str.rindex()`` in #4861 + * ``str.isprintable()`` in #4836 + * ``str.index()`` in #4860 + * ``start/end`` parameters for ``str.find()`` in #4866 + * ``str.isspace()`` in #4835 + * ``str.isidentifier()`` in #4837 + * ``str.rpartition()`` in #4841 + * ``str.lower()`` and ``str.islower()`` in #4651 + +* Elena Totmenina implemented ``str.isalnum()``, ``str.isalpha()`` and + ``str.isascii()`` in #4839, #4840 and #4847 respectively. +* Eric Larson fixed a bug in literal comparison in #4710 +* Ethan Pronovost updated the ``np.arange`` implementation in #4770 to allow + the use of the ``dtype`` key word argument and also added ``bool`` + implementations for several types in #4715. 
+* Graham Markall fixed some issues with the CUDA target, namely: + + * #4931: Added physical limits for CC 7.0 / 7.5 to CUDA autotune + * #4934: Fixed bugs in TestCudaWarpOperations + * #4938: Improved errors / warnings for the CUDA vectorize decorator + +* Guilherme Leobas fixed a typo in the ``urem`` implementation in #4667 +* Isaac Virshup contributed a number of patches that fixed bugs, added support + for more NumPy functions and enhanced Python feature support. These + contributions included: + + * #4729: Allow array construction with mixed type shape tuples + * #4904: Implementing ``np.lcm`` + * #4780: Implement np.gcd and math.gcd + * #4779: Make slice constructor more similar to python. + * #4707: Added support for slice.indices + * #4578: Clarify numba ufunc supported features + +* James Bourbeau fixed some issues with tooling, #4794 added ``setuptools`` as a + dependency and #4501 added pre-commit hooks for ``flake8`` compliance. +* Leo Fang made ``numba.dummyarray.Array`` iterable in #4629 +* Marc Garcia fixed the ``numba.jit`` parameter name signature_or_function in + #4703 +* Marcelo Duarte Trevisani patched the llvmlite requirement to ``>=0.30.0`` in + #4725 +* Matt Cooper fixed a long standing CI problem in #4737 by removing maxParallel + from Azure Pipelines. +* Matti Picus fixed an issue with ``collections.abc`` in #4734 +* Rob Ennis patched a bug in ``np.interp`` ``float32`` handling in #4911 +* VDimir fixed a bug in array transposition layouts in #4777 and re-enabled and + fixed some idle tests in #4776. 
+* Vyacheslav Smirnov enabled support for ``str.istitle()`` in #4645 + +General Enhancements: + +* PR #4432: Bounds checking +* PR #4501: Add pre-commit hooks +* PR #4536: Handle kw args in inliner when callee is a function +* PR #4599: Permits closures to become functions, enables map(), filter() +* PR #4611: Implement method title() for unicode based on Cpython +* PR #4645: Enable support for istitle() method for unicode string +* PR #4651: Implement str.lower() and str.islower() +* PR #4652: Implement str.rfind() +* PR #4695: Refactor `overload*` and support `jit_options` and `inline` +* PR #4707: Added support for slice.indices +* PR #4715: Add `bool` overload for several types +* PR #4729: Allow array construction with mixed type shape tuples +* PR #4755: Python3.8 support +* PR #4756: Add parfor support for ndarray.fill. +* PR #4768: Update typeconv error message to ask for sys.executable. +* PR #4770: Update `np.arange` implementation with `@overload` +* PR #4779: Make slice constructor more similar to python. +* PR #4780: Implement np.gcd and math.gcd +* PR #4794: Add setuptools as a dependency +* PR #4802: put git hash into build string +* PR #4803: Better compiler error messages for improperly used reduction + variables. 
+* PR #4817: Typed list implement and expose allocation +* PR #4818: Typed list faster copy +* PR #4835: Implement str.isspace() based on CPython +* PR #4836: Implement str.isprintable() based on CPython +* PR #4837: Implement str.isidentifier() based on CPython +* PR #4839: Implement str.isalnum() based on CPython +* PR #4840: Implement str.isalpha() based on CPython +* PR #4841: Implement str.rpartition() based on CPython +* PR #4847: Implement str.isascii() based on CPython +* PR #4851: Add graphviz output for FunctionIR +* PR #4854: Python3.8 looplifting +* PR #4858: Implement str.expandtabs() based on CPython +* PR #4860: Implement str.index() based on CPython +* PR #4861: Implement str.rindex() based on CPython +* PR #4866: Support params start/end for str.find() +* PR #4874: Bump to llvmlite 0.31 +* PR #4896: Specialise arange dtype on arch + python version. +* PR #4902: basic support for try except +* PR #4904: Implement np.lcm +* PR #4910: loop canonicalisation and type aware tuple unroller/loop body + versioning passes +* PR #4961: Update hash(tuple) for Python 3.8. +* PR #4977: Implement sort/sorted with key. +* PR #4987: Add `is_internal` property to all Type classes. + +Fixes: + +* PR #4090: Update to LLVM8 memset/memcpy intrinsic +* PR #4582: Convert sub to add and div to mul when doing the reduction across + the per-thread reduction array. +* PR #4648: Handle 0 correctly as slice parameter. +* PR #4660: Remove multiply defined variables from all blocks' equivalence sets. +* PR #4672: Fix pickling of dufunc +* PR #4710: BUG: Comparison for literal +* PR #4718: Change get_call_table to support intermediate Vars. +* PR #4725: Requires llvmlite >=0.30.0 +* PR #4734: prefer to import from collections.abc +* PR #4736: fix flake8 errors +* PR #4776: Fix and enable idle tests from test_array_manipulation +* PR #4777: Fix transpose output array layout +* PR #4782: Fix issue with SVML (and knock-on function resolution effects). 
+* PR #4785: Treat 0d arrays like scalars. +* PR #4787: fix missing incref on flags +* PR #4789: fix typos in numba/targets/base.py +* PR #4791: fix typos +* PR #4811: fix spelling in now-failing tests +* PR #4852: windowing test should check equality only up to double precision + errors +* PR #4881: fix refining list by using extend on an iterator +* PR #4882: Fix return type in arange and zero step size handling. +* PR #4885: suppress spurious RuntimeWarning about ufunc sizes +* PR #4891: skip the xfail test for now. Py3.8 CFG refactor seems to have + changed the test case +* PR #4892: regex needs to accept singular form of "argument" +* PR #4901: fix typed list equals +* PR #4909: Fix some spelling typos +* PR #4911: np.interp bugfix for float32 handling +* PR #4920: fix creating list with JIT disabled +* PR #4921: fix creating dict with JIT disabled +* PR #4935: Better handling of prange with multiple reductions on the same + variable. +* PR #4946: Improve the error message for `raise <string>`. +* PR #4955: Move overload of literal_unroll to avoid circular dependency that + breaks Python 2.7 +* PR #4962: Fix test error on windows +* PR #4973: Fixes a bug in the relabelling logic in literal_unroll. +* PR #4978: Fix overload_method problem with stararg +* PR #4981: Add ind_to_const to enable fewer equivalence classes. +* PR #4991: Continuation of #4588 (Let dead code removal handle removing more of + the unneeded code after prange conversion to parfor) +* PR #4994: Remove xfail for test which has since had underlying issue fixed. +* PR #5018: Fix #5011. 
+* PR #5019: skip pycc test on Python 3.8 + macOS because of distutils issue + +CUDA Enhancements/Fixes: + +* PR #4629: Make numba.dummyarray.Array iterable +* PR #4675: Bump cuda array interface to version 2 +* PR #4741: Update choosing the "CUDA_PATH" for windows +* PR #4838: Permit ravel('A') for contig device arrays in CUDA target +* PR #4931: Add physical limits for CC 7.0 / 7.5 to autotune +* PR #4934: Fix fails in TestCudaWarpOperations +* PR #4938: Improve errors / warnings for cuda vectorize decorator + +Documentation Updates: + +* PR #4418: Directed graph task roadmap +* PR #4578: Clarify numba ufunc supported features +* PR #4655: fix sphinx build warning +* PR #4667: Fix typo on urem implementation +* PR #4669: Add link to ParallelAccelerator paper. +* PR #4703: Fix numba.jit parameter name signature_or_function +* PR #4862: Addition of PyCon India 2019 talk on Numba +* PR #4947: Document jitclass with numba.typed use. +* PR #4958: Add docs for `try..except` +* PR #4993: Update deprecations for 0.47 + +CI Updates: + +* PR #4737: remove maxParallel from Azure Pipelines +* PR #4767: pin to 2.7.16 for py27 on osx +* PR #4781: WIP/runtest cf pytest + +Authors: + +* Aaron Meurer +* Ankit Mahato +* Brian Wignall +* Denis Smirnov +* Ehsan Totoni (core dev) +* Elena Totmenina +* Eric Larson +* Ethan Pronovost +* Giovanni Cavallin +* Graham Markall +* Guilherme Leobas +* Isaac Virshup +* James Bourbeau +* Leo Fang +* Marc Garcia +* Marcelo Duarte Trevisani +* Matt Cooper +* Matti Picus +* Rob Ennis +* Rujal Desai +* Siu Kwan Lam (core dev) +* Stan Seibert (core dev) +* Stuart Archibald (core dev) +* Todd A. Anderson (core dev) +* VDimir +* Valentin Haenel (core dev) +* Vyacheslav Smirnov + + +Version 0.46.0 +-------------- + +This release significantly reworked one of the main parts of Numba, the compiler +pipeline, to make it more extensible and easier to use. The purpose of this was +to continue enhancing Numba's ability for use as a compiler toolkit. 
In a +similar vein, Numba now has an extension registration mechanism to allow other +Numba-using projects to automatically have their Numba JIT compilable functions +discovered. There were also a number of other related compiler toolkit +enhancement added along with some more NumPy features and a lot of bug fixes. + +This release has updated the CUDA Array Interface specification to version 2, +which clarifies the `strides` attribute for C-contiguous arrays and specifies +the treatment for zero-size arrays. The implementation in Numba has been +changed and may affect downstream packages relying on the old behavior +(see issue #4661). + +Enhancements from user contributed PRs (with thanks!): + +* Aaron Meurer fixed some Python issues in the code base in #4345 and #4341. +* Ashwin Srinath fixed a CUDA performance bug via #4576. +* Ethan Pronovost added support for triangular indices functions in #4601 (the + NumPy functions ``tril_indices``, ``tril_indices_from``, ``triu_indices``, and + ``triu_indices_from``). +* Gerald Dalley fixed a tear down race occurring in Python 2. +* Gregory R. Lee fixed the use of deprecated ``inspect.getargspec``. +* Guilherme Leobas contributed five PRs, adding support for ``np.append`` and + ``np.count_nonzero`` in #4518 and #4386. The typed List was fixed to accept + unsigned integers in #4510. #4463 made a fix to NamedTuple internals and #4397 + updated the docs for ``np.sum``. +* James Bourbeau added a new feature to permit the automatic application of the + `jit` decorator to a whole module in #4331. Also some small fixes to the docs + and the code base were made in #4447 and #4433, and a fix to inplace array + operation in #4228. +* Jim Crist fixed a bug in the rendering of patched errors in #4464. +* Leo Fang updated the CUDA Array Interface contract in #4609. +* Pearu Peterson added support for Unicode based NumPy arrays in #4425. +* Peter Andreas Entschev fixed a CUDA concurrency bug in #4581. 
+* Lucio Fernandez-Arjona extended Numba's ``np.sum`` support to now accept the + ``dtype`` kwarg in #4472. +* Pedro A. Morales Maries added support for ``np.cross`` in #4128 and also added + the necessary extension ``numba.numpy_extensions.cross2d`` in #4595. +* David Hoese, Eric Firing, Joshua Adelman, and Juan Nunez-Iglesias all made + documentation fixes in #4565, #4482, #4455, #4375 respectively. +* Vyacheslav Smirnov and Rujal Desai enabled support for ``count()`` on unicode + strings in #4606. + +General Enhancements: + +* PR #4113: Add rewrite for semantic constants. +* PR #4128: Add np.cross support +* PR #4162: Make IR comparable and legalize it. +* PR #4208: R&D inlining, jitted and overloaded. +* PR #4331: Automatic JIT of called functions +* PR #4353: Inspection tool to check what numba supports +* PR #4386: Implement np.count_nonzero +* PR #4425: Unicode array support +* PR #4427: Entrypoints for numba extensions +* PR #4467: Literal dispatch +* PR #4472: Allow dtype input argument in np.sum +* PR #4513: New compiler. +* PR #4518: add support for np.append +* PR #4554: Refactor NRT C-API +* PR #4556: 0.46 scheduled deprecations +* PR #4567: Add env var to disable performance warnings. +* PR #4568: add np.array_equal support +* PR #4595: Implement numba.cross2d +* PR #4601: Add triangular indices functions +* PR #4606: Enable support for count() method for unicode string + +Fixes: + +* PR #4228: Fix inplace operator error for arrays +* PR #4282: Detect and raise unsupported on generator expressions +* PR #4305: Don't allow the allocation of mutable objects written into a + container to be hoisted. 
+* PR #4311: Avoid deprecated use of inspect.getargspec +* PR #4328: Replace GC macro with function call +* PR #4330: Loosen up typed container casting checks +* PR #4341: Fix some coding lines at the top of some files (utf8 -> utf-8) +* PR #4345: Replace "import \*" with explicit imports in numba/types +* PR #4346: Fix incorrect alg in isupper for ascii strings. +* PR #4349: test using jitclass in typed-list +* PR #4361: Add allocation hoisting info to LICM section at diagnostic L4 +* PR #4366: Offset search box to avoid wrapping on some pages with Safari. + Fixes #4365. +* PR #4372: Replace all "except BaseException" with "except Exception". +* PR #4407: Restore the "free" conda channel for NumPy 1.10 support. +* PR #4408: Add lowering for constant bytes. +* PR #4409: Add exception chaining for better error context +* PR #4411: Name of type should not contain user facing description for debug. +* PR #4412: Fix #4387. Limit the number of return types for recursive functions +* PR #4426: Fixed two module teardown races in py2. +* PR #4431: Fix and test numpy.random.random_sample(n) for np117 +* PR #4463: NamedTuple - Raises an error on non-iterable elements +* PR #4464: Add a newline in patched errors +* PR #4474: Fix liveness for remove dead of parfors (and other IR extensions) +* PR #4510: Make List.__getitem__ accept unsigned parameters +* PR #4512: Raise specific error at typing time for iteration on >1D array. +* PR #4532: Fix static_getitem with Literal type as index +* PR #4547: Update to inliner cost model information. +* PR #4557: Use specific random number seed when generating arbitrary test data +* PR #4559: Adjust test timeouts +* PR #4564: Skip unicode array tests on ppc64le that trigger an LLVM bug +* PR #4621: Fix packaging issue due to missing numba/cext +* PR #4623: Fix issue 4520 due to storage model mismatch +* PR #4644: Updates for llvmlite 0.30.0 + +CUDA Enhancements/Fixes: + +* PR #4410: Fix #4111. 
cudasim mishandling recarray +* PR #4576: Replace use of `np.prod` with `functools.reduce` for computing size + from shape +* PR #4581: Prevent taking the GIL in ForAll +* PR #4592: Fix #4589. Just pass NULL for b2d_func for constant dynamic + sharedmem +* PR #4609: Update CUDA Array Interface & Enforce Numba compliance +* PR #4619: Implement math.{degrees, radians} for the CUDA target. +* PR #4675: Bump cuda array interface to version 2 + +Documentation Updates: + +* PR #4317: Add docs for ARMv8/AArch64 +* PR #4318: Add supported platforms to the docs. Closes #4316 +* PR #4375: Add docstrings to inspect methods +* PR #4388: Update Python 2.7 EOL statement +* PR #4397: Add note about np.sum +* PR #4447: Minor parallel performance tips edits +* PR #4455: Clarify docs for typed dict with regard to arrays +* PR #4482: Fix example in guvectorize docstring. +* PR #4541: fix two typos in architecture.rst +* PR #4548: Document numba.extending.intrinsic and inlining. +* PR #4565: Fix typo in jit-compilation docs +* PR #4607: add dependency list to docs +* PR #4614: Add documentation for implementing new compiler passes. + +CI Updates: + +* PR #4415: Make 32bit incremental builds on linux not use free channel +* PR #4433: Removes stale azure comment +* PR #4493: Fix Overload Inliner wrt CUDA Intrinsics +* PR #4593: Enable Azure CI batching + +Contributors: + +* Aaron Meurer +* Ashwin Srinath +* David Hoese +* Ehsan Totoni (core dev) +* Eric Firing +* Ethan Pronovost +* Gerald Dalley +* Gregory R. Lee +* Guilherme Leobas +* James Bourbeau +* Jim Crist +* Joshua Adelman +* Juan Nunez-Iglesias +* Leo Fang +* Lucio Fernandez-Arjona +* Pearu Peterson +* Pedro A. Morales Marie +* Peter Andreas Entschev +* Rujal Desai +* Siu Kwan Lam (core dev) +* Stan Seibert (core dev) +* Stuart Archibald (core dev) +* Todd A. 
Anderson (core dev) +* Valentin Haenel (core dev) +* Vyacheslav Smirnov + + +Version 0.45.1 +-------------- + +This patch release addresses some regressions reported in the 0.45.0 release and +adds support for NumPy 1.17: + +* PR #4325: accept scalar/0d-arrays +* PR #4338: Fix #4299. Parfors reduction vars not deleted. +* PR #4350: Use process level locks for fork() only. +* PR #4354: Try to fix #4352. +* PR #4357: Fix np1.17 isnan, isinf, isfinite ufuncs +* PR #4363: Fix np.interp for np1.17 nan handling +* PR #4371: Fix np1.17 random function non-aliasing + +Contributors: + +* Siu Kwan Lam (core dev) +* Stuart Archibald (core dev) +* Valentin Haenel (core dev) + + +Version 0.45.0 +-------------- + +In this release, Numba gained an experimental :ref:`numba.typed.List +<feature-typed-list>` container as a future replacement of the :ref:`reflected +list <feature-reflected-list>`. In addition, functions decorated with +``parallel=True`` can now be cached to reduce compilation overhead associated +with the auto-parallelization. + + +Enhancements from user contributed PRs (with thanks!): + +* James Bourbeau added the Numba version to reportable error messages in #4227, + added the ``signature`` parameter to ``inspect_types`` in #4200, improved the + docstring of ``normalize_signature`` in #4205, and fixed #3658 by adding + reference counting to ``register_dispatcher`` in #4254 + +* Guilherme Leobas implemented the dominator tree and dominance frontier + algorithms in #4216 and #4149, respectively. + +* Nick White fixed the issue with ``round`` in the CUDA target in #4137. + +* Joshua Adelman added support for determining if a value is in a ``range`` + (i.e. ``x in range(...)``) in #4129, and added windowing functions + (``np.bartlett``, ``np.hamming``, ``np.blackman``, ``np.hanning``, + ``np.kaiser``) from NumPy in #4076. 
+ +* Lucio Fernandez-Arjona added support for ``np.select`` in #4077 + +* Rob Ennis added support for ``np.flatnonzero`` in #4157 + +* Keith Kraus extended the ``__cuda_array_interface__`` with an optional mask + attribute in #4199. + +* Gregory R. Lee replaced deprecated use of ``inspect.getargspec`` in #4311. + + +General Enhancements: + +* PR #4328: Replace GC macro with function call +* PR #4311: Avoid deprecated use of inspect.getargspec +* PR #4296: Slacken window function testing tol on ppc64le +* PR #4254: Add reference counting to register_dispatcher +* PR #4239: Support len() of multi-dim arrays in array analysis +* PR #4234: Raise informative error for np.kron array order +* PR #4232: Add unicodetype db, low level str functions and examples. +* PR #4229: Make hashing cacheable +* PR #4227: Include numba version in reportable error message +* PR #4216: Add dominator tree +* PR #4200: Add signature parameter to inspect_types +* PR #4196: Catch missing imports of internal functions. +* PR #4180: Update use of unlowerable global message. 
+* PR #4166: Add tests for PR #4149 +* PR #4157: Support for np.flatnonzero +* PR #4149: Implement dominance frontier for SSA for the Numba IR +* PR #4148: Call branch pruning in inline_closure_call() +* PR #4132: Reduce usage of inttoptr +* PR #4129: Support contains for range +* PR #4112: better error messages for np.transpose and tuples +* PR #4110: Add range attrs, start, stop, step +* PR #4077: Add np select +* PR #4076: Add numpy windowing functions support (np.bartlett, np.hamming, + np.blackman, np.hanning, np.kaiser) +* PR #4095: Support ir.Global/FreeVar in find_const() +* PR #3691: Make TypingError abort compiling earlier +* PR #3646: Log internal errors encountered in typeinfer + +Fixes: + +* PR #4303: Work around scipy bug 10206 +* PR #4302: Fix flake8 issue on master +* PR #4301: Fix integer literal bug in np.select impl +* PR #4291: Fix pickling of jitclass type +* PR #4262: Resolves #4251 - Fix bug in reshape analysis. +* PR #4233: Fixes issue revealed by #4215 +* PR #4224: Fix #4223. Looplifting error due to StaticSetItem in objectmode +* PR #4222: Fix bad python path. +* PR #4178: Fix unary operator overload, check with unicode impl +* PR #4173: Fix return type in np.bincount with weights +* PR #4153: Fix slice shape assignment in array analysis +* PR #4152: fix status check in dict lookup +* PR #4145: Use callable instead of checking __module__ +* PR #4118: Fix inline assembly support on CPU. +* PR #4088: Resolves #4075 - parfors array_analysis bug. +* PR #4085: Resolves #3314 - parfors array_analysis bug with reshape. + +CUDA Enhancements/Fixes: + +* PR #4199: Extend `__cuda_array_interface__` with optional mask attribute, + bump version to 1 +* PR #4137: CUDA - Fix round Builtin +* PR #4114: Support 3rd party activated CUDA context + +Documentation Updates: + +* PR #4317: Add docs for ARMv8/AArch64 +* PR #4318: Add supported platforms to the docs. 
Closes #4316 +* PR #4295: Alter deprecation schedules +* PR #4253: fix typo in pysupported docs +* PR #4252: fix typo on repomap +* PR #4241: remove unused import +* PR #4240: fix typo in jitclass docs +* PR #4205: Update return value order in normalize_signature docstring +* PR #4237: Update doc links to point to latest not dev docs. +* PR #4197: hyperlink repomap +* PR #4170: Clarify docs on accumulating into arrays in prange +* PR #4147: fix docstring for DictType iterables +* PR #3951: A guide to overloading + +CI Updates: + +* PR #4300: AArch64 has no faulthandler package +* PR #4273: pin to MKL BLAS for testing to get consistent results +* PR #4209: Revert previous network tol patch and try with conda config +* PR #4138: Remove tbb before Azure test only on Python 3, since it was already + removed for Python 2 + +Contributors: + +* Ehsan Totoni (core dev) +* Gregory R. Lee +* Guilherme Leobas +* James Bourbeau +* Joshua L. Adelman +* Keith Kraus +* Lucio Fernandez-Arjona +* Nick White +* Rob Ennis +* Siu Kwan Lam (core dev) +* Stan Seibert (core dev) +* Stuart Archibald (core dev) +* Todd A. Anderson (core dev) +* Valentin Haenel (core dev) + + +Version 0.44.1 +-------------- + +This patch release addresses some regressions reported in the 0.44.0 release: + +- PR #4165: Fix #4164 issue with NUMBAPRO_NVVM. +- PR #4172: Abandon branch pruning if an arg name is redefined. (Fixes #4163) +- PR #4183: Fix #4156. Problem with defining in-loop variables. + + +Version 0.44.0 +-------------- + +IMPORTANT: In this release a few significant deprecations (and some less +significant ones) are being made, users are encouraged to read the related +documentation. + +General enhancements in this release include: + +- Numba is backed by LLVM 8 on all platforms apart from ppc64le, which, due to + bugs, remains on the LLVM 7.x series. +- Numba's dictionary support now includes type inference for keys and values. +- The .view() method now works for NumPy scalar types. 
+- Newly supported NumPy functions added: np.delete, np.nanquantile, np.quantile, + np.repeat, np.shape. + +In addition considerable effort has been made to fix some long standing bugs and +a large number of other bugs, the "Fixes" section is very large this time! + +Enhancements from user contributed PRs (with thanks!): + +- Max Bolingbroke added support for the selective use of ``fastmath`` flags in + #3847. +- Rob Ennis made min() and max() work on iterables in #3820 and added + np.quantile and np.nanquantile in #3899. +- Sergey Shalnov added numerous unicode string related features, zfill in #3978, + ljust in #4001, rjust and center in #4044 and strip, lstrip and rstrip in + #4048. +- Guilherme Leobas added support for np.delete in #3890 +- Christoph Deil exposed the Numba CLI via ``python -m numba`` in #4066 and made + numerous documentation fixes. +- Leo Schwarz wrote the bulk of the code for jitclass default constructor + arguments in #3852. +- Nick White enhanced the CUDA backend to use min/max PTX instructions where + possible in #4054. +- Lucio Fernandez-Arjona implemented the unicode string ``__mul__`` function in + #3952. +- Dimitri Vorona wrote the bulk of the code to implement getitem and setitem for + jitclass in #3861. + +General Enhancements: + +* PR #3820: Min max on iterables +* PR #3842: Unicode type iteration +* PR #3847: Allow fine-grained control of fastmath flags to partially address #2923 +* PR #3852: Continuation of PR #2894 +* PR #3861: Continuation of PR #3730 +* PR #3890: Add support for np.delete +* PR #3899: Support for np.quantile and np.nanquantile +* PR #3900: Fix 3457 :: Implements np.repeat +* PR #3928: Add .view() method for NumPy scalars +* PR #3939: Update icc_rt clone recipe. 
+* PR #3952: __mul__ for strings, initial implementation and tests +* PR #3956: Type-inferred dictionary +* PR #3959: Create a view for string slicing to avoid extra allocations +* PR #3978: zfill operation implementation +* PR #4001: ljust operation implementation +* PR #4010: Support `dict()` and `{}` +* PR #4022: Support for llvm 8 +* PR #4034: Make type.Optional str more representative +* PR #4041: Deprecation warnings +* PR #4044: rjust and center operations implementation +* PR #4048: strip, lstrip and rstrip operations implementation +* PR #4066: Expose numba CLI via python -m numba +* PR #4081: Impl `np.shape` and support function for `asarray`. +* PR #4091: Deprecate the use of iternext_impl without RefType + +CUDA Enhancements/Fixes: + +* PR #3933: Adds `.nbytes` property to CUDA device array objects. +* PR #4011: Add .inspect_ptx() to cuda device function +* PR #4054: CUDA: Use min/max PTX Instructions +* PR #4096: Update env-vars for CUDA libraries lookup + +Documentation Updates: + +* PR #3867: Code repository map +* PR #3918: adding Joris' Fosdem 2019 presentation +* PR #3926: order talks on applications of Numba by date +* PR #3943: fix two small typos in vectorize docs +* PR #3944: Fixup jitclass docs +* PR #3990: mention preprint repo in FAQ. Fixes #3981 +* PR #4012: Correct runtests command in contributing.rst +* PR #4043: fix typo +* PR #4047: Ambiguous Documentation fix for guvectorize. +* PR #4060: Remove remaining mentions of autojit in docs +* PR #4063: Fix annotate example in docstring +* PR #4065: Add FAQ entry explaining Numba project name +* PR #4079: Add Documentation for atomicity of typed.Dict +* PR #4105: Remove info about CUDA ENVVAR potential replacement + +Fixes: + +* PR #3719: Resolves issue #3528. Adds support for slices when not using parallel=True. +* PR #3727: Remove dels for known dead vars. +* PR #3845: Fix mutable flag transmission in .astype +* PR #3853: Fix some minor issues in the C source. 
+* PR #3862: Correct boolean reinterpretation of data +* PR #3863: Comments out the appveyor badge +* PR #3869: fixes flake8 after merge +* PR #3871: Add assert to ir.py to help enforce correct structuring +* PR #3881: fix preparfor dtype transform for datetime64 +* PR #3884: Prevent mutation of objmode fallback IR. +* PR #3885: Updates for llvmlite 0.29 +* PR #3886: Use `safe_load` from pyyaml. +* PR #3887: Add tolerance to network errors by permitting conda to retry +* PR #3893: Fix casting in namedtuple ctor. +* PR #3894: Fix array inliner for multiple array definition. +* PR #3905: Cherrypick #3903 to main +* PR #3920: Raise better error if unsupported jump opcode found. +* PR #3927: Apply flake8 to the numpy related files +* PR #3935: Silence DeprecationWarning +* PR #3938: Better error message for unknown opcode +* PR #3941: Fix typing of ufuncs in parfor conversion +* PR #3946: Return variable renaming dict from inline_closurecall +* PR #3962: Fix bug in alignment computation of `Record.make_c_struct` +* PR #3967: Fix error with pickling unicode +* PR #3964: Unicode split algo versioning +* PR #3975: Add handler for unknown locale to numba -s +* PR #3991: Permit Optionals in ufunc machinery +* PR #3995: Remove assert in type inference causing poor error message. +* PR #3996: add is_ascii flag to UnicodeType +* PR #4009: Prevent zero division error in np.linalg.cond +* PR #4014: Resolves #4007. +* PR #4021: Add a more specific error message for invalid write to a global. +* PR #4023: Fix handling of titles in record dtype +* PR #4024: Do a check if a call is const before saying that an object is multiply defined. +* PR #4027: Fix issue #4020. Turn off no_cpython_wrapper flag when compiling for… +* PR #4033: [WIP] Fixing wrong dtype of array inside reflected list #4028 +* PR #4061: Change IPython cache dir name to numba_cache +* PR #4067: Delete examples/notebooks/LinearRegr.py +* PR #4070: Catch writes to global typed.Dict and raise. 
+* PR #4078: Check tuple length +* PR #4084: Fix missing incref on optional return None +* PR #4089: Make the warnings fixer flush work for warning comparing on type. +* PR #4094: Fix function definition finding logic for commented def +* PR #4100: Fix alignment check on 32-bit. +* PR #4104: Use PEP 508 compliant env markers for install deps + +Contributors: + +* Benjamin Zaitlen +* Christoph Deil +* David Hirschfeld +* Dimitri Vorona +* Ehsan Totoni (core dev) +* Guilherme Leobas +* Leo Schwarz +* Lucio Fernandez-Arjona +* Max Bolingbroke +* NanduTej +* Nick White +* Ravi Teja Gutta +* Rob Ennis +* Sergey Shalnov +* Siu Kwan Lam (core dev) +* Stan Seibert (core dev) +* Stuart Archibald (core dev) +* Todd A. Anderson (core dev) +* Valentin Haenel (core dev) + + +Version 0.43.1 +-------------- + +This is a bugfix release that provides minor changes to fix: a bug in branch +pruning, bugs in `np.interp` functionality, and also fully accommodate the +NumPy 1.16 release series. + +* PR #3826: NumPy 1.16 support +* PR #3850: Refactor np.interp +* PR #3883: Rewrite pruned conditionals as their evaluated constants. + +Contributors: + +* Rob Ennis +* Siu Kwan Lam (core dev) +* Stuart Archibald (core dev) + + +Version 0.43.0 +-------------- + +In this release, the major new features are: + +- Initial support for statically typed dictionaries +- Improvements to `hash()` to match Python 3 behavior +- Support for the heapq module +- Ability to pass C structs to Numba +- More NumPy functions: asarray, trapz, roll, ptp, extract + + +NOTE: + +The vast majority of NumPy 1.16 behaviour is supported, however +``datetime`` and ``timedelta`` use involving ``NaT`` matches the behaviour +present in earlier releases. The ufunc suite has not been extended to +accommodate the two new time computation related additions present in NumPy +1.16.
In addition the functions ``ediff1d`` and ``interp`` have known minor +issues in replicating outputs exactly when ``NaN``'s occur in certain input +patterns. + +General Enhancements: + +* PR #3563: Support for np.roll +* PR #3572: Support for np.ptp +* PR #3592: Add dead branch prune before type inference. +* PR #3598: Implement np.asarray() +* PR #3604: Support for np.interp +* PR #3607: Some simplication to lowering +* PR #3612: Exact match flag in dispatcher +* PR #3627: Support for np.trapz +* PR #3630: np.where with broadcasting +* PR #3633: Support for np.extract +* PR #3657: np.max, np.min, np.nanmax, np.nanmin - support for complex dtypes +* PR #3661: Access C Struct as Numpy Structured Array +* PR #3678: Support for str.split and str.join +* PR #3684: Support C array in C struct +* PR #3696: Add intrinsic to help debug refcount +* PR #3703: Implementations of type hashing. +* PR #3715: Port CPython3.7 dictionary for numba internal use +* PR #3716: Support inplace concat of strings +* PR #3718: Add location to ConstantInferenceError exceptions. +* PR #3720: improve error msg about invalid signature +* PR #3731: Support for heapq +* PR #3754: Updates for llvmlite 0.28 +* PR #3760: Overloadable operator.setitem +* PR #3775: Support overloading operator.delitem +* PR #3777: Implement compiler support for dictionary +* PR #3791: Implement interpreter-side interface for numba dict +* PR #3799: Support refcount'ed types in numba dict + +CUDA Enhancements/Fixes: + +* PR #3713: Fix the NvvmSupportError message when CC too low +* PR #3722: Fix #3705: slicing error with negative strides +* PR #3755: Make cuda.to_device accept readonly host array +* PR #3773: Adapt library search to accommodate multiple locations + +Documentation Updates: + +* PR #3651: fix link to berryconda in docs +* PR #3668: Add Azure Pipelines build badge +* PR #3749: DOC: Clarify when prange is different from range +* PR #3771: fix a few typos +* PR #3785: Clarify use of range as function only. 
+* PR #3829: Add docs for typed-dict + +Fixes: + +* PR #3614: Resolve #3586 +* PR #3618: Skip gdb tests on ARM. +* PR #3643: Remove support_literals usage +* PR #3645: Enforce and fix that AbstractTemplate.generic must be returning a Signature +* PR #3648: Fail on @overload signature mismatch. +* PR #3660: Added Ignore message to test numba.tests.test_lists.TestLists.test_mul_error +* PR #3662: Replace six with numba.six +* PR #3663: Removes coverage computation from travisci builds +* PR #3672: Avoid leaking memory when iterating over uniform tuple +* PR #3676: Fixes constant string lowering inside tuples +* PR #3677: Ensure all referenced compiled functions are linked properly +* PR #3692: Fix test failure due to overly strict test on floating point values. +* PR #3693: Intercept failed import to help users. +* PR #3694: Fix memory leak in enumerate iterator +* PR #3695: Convert return of None from intrinsic implementation to dummy value +* PR #3697: Fix for issue #3687 +* PR #3701: Fix array.T analysis (fixes #3700) +* PR #3704: Fixes for overload_method +* PR #3706: Don't push call vars recursively into nested parfors. Resolves #3686. +* PR #3710: Set as non-hoistable if a mutable variable is passed to a function in a loop. Resolves #3699. +* PR #3712: parallel=True to use better builtin mechanism to resolve call types. Resolves issue #3671 +* PR #3725: Fix invalid removal of dead empty list +* PR #3740: add uintp as a valid type to the tuple operator.getitem +* PR #3758: Fix target definition update in inlining +* PR #3782: Raise typing error on yield optional. +* PR #3792: Fix non-module object used as the module of a function. +* PR #3800: Bugfix for np.interp +* PR #3808: Bump macro to include VS2014 to fix py3.5 build +* PR #3809: Add debug guard to debug only C function. +* PR #3816: Fix array.sum(axis) 1d input return type. +* PR #3821: Replace PySys_WriteStdout with PySys_FormatStdout to ensure no truncation. 
+* PR #3830: Getitem should not return optional type +* PR #3832: Handle single string as path in find_file() + +Contributors: + +* Ehsan Totoni +* Gryllos Prokopis +* Jonathan J. Helmus +* Kayla Ngan +* lalitparate +* luk-f-a +* Matyt +* Max Bolingbroke +* Michael Seifert +* Rob Ennis +* Siu Kwan Lam +* Stan Seibert +* Stuart Archibald +* Todd A. Anderson +* Tao He +* Valentin Haenel + + +Version 0.42.1 +-------------- + +Bugfix release to fix the incorrect hash in OSX wheel packages. +No change in source code. + + +Version 0.42.0 +-------------- + +In this release the major features are: + +- The capability to launch and attach the GDB debugger from within a jitted + function. +- The upgrading of LLVM to version 7.0.0. + +We added a draft of the project roadmap to the developer manual. The roadmap is +for informational purposes only as priorities and resources may change. + +Here are some enhancements from contributed PRs: + +- #3532. Daniel Wennberg improved the ``cuda.{pinned, mapped}`` API so that + the associated memory is released immediately at the exit of the context + manager. +- #3531. Dimitri Vorona enabled the inlining of jitclass methods. +- #3516. Simon Perkins added the support for passing numpy dtypes (i.e. + ``np.dtype("int32")``) and their type constructor (i.e. ``np.int32``) into + a jitted function. +- #3509. Rob Ennis added support for ``np.corrcoef``. + +A regression issue (#3554, #3461) relating to making an empty slice in parallel +mode is resolved by #3558. + +General Enhancements: + +* PR #3392: Launch and attach gdb directly from Numba. +* PR #3437: Changes to accommodate LLVM 7.0.x +* PR #3509: Support for np.corrcoef +* PR #3516: Typeof dtype values +* PR #3520: Fix @stencil ignoring cval if out kwarg supplied. 
+* PR #3531: Fix jitclass method inlining and avoid unnecessary increfs +* PR #3538: Avoid future C-level assertion error due to invalid visibility +* PR #3543: Avoid implementation error being hidden by the try-except +* PR #3544: Add `long_running` test flag and feature to exclude tests. +* PR #3549: ParallelAccelerator caching improvements +* PR #3558: Fixes array analysis for inplace binary operators. +* PR #3566: Skip alignment tests on armv7l. +* PR #3567: Fix unifying literal types in namedtuple +* PR #3576: Add special copy routine for NumPy out arrays +* PR #3577: Fix example and docs typos for `objmode` context manager. +* PR #3580: Use alias information when determining whether it is safe to + reorder statements. +* PR #3583: Use `ir.unknown_loc` for unknown `Loc`, as #3390 with tests +* PR #3587: Fix llvm.memset usage changes in llvm7 +* PR #3596: Fix Array Analysis for Global Namedtuples +* PR #3597: Warn users if threading backend init unsafe. +* PR #3605: Add guard for writing to read only arrays from ufunc calls +* PR #3606: Improve the accuracy of error message wording for undefined type. +* PR #3611: gdb test guard needs to ack ptrace permissions +* PR #3616: Skip gdb tests on ARM. + +CUDA Enhancements: + +* PR #3532: Unregister temporarily pinned host arrays at once +* PR #3552: Handle broadcast arrays correctly in host->device transfer. +* PR #3578: Align cuda and cuda simulator kwarg names.
+ +Documentation Updates: + +* PR #3545: Fix @njit description in 5 min guide +* PR #3570: Minor documentation fixes for numba.cuda +* PR #3581: Fixing minor typo in `reference/types.rst` +* PR #3594: Changing `@stencil` docs to correctly reflect `func_or_mode` param +* PR #3617: Draft roadmap as of Dec 2018 + +Contributors: + +* Aaron Critchley +* Daniel Wennberg +* Dimitri Vorona +* Dominik Stańczak +* Ehsan Totoni (core dev) +* Iskander Sharipov +* Rob Ennis +* Simon Muller +* Simon Perkins +* Siu Kwan Lam (core dev) +* Stan Seibert (core dev) +* Stuart Archibald (core dev) +* Todd A. Anderson (core dev) + + +Version 0.41.0 +-------------- + +This release adds the following major features: + +* Diagnostics showing the optimizations done by ParallelAccelerator +* Support for profiling Numba-compiled functions in Intel VTune +* Additional NumPy functions: partition, nancumsum, nancumprod, ediff1d, cov, + conj, conjugate, tri, tril, triu +* Initial support for Python 3 Unicode strings + +General Enhancements: + +* PR #1968: armv7 support +* PR #2983: invert mapping b/w binop operators and the operator module #2297 +* PR #3160: First attempt at parallel diagnostics +* PR #3307: Adding NUMBA_ENABLE_PROFILING envvar, enabling jit event +* PR #3320: Support for np.partition +* PR #3324: Support for np.nancumsum and np.nancumprod +* PR #3325: Add location information to exceptions. +* PR #3337: Support for np.ediff1d +* PR #3345: Support for np.cov +* PR #3348: Support user pipeline class in with lifting +* PR #3363: string support +* PR #3373: Improve error message for empty imprecise lists. +* PR #3375: Enable overload(operator.getitem) +* PR #3402: Support negative indexing in tuple. +* PR #3414: Refactor Const type +* PR #3416: Optimized usage of alloca out of the loop +* PR #3424: Updates for llvmlite 0.26 +* PR #3462: Add support for `np.conj/np.conjugate`. 
+* PR #3480: np.tri, np.tril, np.triu - default optional args +* PR #3481: Permit dtype argument as sole kwarg in np.eye + +CUDA Enhancements: + +* PR #3399: Add max_registers Option to cuda.jit + +Continuous Integration / Testing: + +* PR #3303: CI with Azure Pipelines +* PR #3309: Workaround race condition with apt +* PR #3371: Fix issues with Azure Pipelines +* PR #3362: Fix #3360: `RuntimeWarning: 'numba.runtests' found in sys.modules` +* PR #3374: Disable openmp in wheel building +* PR #3404: Azure Pipelines templates +* PR #3419: Fix cuda tests and error reporting in test discovery +* PR #3491: Prevent faulthandler installation on armv7l +* PR #3493: Fix CUDA test that used negative indexing behaviour that's fixed. +* PR #3495: Start Flake8 checking of Numba source + +Fixes: + +* PR #2950: Fix dispatcher to only consider contiguous-ness. +* PR #3124: Fix 3119, raise for 0d arrays in reductions +* PR #3228: Reduce redundant module linking +* PR #3329: Fix AOT on windows. +* PR #3335: Fix memory management of __cuda_array_interface__ views. +* PR #3340: Fix typo in error name. +* PR #3365: Fix the default unboxing logic +* PR #3367: Allow non-global reference to objmode() context-manager +* PR #3381: Fix global reference in objmode for dynamically created function +* PR #3382: CUDA_ERROR_MISALIGNED_ADDRESS Using Multiple Const Arrays +* PR #3384: Correctly handle very old versions of colorama +* PR #3394: Add 32bit package guard for non-32bit installs +* PR #3397: Fix with-objmode warning +* PR #3403: Fix label offset in call inline after parfor pass +* PR #3429: Fixes raising of user defined exceptions for exec(). +* PR #3432: Fix error due to function naming in CI in py2.7 +* PR #3444: Fixed TBB's single thread execution and test added for #3440 +* PR #3449: Allow matching non-array objects in find_callname() +* PR #3455: Change getiter and iternext to not be pure. Resolves #3425 +* PR #3467: Make ir.UndefinedType singleton class.
+* PR #3478: Fix np.random.shuffle sideeffect +* PR #3487: Raise unsupported for kwargs given to `print()` +* PR #3488: Remove dead script. +* PR #3498: Fix stencil support for boolean as return type +* PR #3511: Fix handling make_function literals (regression of #3414) +* PR #3514: Add missing unicode != unicode +* PR #3527: Fix complex math sqrt implementation for large -ve values +* PR #3530: This adds arg an check for the pattern supplied to Parfors. +* PR #3536: Sets list dtor linkage to `linkonce_odr` to fix visibility in AOT + +Documentation Updates: + +* PR #3316: Update 0.40 changelog with additional PRs +* PR #3318: Tweak spacing to avoid search box wrapping onto second line +* PR #3321: Add note about memory leaks with exceptions to docs. Fixes #3263 +* PR #3322: Add FAQ on CUDA + fork issue. Fixes #3315. +* PR #3343: Update docs for argsort, kind kwarg partially supported. +* PR #3357: Added mention of njit in 5minguide.rst +* PR #3434: Fix parallel reduction example in docs. +* PR #3452: Fix broken link and mark up problem. +* PR #3484: Size Numba logo in docs in em units. Fixes #3313 +* PR #3502: just two typos +* PR #3506: Document string support +* PR #3513: Documentation for parallel diagnostics. +* PR #3526: Fix 5 min guide with respect to @njit decl + +Contributors: + +* Alex Ford +* Andreas Sodeur +* Anton Malakhov +* Daniel Stender +* Ehsan Totoni (core dev) +* Henry Schreiner +* Marcel Bargull +* Matt Cooper +* Nick White +* Nicolas Hug +* rjenc29 +* Siu Kwan Lam (core dev) +* Stan Seibert (core dev) +* Stuart Archibald (core dev) +* Todd A. Anderson (core dev) + + +Version 0.40.1 +-------------- + +This is a PyPI-only patch release to ensure that PyPI wheels can enable the +TBB threading backend, and to disable the OpenMP backend in the wheels. +Limitations of manylinux1 and variation in user environments can cause +segfaults when OpenMP is enabled on wheel builds. 
Note that this release has +no functional changes for users who obtained Numba 0.40.0 via conda. + +Patches: + +* PR #3338: Accidentally left Anton off contributor list for 0.40.0 +* PR #3374: Disable OpenMP in wheel building +* PR #3376: Update 0.40.1 changelog and docs on OpenMP backend + +Version 0.40.0 +-------------- + +This release adds a number of major features: + +* A new GPU backend: kernels for AMD GPUs can now be compiled using the ROCm + driver on Linux. +* The thread pool implementation used by Numba for automatic multithreading + is configurable to use TBB, OpenMP, or the old "workqueue" implementation. + (TBB is likely to become the preferred default in a future release.) +* New documentation on thread and fork-safety with Numba, along with overall + improvements in thread-safety. +* Experimental support for executing a block of code inside a nopython mode + function in object mode. +* Parallel loops now allow arrays as reduction variables +* CUDA improvements: FMA, faster float64 atomics on supporting hardware, + records in const memory, and improved datetime dtype support +* More NumPy functions: vander, tri, triu, tril, fill_diagonal + + +General Enhancements: + +* PR #3017: Add facility to support with-contexts +* PR #3033: Add support for multidimensional CFFI arrays +* PR #3122: Add inliner to object mode pipeline +* PR #3127: Support for reductions on arrays. +* PR #3145: Support for np.fill_diagonal +* PR #3151: Keep a queue of references to last N deserialized functions. Fixes #3026 +* PR #3154: Support use of list() if typeable. +* PR #3166: Objmode with-block +* PR #3179: Updates for llvmlite 0.25 +* PR #3181: Support function extension in alias analysis +* PR #3189: Support literal constants in typing of object methods +* PR #3190: Support passing closures as literal values in typing +* PR #3199: Support inferring stencil index as constant in simple unary expressions +* PR #3202: Threading layer backend refactor/rewrite/reinvention!
+* PR #3209: Support for np.tri, np.tril and np.triu +* PR #3211: Handle unpacking in building tuple (BUILD_TUPLE_UNPACK opcode) +* PR #3212: Support for np.vander +* PR #3227: Add NumPy 1.15 support +* PR #3272: Add MemInfo_data to runtime._nrt_python.c_helpers +* PR #3273: Refactor. Removing thread-local-storage based context nesting. +* PR #3278: compiler threadsafety lockdown +* PR #3291: Add CPU count and CFS restrictions info to numba -s. + +CUDA Enhancements: + +* PR #3152: Use cuda driver api to get best blocksize for best occupancy +* PR #3165: Add FMA intrinsic support +* PR #3172: Use float64 add Atomics, Where Available +* PR #3186: Support Records in CUDA Const Memory +* PR #3191: CUDA: fix log size +* PR #3198: Fix GPU datetime timedelta types usage +* PR #3221: Support datetime/timedelta scalar argument to a CUDA kernel. +* PR #3259: Add DeviceNDArray.view method to reinterpret data as a different type. +* PR #3310: Fix IPC handling of sliced cuda array. + +ROCm Enhancements: + +* PR #3023: Support for AMDGCN/ROCm. +* PR #3108: Add ROC info to `numba -s` output. +* PR #3176: Move ROC vectorize init to npyufunc +* PR #3177: Add auto_synchronize support to ROC stream +* PR #3178: Update ROC target documentation. +* PR #3294: Add compiler lock to ROC compilation path. +* PR #3280: Add wavebits property to the HSA Agent. +* PR #3281: Fix ds_permute types and add tests + +Continuous Integration / Testing: + +* PR #3091: Remove old recipes, switch to test config based on env var. +* PR #3094: Add higher ULP tolerance for products in complex space. +* PR #3096: Set exit on error in incremental scripts +* PR #3109: Add skip to test needing jinja2 if no jinja2. +* PR #3125: Skip cudasim only tests +* PR #3126: add slack, drop flowdock +* PR #3147: Improve error message for arg type unsupported during typing. +* PR #3128: Fix recipe/build for jetson tx2/ARM +* PR #3167: In build script activate env before installing. +* PR #3180: Add skip to broken test. 
+* PR #3216: Fix libcuda.so loading in some container setup +* PR #3224: Switch to new Gitter notification webhook URL and encrypt it +* PR #3235: Add 32bit Travis CI jobs +* PR #3257: This adds scipy/ipython back into windows conda test phase. + +Fixes: + +* PR #3038: Fix random integer generation to match results from NumPy. +* PR #3045: Fix #3027 - Numba reassigns sys.stdout +* PR #3059: Handler for known LoweringErrors. +* PR #3060: Adjust attribute error for NumPy functions. +* PR #3067: Abort simulator threads on exception in thread block. +* PR #3079: Implement +/-(types.boolean) Fix #2624 +* PR #3080: Compute np.var and np.std correctly for complex types. +* PR #3088: Fix #3066 (array.dtype.type in prange) +* PR #3089: Fix invalid ParallelAccelerator hoisting issue. +* PR #3136: Fix #3135 (lowering error) +* PR #3137: Fix for issue3103 (race condition detection) +* PR #3142: Fix Issue #3139 (parfors reuse of reduction variable across prange blocks) +* PR #3148: Remove dead array equal @infer code +* PR #3153: Fix canonicalize_array_math typing for calls with kw args +* PR #3156: Fixes issue with missing pygments in testing and adds guards. +* PR #3168: Py37 bytes output fix. +* PR #3171: Fix #3146. Fix CFUNCTYPE void* return-type handling +* PR #3193: Fix setitem/getitem resolvers +* PR #3222: Fix #3214. Mishandling of POP_BLOCK in while True loop. +* PR #3230: Fixes liveness analysis issue in looplifting +* PR #3233: Fix return type difference for 32bit ctypes.c_void_p +* PR #3234: Fix types and layout for `np.where`. +* PR #3237: Fix DeprecationWarning about imp module +* PR #3241: Fix #3225. Normalize 0nd array to scalar in typing of indexing code. +* PR #3256: Fix #3251: Move imports of ABCs to collections.abc for Python >= 3.3 +* PR #3292: Fix issue3279. +* PR #3302: Fix error due to mismatching dtype + +Documentation Updates: + +* PR #3104: Workaround for #3098 (test_optional_unpack Heisenbug) +* PR #3132: Adds an ~5 minute guide to Numba. 
+* PR #3194: Fix docs RE: np.random generator fork/thread safety +* PR #3242: Page with Numba talks and tutorial links +* PR #3258: Allow users to choose the type of issue they are reporting. +* PR #3260: Fixed broken link +* PR #3266: Fix cuda pointer ownership problem with user/externally allocated pointer +* PR #3269: Tweak typography with CSS +* PR #3270: Update FAQ for functions passed as arguments +* PR #3274: Update installation instructions +* PR #3275: Note pyobject and voidptr are types in docs +* PR #3288: Do not need to call parallel optimizations "experimental" anymore +* PR #3318: Tweak spacing to avoid search box wrapping onto second line + +Contributors: + +* Anton Malakhov +* Alex Ford +* Anthony Bisulco +* Ehsan Totoni (core dev) +* Leonard Lausen +* Matthew Petroff +* Nick White +* Ray Donnelly +* rjenc29 +* Siu Kwan Lam (core dev) +* Stan Seibert (core dev) +* Stuart Archibald (core dev) +* Stuart Reynolds +* Todd A. Anderson (core dev) + + +Version 0.39.0 +-------------- + +Here are the highlights for the Numba 0.39.0 release. + +* This is the first version that supports Python 3.7. +* With help from Intel, we have fixed the issues with SVML support (related + issues #2938, #2998, #3006). +* List has gained support for containing reference-counted types like NumPy + arrays and `list`. Note, list still cannot hold heterogeneous types. +* We have made a significant change to the internal calling-convention, + which should be transparent to most users, to allow for a future feature that + will permit jumping back into python-mode from a nopython-mode function. + This also fixes a limitation to `print` that disabled its use from nopython + functions that were deep in the call-stack. +* For CUDA GPU support, we added a `__cuda_array_interface__` following the + NumPy array interface specification to allow Numba to consume externally + defined device arrays.
We have opened a corresponding pull request to CuPy to + test out the concept and be able to use a CuPy GPU array. +* The Numba dispatcher `inspect_types()` method now supports the kwarg `pretty` + which if set to `True` will produce ANSI/HTML output, showing the annotated + types, when invoked from ipython/jupyter-notebook respectively. +* The NumPy functions `ndarray.dot`, `np.percentile` and `np.nanpercentile`, and + `np.unique` are now supported. +* Numba now supports the use of a per-project configuration file to permanently + set behaviours typically set via `NUMBA_*` family environment variables. +* Support for the `ppc64le` architecture has been added. + +Enhancements: + +* PR #2793: Simplify and remove javascript from html_annotate templates. +* PR #2840: Support list of refcounted types +* PR #2902: Support for np.unique +* PR #2926: Enable fence for all architecture and add developer notes +* PR #2928: Making error about untyped list more informative. +* PR #2930: Add configuration file and color schemes. +* PR #2932: Fix encoding to 'UTF-8' in `check_output` decode. +* PR #2938: Python 3.7 compat: _Py_Finalizing becomes _Py_IsFinalizing() +* PR #2939: Comprehensive SVML unit test +* PR #2946: Add support for `ndarray.dot` method and tests. +* PR #2953: percentile and nanpercentile +* PR #2957: Add new 3.7 opcode support. +* PR #2963: Improve alias analysis to be more comprehensive +* PR #2984: Support for namedtuples in array analysis +* PR #2986: Fix environment propagation +* PR #2990: Improve function call matching for intrinsics +* PR #3002: Second pass at error rewrites (interpreter errors). +* PR #3004: Add numpy.empty to the list of pure functions. +* PR #3008: Augment SVML detection with llvmlite SVML patch detection. +* PR #3012: Make use of the common spelling of heterogeneous/homogeneous. +* PR #3032: Fix pycc ctypes test due to mismatch in calling-convention +* PR #3039: Add SVML detection to Numba environment diagnostic tool. 
+* PR #3041: This adds @needs_blas to tests that use BLAS +* PR #3056: Require llvmlite>=0.24.0 + +CUDA Enhancements: + +* PR #2860: __cuda_array_interface__ +* PR #2910: More CUDA intrinsics +* PR #2929: Add Flag To Prevent Unneccessary D->H Copies +* PR #3037: Add CUDA IPC support on non-peer-accessible devices + +CI Enhancements: + +* PR #3021: Update appveyor config. +* PR #3040: Add fault handler to all builds +* PR #3042: Add catchsegv +* PR #3077: Adds optional number of processes for `-m` in testing + +Fixes: + +* PR #2897: Fix line position of delete statement in numba ir +* PR #2905: Fix for #2862 +* PR #3009: Fix optional type returning in recursive call +* PR #3019: workaround and unittest for issue #3016 +* PR #3035: [TESTING] Attempt delayed removal of Env +* PR #3048: [WIP] Fix cuda tests failure on buildfarm +* PR #3054: Make test work on 32-bit +* PR #3062: Fix cuda.In freeing devary before the kernel launch +* PR #3073: Workaround #3072 +* PR #3076: Avoid ignored exception due to missing globals at interpreter teardown + +Documentation Updates: + +* PR #2966: Fix syntax in env var docs. +* PR #2967: Fix typo in CUDA kernel layout example. +* PR #2970: Fix docstring copy paste error. + +Contributors: + +The following people contributed to this release. + +* Anton Malakhov +* Ehsan Totoni (core dev) +* Julia Tatz +* Matthias Bussonnier +* Nick White +* Ray Donnelly +* Siu Kwan Lam (core dev) +* Stan Seibert (core dev) +* Stuart Archibald (core dev) +* Todd A. Anderson (core dev) +* Rik-de-Kort +* rjenc29 + + +Version 0.38.1 +-------------- + +This is a critical bug fix release addressing: +https://github.com/numba/numba/issues/3006 + +The bug does not impact users using conda packages from Anaconda or Intel Python +Distribution (but it does impact conda-forge). It does not impact users of pip +using wheels from PyPI. + +This only impacts a small number of users where: + + * The ICC runtime (specifically libsvml) is present in the user's environment. 
+ * The user is using an llvmlite statically linked against a version of LLVM + that has not been patched with SVML support. + * The platform is 64-bit. + +The release fixes a code generation path that could lead to the production of +incorrect results under the above situation. + +Fixes: + +* PR #3007: Augment SVML detection with llvmlite SVML patch detection. + +Contributors: + +The following people contributed to this release. + +* Stuart Archibald (core dev) + + +Version 0.38.0 +-------------- + +Following on from the bug fix focus of the last release, this release swings +back towards the addition of new features and usability improvements based on +community feedback. This release is comparatively large! Three key features/ +changes to note are: + + * Numba (via llvmlite) is now backed by LLVM 6.0, general vectorization is + improved as a result. A significant long standing LLVM bug that was causing + corruption was also found and fixed. + * Further considerable improvements in vectorization are made available as + Numba now supports Intel's short vector math library (SVML). + Try it out with `conda install -c numba icc_rt`. + * CUDA 8.0 is now the minimum supported CUDA version. + +Other highlights include: + + * Bug fixes to `parallel=True` have enabled more vectorization opportunities + when using the ParallelAccelerator technology. + * Much effort has gone into improving error reporting and the general usability + of Numba. This includes highlighted error messages and performance tips + documentation. Try it out with `conda install colorama`. + * A number of new NumPy functions are supported, `np.convolve`, `np.correlate`, + `np.reshape`, `np.transpose`, `np.permutation`, `np.real`, `np.imag`, and + `np.searchsorted` now supports the `side` kwarg. Further, `np.argsort` now + supports the `kind` kwarg with `quicksort` and `mergesort` available.
+ * The Numba extension API has gained the ability to operate more easily with + functions from Cython modules through the use of + `numba.extending.get_cython_function_address` to obtain function addresses + for direct use in `ctypes.CFUNCTYPE`. + * Numba now allows the passing of jitted functions (and containers of jitted + functions) as arguments to other jitted functions. + * The CUDA functionality has gained support for a larger selection of bit + manipulation intrinsics, also SELP, and has had a number of bugs fixed. + * Initial work to support the PPC64LE platform has been added, full support is + however waiting on the LLVM 6.0.1 release as it contains critical patches + not present in 6.0.0. + It is hoped that any remaining issues will be fixed in the next release. + * The capacity for advanced users/compiler engineers to define their own + compilation pipelines. + +Enhancements: + +* PR #2660: Support bools from cffi in nopython. +* PR #2741: Enhance error message for undefined variables. +* PR #2744: Add diagnostic error message to test suite discovery failure. +* PR #2748: Added Intel SVML optimizations as opt-out choice working by default +* PR #2762: Support transpose with axes arguments. +* PR #2777: Add support for np.correlate and np.convolve +* PR #2779: Implement np.random.permutation +* PR #2801: Passing jitted functions as args +* PR #2802: Support np.real() and np.imag() +* PR #2807: Expose `import_cython_function` +* PR #2821: Add kwarg 'side' to np.searchsorted +* PR #2822: Adds stable argsort +* PR #2832: Fixups for llvmlite 0.23/llvm 6 +* PR #2836: Support `index` method on tuples +* PR #2839: Support for np.transpose and np.reshape. +* PR #2843: Custom pipeline +* PR #2847: Replace signed array access indices in unsiged prange loop body +* PR #2859: Add support for improved error reporting. +* PR #2880: This adds a github issue template. +* PR #2881: Build recipe to clone Intel ICC runtime.
+* PR #2882: Update TravisCI to test SVML +* PR #2893: Add reference to the data buffer in array.ctypes object +* PR #2895: Move to CUDA 8.0 + +Fixes: + +* PR #2737: Fix #2007 (part 1). Empty array handling in np.linalg. +* PR #2738: Fix install_requires to allow pip getting pre-release version +* PR #2740: Fix 2208. Generate better error message. +* PR #2765: Fix Bit-ness +* PR #2780: PowerPC reference counting memory fences +* PR #2805: Fix six imports. +* PR #2813: Fix #2812: gufunc scalar output bug. +* PR #2814: Fix the build post #2727 +* PR #2831: Attempt to fix #2473 +* PR #2842: Fix issue with test discovery and broken CUDA drivers. +* PR #2850: Add rtsys init guard and test. +* PR #2852: Skip vectorization test with targets that are not x86 +* PR #2856: Prevent printing to stdout in `test_extending.py` +* PR #2864: Correct C code to prevent compiler warnings. +* PR #2889: Attempt to fix #2386. +* PR #2891: Removed test skipping for inspect_cfg +* PR #2898: Add guard to parallel test on unsupported platforms +* PR #2907: Update change log for PPC64LE LLVM dependency. +* PR #2911: Move build requirement to llvmlite>=0.23.0dev0 +* PR #2912: Fix random permutation test. +* PR #2914: Fix MD list syntax in issue template. + +Documentation Updates: + +* PR #2739: Explicitly state default value of error_model in docstring +* PR #2803: DOC: parallel vectorize requires signatures +* PR #2829: Add Python 2.7 EOL plan to docs +* PR #2838: Use automatic numbering syntax in list. +* PR #2877: Add performance tips documentation. +* PR #2883: Fix #2872: update rng doc about thread/fork-safety +* PR #2908: Add missing link and ref to docs. +* PR #2909: Tiny typo correction + +ParallelAccelerator enhancements/fixes: + +* PR #2727: Changes to enable vectorization in ParallelAccelerator. 
+* PR #2816: Array analysis for transpose with arbitrary arguments +* PR #2874: Fix dead code eliminator not to remove a call with side-effect +* PR #2886: Fix ParallelAccelerator arrayexpr repr + +CUDA enhancements: + +* PR #2734: More Constants From cuda.h +* PR #2767: Add len(..) Support to DeviceNDArray +* PR #2778: Add More Device Array API Functions to CUDA Simulator +* PR #2824: Add CUDA Primitives for Population Count +* PR #2835: Emit selp Instructions to Avoid Branching +* PR #2867: Full support for CUDA device attributes + +CUDA fixes: +* PR #2768: Don't Compile Code on Every Assignment +* PR #2878: Fixes a Win64 issue with the test in Pr/2865 + +Contributors: + +The following people contributed to this release. + +* Abutalib Aghayev +* Alex Olivas +* Anton Malakhov +* Dong-hee Na +* Ehsan Totoni (core dev) +* John Zwinck +* Josh Wilson +* Kelsey Jordahl +* Nick White +* Olexa Bilaniuk +* Rik-de-Kort +* Siu Kwan Lam (core dev) +* Stan Seibert (core dev) +* Stuart Archibald (core dev) +* Thomas Arildsen +* Todd A. Anderson (core dev) + + +Version 0.37.0 +-------------- + +This release focuses on bug fixing and stability but also adds a few new +features including support for Numpy 1.14. The key change for Numba core was the +long awaited addition of the final tranche of thread safety improvements that +allow Numba to be run concurrently on multiple threads without hitting known +thread safety issues inside LLVM itself. Further, a number of fixes and +enhancements went into the CUDA implementation and ParallelAccelerator gained +some new features and underwent some internal refactoring. 
+ +Misc enhancements: + +* PR #2627: Remove hacks to make llvmlite threadsafe +* PR #2672: Add ascontiguousarray +* PR #2678: Add Gitter badge +* PR #2691: Fix #2690: add intrinsic to convert array to tuple +* PR #2703: Test runner feature: failed-first and last-failed +* PR #2708: Patch for issue #1907 +* PR #2732: Add support for array.fill + +Misc Fixes: + +* PR #2610: Fix #2606 lowering of optional.setattr +* PR #2650: Remove skip for win32 cosine test +* PR #2668: Fix empty_like from readonly arrays. +* PR #2682: Fixes 2210, remove _DisableJitWrapper +* PR #2684: Fix #2340, generator error yielding bool +* PR #2693: Add travis-ci testing of NumPy 1.14, and also check on Python 2.7 +* PR #2694: Avoid type inference failure due to a typing template rejection +* PR #2695: Update llvmlite version dependency. +* PR #2696: Fix tuple indexing codegeneration for empty tuple +* PR #2698: Fix #2697 by deferring deletion in the simplify_CFG loop. +* PR #2701: Small fix to avoid tempfiles being created in the current directory +* PR #2725: Fix 2481, LLVM IR parsing error due to mutated IR +* PR #2726: Fix #2673: incorrect fork error msg. +* PR #2728: Alternative to #2620. Remove dead code ByteCodeInst.get. +* PR #2730: Add guard for test needing SciPy/BLAS + +Documentation updates: + +* PR #2670: Update communication channels +* PR #2671: Add docs about diagnosing loop vectorizer +* PR #2683: Add docs on const arg requirements and on const mem alloc +* PR #2722: Add docs on numpy support in cuda +* PR #2724: Update doc: warning about unsupported arguments + +ParallelAccelerator enhancements/fixes: + +Parallel support for `np.arange` and `np.linspace`, also `np.mean`, `np.std` +and `np.var` are added. This was performed as part of a general refactor and +cleanup of the core ParallelAccelerator code. 
+ +* PR #2674: Core pa +* PR #2704: Generate Dels after parfor sequential lowering +* PR #2716: Handle matching directly supported functions + +CUDA enhancements: + +* PR #2665: CUDA DeviceNDArray: Support numpy tranpose API +* PR #2681: Allow Assigning to DeviceNDArrays +* PR #2702: Make DummyArray do High Dimensional Reshapes +* PR #2714: Use CFFI to Reuse Code + +CUDA fixes: + +* PR #2667: Fix CUDA DeviceNDArray slicing +* PR #2686: Fix #2663: incorrect offset when indexing cuda array. +* PR #2687: Ensure Constructed Stream Bound +* PR #2706: Workaround for unexpected warp divergence due to exception raising + code +* PR #2707: Fix regression: cuda test submodules not loading properly in + runtests +* PR #2731: Use more challenging values in slice tests. +* PR #2720: A quick testsuite fix to not run the new cuda testcase in the + multiprocess pool + +Contributors: + +The following people contributed to this release. + +* Coutinho Menezes Nilo +* Daniel +* Ehsan Totoni +* Nick White +* Paul H. Liu +* Siu Kwan Lam +* Stan Seibert +* Stuart Archibald +* Todd A. Anderson + + +Version 0.36.2 +-------------- + +This is a bugfix release that provides minor changes to address: + +* PR #2645: Avoid CPython bug with ``exec`` in older 2.7.x. +* PR #2652: Add support for CUDA 9. + + +Version 0.36.1 +-------------- + +This release continues to add new features to the work undertaken in partnership +with Intel on ParallelAccelerator technology. Other changes of note include the +compilation chain being updated to use LLVM 5.0 and the production of conda +packages using conda-build 3 and the new compilers that ship with it. + +NOTE: A version 0.36.0 was tagged for internal use but not released. + +ParallelAccelerator: + +NOTE: The ParallelAccelerator technology is under active development and should +be considered experimental. 
+ +New features relating to ParallelAccelerator, from work undertaken with Intel, +include the addition of the `@stencil` decorator for ease of implementation of +stencil-like computations, support for general reductions, and slice and +range fusion for parallel slice/bit-array assignments. Documentation on both the +use and implementation of the above has been added. Further, a new debug +environment variable `NUMBA_DEBUG_ARRAY_OPT_STATS` is made available to give +information about which operators/calls are converted to parallel for-loops. + +ParallelAccelerator features: + +* PR #2457: Stencil Computations in ParallelAccelerator +* PR #2548: Slice and range fusion, parallelizing bitarray and slice assignment +* PR #2516: Support general reductions in ParallelAccelerator + +ParallelAccelerator fixes: + +* PR #2540: Fix bug #2537 +* PR #2566: Fix issue #2564. +* PR #2599: Fix nested multi-dimensional parfor type inference issue +* PR #2604: Fixes for stencil tests and cmath sin(). +* PR #2605: Fixes issue #2603. + +Additional features of note: + +This release of Numba (and llvmlite) is updated to use LLVM version 5.0 as the +compiler back end, the main change to Numba to support this was the addition of +a custom symbol tracker to avoid the calls to LLVM's `ExecutionEngine` that was +crashing when asking for non-existent symbol addresses. Further, the conda +packages for this release of Numba are built using conda build version 3 and the +new compilers/recipe grammar that are present in that release. + +* PR #2568: Update for LLVM 5 +* PR #2607: Fixes abort when getting address to "nrt_unresolved_abort" +* PR #2615: Working towards conda build 3 + +Thanks to community feedback and bug reports, the following fixes were also +made. + +Misc fixes/enhancements: + +* PR #2534: Add tuple support to np.take. +* PR #2551: Rebranding fix +* PR #2552: relative doc links +* PR #2570: Fix issue #2561, handle missing successor on loop exit +* PR #2588: Fix #2555. 
Disable libpython.so linking on linux +* PR #2601: Update llvmlite version dependency. +* PR #2608: Fix potential cache file collision +* PR #2612: Fix NRT test failure due to increased overhead when running in coverage +* PR #2619: Fix dubious pthread_cond_signal not in lock +* PR #2622: Fix `np.nanmedian` for all NaN case. +* PR #2633: Fix markdown in CONTRIBUTING.md +* PR #2635: Make the dependency on compilers for AOT optional. + +CUDA support fixes: + +* PR #2523: Fix invalid cuda context in memory transfer calls in another thread +* PR #2575: Use CPU to initialize xoroshiro states for GPU RNG. Fixes #2573 +* PR #2581: Fix cuda gufunc mishandling of scalar arg as array and out argument + + +Version 0.35.0 +-------------- + +This release includes some exciting new features as part of the work +performed in partnership with Intel on ParallelAccelerator technology. +There are also some additions made to Numpy support and small but +significant fixes made as a result of considerable effort spent chasing bugs +and implementing stability improvements. + + +ParallelAccelerator: + +NOTE: The ParallelAccelerator technology is under active development and should +be considered experimental. + +New features relating to ParallelAccelerator, from work undertaken with Intel, +include support for a larger range of `np.random` functions in `parallel` +mode, printing Numpy arrays in no Python mode, the capacity to initialize Numpy +arrays directly from list comprehensions, and the axis argument to `.sum()`. +Documentation on the ParallelAccelerator technology implementation has also +been added. Further, a large amount of work on equivalence relations was +undertaken to enable runtime checks of broadcasting behaviours in parallel mode. + +ParallelAccelerator features: + +* PR #2400: Array comprehension +* PR #2405: Support printing Numpy arrays +* PR #2438: Support more np.random functions in ParallelAccelerator +* PR #2482: Support for sum with axis in nopython mode. 
+* PR #2487: Adding developer documentation for ParallelAccelerator technology. +* PR #2492: Core PA refactor adds assertions for broadcast semantics + +ParallelAccelerator fixes: + +* PR #2478: Rename cfg before parfor translation (#2477) +* PR #2479: Fix broken array comprehension tests on unsupported platforms +* PR #2484: Fix array comprehension test on win64 +* PR #2506: Fix for 32-bit machines. + + +Additional features of note: + +Support for `np.take`, `np.finfo`, `np.iinfo` and `np.MachAr` in no Python +mode is added. Further, three new environment variables are added, two for +overriding CPU target/features and another to warn if `parallel=True` was set +but no such transform was possible. + +* PR #2490: Implement np.take and ndarray.take +* PR #2493: Display a warning if parallel=True is set but not possible. +* PR #2513: Add np.MachAr, np.finfo, np.iinfo +* PR #2515: Allow environ overriding of cpu target and cpu features. + + +Due to expansion of the test farm and a focus on fixing bugs, the following +fixes were also made. + +Misc fixes/enhancements: + +* PR #2455: add contextual information to runtime errors +* PR #2470: Fixes #2458, poor performance in np.median +* PR #2471: Ensure LLVM threadsafety in {g,}ufunc building. +* PR #2494: Update doc theme +* PR #2503: Remove hacky code added in 2482 and feature enhancement +* PR #2505: Serialise env mutation tests during multithreaded testing. +* PR #2520: Fix failing cpu-target override tests + +CUDA support fixes: + +* PR #2504: Enable CUDA toolkit version testing +* PR #2509: Disable tests generating code unavailable in lower CC versions. +* PR #2511: Fix Windows 64 bit CUDA tests. + + +Version 0.34.0 +-------------- + +This release adds a significant set of new features arising from combined work +with Intel on ParallelAccelerator technology. It also adds list comprehension +and closure support, support for Numpy 1.13 and a new, faster, CUDA reduction +algorithm. 
For Linux users this release is the first to be built on Centos 6, +which will be the new base platform for future releases. Finally a number of +thread-safety, type inference and other smaller enhancements and bugs have been +fixed. + + +ParallelAccelerator features: + +NOTE: The ParallelAccelerator technology is under active development and should +be considered experimental. + +The ParallelAccelerator technology is accessed via a new "nopython" mode option +"parallel". The ParallelAccelerator technology attempts to identify operations +which have parallel semantics (for instance adding a scalar to a vector), fuse +together adjacent such operations, and then parallelize their execution across +a number of CPU cores. This is essentially auto-parallelization. + +In addition to the auto-parallelization feature, explicit loop based +parallelism is made available through the use of `prange` in place of `range` +as a loop iterator. + +More information and examples on both auto-parallelization and `prange` are +available in the documentation and examples directory respectively. + +As part of the necessary work for ParallelAccelerator, support for closures +and list comprehensions is added: + +* PR #2318: Transfer ParallelAccelerator technology to Numba +* PR #2379: ParallelAccelerator Core Improvements +* PR #2367: Add support for len(range(...)) +* PR #2369: List comprehension +* PR #2391: Explicit Parallel Loop Support (prange) + +The ParallelAccelerator features are available on all supported platforms and +Python versions with the exceptions of (with view of supporting in a future +release): + +* The combination of Windows operating systems with Python 2.7. +* Systems running 32 bit Python. 
+ + +CUDA support enhancements: + +* PR #2377: New GPU reduction algorithm + + +CUDA support fixes: + +* PR #2397: Fix #2393, always set alignment of cuda static memory regions + + +Misc Fixes: + +* PR #2373, Issue #2372: 32-bit compatibility fix for parfor related code +* PR #2376: Fix #2375 missing stdint.h for py2.7 vc9 +* PR #2378: Fix deadlock in parallel gufunc when kernel acquires the GIL. +* PR #2382: Forbid unsafe casting in bitwise operation +* PR #2385: docs: fix Sphinx errors +* PR #2396: Use 64-bit RHS operand for shift +* PR #2404: Fix threadsafety logic issue in ufunc compilation cache. +* PR #2424: Ensure consistent iteration order of blocks for type inference. +* PR #2425: Guard code to prevent the use of 'parallel' on win32 + py27 +* PR #2426: Basic test for Enum member type recovery. +* PR #2433: Fix up the parfors tests with respect to windows py2.7 +* PR #2442: Skip tests that need BLAS/LAPACK if scipy is not available. +* PR #2444: Add test for invalid array setitem +* PR #2449: Make the runtime initialiser threadsafe +* PR #2452: Skip CFG test on 64bit windows + + +Misc Enhancements: + +* PR #2366: Improvements to IR utils +* PR #2388: Update README.rst to indicate the proper version of LLVM +* PR #2394: Upgrade to llvmlite 0.19.* +* PR #2395: Update llvmlite version to 0.19 +* PR #2406: Expose environment object to ufuncs +* PR #2407: Expose environment object to target-context inside lowerer +* PR #2413: Add flags to pass through to conda build for buildbot +* PR #2414: Add cross compile flags to local recipe +* PR #2415: A few cleanups for rewrites +* PR #2418: Add getitem support for Enum classes +* PR #2419: Add support for returning enums in vectorize +* PR #2421: Add copyright notice for Intel contributed files. 
+* PR #2422: Patch code base to work with np 1.13 release +* PR #2448: Adds in warning message when using 'parallel' if cache=True +* PR #2450: Add test for keyword arg on .sum-like and .cumsum-like array + methods + + +Version 0.33.0 +-------------- + +This release resolved several performance issues caused by atomic +reference counting operations inside loop bodies. New optimization +passes have been added to reduce the impact of these operations. We +observe speed improvements between 2x-10x in affected programs due to +the removal of unnecessary reference counting operations. + +There are also several enhancements to the CUDA GPU support: + +* A GPU random number generator based on the xoroshiro128+ algorithm is added. + See details and examples in the documentation. +* ``@cuda.jit`` CUDA kernels can now call ``@jit`` and ``@njit`` + CPU functions and they will automatically be compiled as CUDA device + functions. +* CUDA IPC memory API is exposed for sharing memory between processes. + See usage details in the documentation. + +Reference counting enhancements: + +* PR #2346, Issue #2345, #2248: Add extra refcount pruning after inlining +* PR #2349: Fix refct pruning not removing refct op with tail call. +* PR #2352, Issue #2350: Add refcount pruning pass for function that does not need refcount + +CUDA support enhancements: + +* PR #2023: Supports CUDA IPC for device array +* PR #2343, Issue #2335: Allow CPU jit decorated function to be used as cuda device function +* PR #2347: Add random number generator support for CUDA device code +* PR #2361: Update autotune table for CC: 5.3, 6.0, 6.1, 6.2 + +Misc fixes: + +* PR #2362: Avoid test failure due to typing to int32 on 32-bit platforms +* PR #2359: Fixed nogil example that threw a TypeError when executed. +* PR #2357, Issue #2356: Fix fragile test that depends on how the script is executed. 
+* PR #2355: Fix cpu dispatcher referenced as attribute of another module +* PR #2354: Fixes an issue with caching when function needs NRT and refcount pruning +* PR #2342, Issue #2339: Add warnings to inspection when it is used on unserialized cached code +* PR #2329, Issue #2250: Better handling of missing op codes + +Misc enhancements: + +* PR #2360: Adds missing values in error mesasge interp. +* PR #2353: Handle when get_host_cpu_features() raises RuntimeError +* PR #2351: Enable SVML for erf/erfc/gamma/lgamma/log2 +* PR #2344: Expose error_model setting in jit decorator +* PR #2337: Align blocking terminate support for fork() with new TBB version +* PR #2336: Bump llvmlite version to 0.18 +* PR #2330: Core changes in PR #2318 + + +Version 0.32.0 +-------------- + +In this release, we are upgrading to LLVM 4.0. A lot of work has been done +to fix many race-condition issues inside LLVM when the compiler is +used concurrently, which is likely when Numba is used with Dask. + +Improvements: + +* PR #2322: Suppress test error due to unknown but consistent error with tgamma +* PR #2320: Update llvmlite dependency to 0.17 +* PR #2308: Add details to error message on why cuda support is disabled. +* PR #2302: Add os x to travis +* PR #2294: Disable remove_module on MCJIT due to memory leak inside LLVM +* PR #2291: Split parallel tests and recycle workers to tame memory usage +* PR #2253: Remove the pointer-stuffing hack for storing meminfos in lists + +Fixes: + +* PR #2331: Fix a bug in the GPU array indexing +* PR #2326: Fix #2321 docs referring to non-existing function. +* PR #2316: Fixing more race-condition problems +* PR #2315: Fix #2314. Relax strict type check to allow optional type. +* PR #2310: Fix race condition due to concurrent compilation and cache loading +* PR #2304: Fix intrinsic 1st arg not a typing.Context as stated by the docs. 
+* PR #2287: Fix int64 atomic min-max +* PR #2286: Fix #2285 `@overload_method` not linking dependent libs +* PR #2303: Missing import statements to interval-example.rst + + +Version 0.31.0 +-------------- + +In this release, we added preliminary support for debugging with GDB +version >= 7.0. The feature is enabled by setting the ``debug=True`` compiler +option, which causes GDB compatible debug info to be generated. +The CUDA backend also gained limited debugging support so that source locations +are shown in memory-checking and profiling tools. +For details, see :ref:`numba-troubleshooting`. + +Also, we added the ``fastmath=True`` compiler option to enable unsafe +floating-point transformations, which allows LLVM to auto-vectorize more code. + +Other important changes include upgrading to LLVM 3.9.1 and adding support for +Numpy 1.12. + +Improvements: + +* PR #2281: Update for numpy1.12 +* PR #2278: Add CUDA atomic.{max, min, compare_and_swap} +* PR #2277: Add about section to conda recipies to identify license and other + metadata in Anaconda Cloud +* PR #2271: Adopt itanium C++-style mangling for CPU and CUDA targets +* PR #2267: Add fastmath flags +* PR #2261: Support dtype.type +* PR #2249: Changes for llvm3.9 +* PR #2234: Bump llvmlite requirement to 0.16 and add install_name_tool_fixer to + mviewbuf for OS X +* PR #2230: Add python3.6 to TravisCi +* PR #2227: Enable caching for gufunc wrapper +* PR #2170: Add debugging support +* PR #2037: inspect_cfg() for easier visualization of the function operation + +Fixes: + +* PR #2274: Fix nvvm ir patch in mishandling "load" +* PR #2272: Fix breakage to cuda7.5 +* PR #2269: Fix caching of copy_strides kernel in cuda.reduce +* PR #2265: Fix #2263: error when linking two modules with dynamic globals +* PR #2252: Fix path separator in test +* PR #2246: Fix overuse of memory in some system with fork +* PR #2241: Fix #2240: __module__ in dynamically created function not a str +* PR #2239: Fix fingerprint computation 
failure preventing fallback + + +Version 0.30.1 +-------------- + +This is a bug-fix release to enable Python 3.6 support. In addition, +there is now early Intel TBB support for parallel ufuncs when building from +source with TBBROOT defined. The TBB feature is not enabled in our official +builds. + +Fixes: + +* PR #2232: Fix name clashes with _Py_hashtable_xxx in Python 3.6. + +Improvements: + +* PR #2217: Add Intel TBB threadpool implementation for parallel ufunc. + + +Version 0.30.0 +-------------- + +This release adds preliminary support for Python 3.6, but no official build is +available yet. A new system reporting tool (``numba --sysinfo``) is added to +provide system information to help core developers in replication and debugging. +See below for other improvements and bug fixes. + +Improvements: + +* PR #2209: Support Python 3.6. +* PR #2175: Support ``np.trace()``, ``np.outer()`` and ``np.kron()``. +* PR #2197: Support ``np.nanprod()``. +* PR #2190: Support caching for ufunc. +* PR #2186: Add system reporting tool. + +Fixes: + +* PR #2214, Issue #2212: Fix memory error with ndenumerate and flat iterators. +* PR #2206, Issue #2163: Fix ``zip()`` consuming extra elements in early + exhaustion. +* PR #2185, Issue #2159, #2169: Fix rewrite pass affecting objmode fallback. +* PR #2204, Issue #2178: Fix annotation for liftedloop. +* PR #2203: Fix Appveyor segfault with Python 3.5. +* PR #2202, Issue #2198: Fix target context not initialized when loading from + ufunc cache. +* PR #2172, Issue #2171: Fix optional type unpacking. +* PR #2189, Issue #2188: Disable freezing of big (>1MB) global arrays. +* PR #2180, Issue #2179: Fix invalid variable version in looplifting. +* PR #2156, Issue #2155: Fix divmod, floordiv segfault on CUDA. + + +Version 0.29.0 +-------------- + +This release extends the support of recursive functions to include direct and +indirect recursion without explicit function type annotations. See new example +in `examples/mergesort.py`. 
Newly supported numpy features include array +stacking functions, np.linalg.eig* functions, np.linalg.matrix_power, np.roots +and array to array broadcasting in assignments. + +This release depends on llvmlite 0.14.0 and supports CUDA 8 but it is not +required. + +Improvements: + +* PR #2130, #2137: Add type-inferred recursion with docs and examples. +* PR #2134: Add ``np.linalg.matrix_power``. +* PR #2125: Add ``np.roots``. +* PR #2129: Add ``np.linalg.{eigvals,eigh,eigvalsh}``. +* PR #2126: Add array-to-array broadcasting. +* PR #2069: Add hstack and related functions. +* PR #2128: Allow for vectorizing a jitted function. (thanks to @dhirschfeld) +* PR #2117: Update examples and make them test-able. +* PR #2127: Refactor interpreter class and its results. + +Fixes: + +* PR #2149: Workaround MSVC9.0 SP1 fmod bug kb982107. +* PR #2145, Issue #2009: Fixes kwargs for jitclass ``__init__`` method. +* PR #2150: Fix slowdown in objmode fallback. +* PR #2050, Issue #1259: Fix liveness problem with some generator loops. +* PR #2072, Issue #1995: Right shift of unsigned LHS should be logical. +* PR #2115, Issue #1466: Fix inspect_types() error due to mangled variable name. +* PR #2119, Issue #2118: Fix array type created from record-dtype. +* PR #2122, Issue #1808: Fix returning a generator due to datamodel error. + + +Version 0.28.1 +-------------- + +This is a bug-fix release to resolve packaging issues with setuptools +dependency. + + +Version 0.28.0 +-------------- + +Amongst other improvements, this version improves again the level of +support for linear algebra -- functions from the :mod:`numpy.linalg` +module. Also, our random generator is now guaranteed to be thread-safe +and fork-safe. + +Improvements: + +* PR #2019: Add the ``@intrinsic`` decorator to define low-level + subroutines callable from JIT functions (this is considered + a private API for now). +* PR #2059: Implement ``np.concatenate`` and ``np.stack``. 
+* PR #2048: Make random generation fork-safe and thread-safe, producing + independent streams of random numbers for each thread or process. +* PR #2031: Add documentation of floating-point pitfalls. +* Issue #2053: Avoid polling in parallel CPU target (fixes severe performance + regression on Windows). +* Issue #2029: Make default arguments fast. +* PR #2052: Add logging to the CUDA driver. +* PR #2049: Implement the built-in ``divmod()`` function. +* PR #2036: Implement the ``argsort()`` method on arrays. +* PR #2046: Improving CUDA memory management by deferring deallocations + until certain thresholds are reached, so as to avoid breaking asynchronous + execution. +* PR #2040: Switch the CUDA driver implementation to use CUDA's + "primary context" API. +* PR #2017: Allow ``min(tuple)`` and ``max(tuple)``. +* PR #2039: Reduce fork() detection overhead in CUDA. +* PR #2021: Handle structured dtypes with titles. +* PR #1996: Rewrite looplifting as a transformation on Numba IR. +* PR #2014: Implement ``np.linalg.matrix_rank``. +* PR #2012: Implement ``np.linalg.cond``. +* PR #1985: Rewrite even trivial array expressions, which opens the door + for other optimizations (for example, ``array ** 2`` can be converted + into ``array * array``). +* PR #1950: Have ``typeof()`` always raise ValueError on failure. + Previously, it would either raise or return None, depending on the input. +* PR #1994: Implement ``np.linalg.norm``. +* PR #1987: Implement ``np.linalg.det`` and ``np.linalg.slogdet``. +* Issue #1979: Document integer width inference and how to workaround. +* PR #1938: Numba is now compatible with LLVM 3.8. +* PR #1967: Restrict ``np.linalg`` functions to homogeneous dtypes. Users + wanting to pass mixed-typed inputs have to convert explicitly, which + makes the performance implications more obvious. + +Fixes: + +* PR #2006: ``array(float32) ** int`` should return ``array(float32)``. +* PR #2044: Allow reshaping empty arrays. 
+* Issue #2051: Fix refcounting issue when concatenating tuples. +* Issue #2000: Make Numpy optional for setup.py, to allow ``pip install`` + to work without Numpy pre-installed. +* PR #1989: Fix assertion in ``Dispatcher.disable_compile()``. +* Issue #2028: Ignore filesystem errors when caching from multiple processes. +* Issue #2003: Allow unicode variable and function names (on Python 3). +* Issue #1998: Fix deadlock in parallel ufuncs that reacquire the GIL. +* PR #1997: Fix random crashes when AOT compiling on certain Windows platforms. +* Issue #1988: Propagate jitclass docstring. +* Issue #1933: Ensure array constants are emitted with the right alignment. + + +Version 0.27.0 +-------------- + +Improvements: + +* Issue #1976: improve error message when non-integral dimensions are given + to a CUDA kernel. +* PR #1970: Optimize the power operator with a static exponent. +* PR #1710: Improve contextual information for compiler errors. +* PR #1961: Support printing constant strings. +* PR #1959: Support more types in the print() function. +* PR #1823: Support ``compute_50`` in CUDA backend. +* PR #1955: Support ``np.linalg.pinv``. +* PR #1896: Improve the ``SmartArray`` API. +* PR #1947: Support ``np.linalg.solve``. +* Issue #1943: Improve error message when an argument fails typing. +* PR #1927: Support ``np.linalg.lstsq``. +* PR #1934: Use system functions for hypot() where possible, instead of our + own implementation. +* PR #1929: Add cffi support to ``@cfunc`` objects. +* PR #1932: Add user-controllable thread pool limits for parallel CPU target. +* PR #1928: Support self-recursion when the signature is explicit. +* PR #1890: List all lowering implementations in the developer docs. +* Issue #1884: Support ``np.lib.stride_tricks.as_strided()``. + +Fixes: + +* Issue #1960: Fix sliced assignment when source and destination areas are + overlapping. +* PR #1963: Make CUDA print() atomic. +* PR #1956: Allow 0d array constants. 
+* Issue #1945: Allow using Numpy ufuncs in AOT compiled code. +* Issue #1916: Fix documentation example for ``@generated_jit``. +* Issue #1926: Fix regression when caching functions in an IPython session. +* Issue #1923: Allow non-intp integer arguments to carray() and farray(). +* Issue #1908: Accept non-ASCII unicode docstrings on Python 2. +* Issue #1874: Allow ``del container[key]`` in object mode. +* Issue #1913: Fix set insertion bug when the lookup chain contains deleted + entries. +* Issue #1911: Allow function annotations on jitclass methods. + + +Version 0.26.0 +-------------- + +This release adds support for ``cfunc`` decorator for exporting numba jitted +functions to 3rd party API that takes C callbacks. Most of the overhead of +using jitclasses inside the interpreter is eliminated. Support for +decompositions in ``numpy.linalg`` is added. Finally, Numpy 1.11 is +supported. + +Improvements: + +* PR #1889: Export BLAS and LAPACK wrappers for pycc. +* PR #1888: Faster array power. +* Issue #1867: Allow "out" keyword arg for dufuncs. +* PR #1871: ``carray()`` and ``farray()`` for creating arrays from pointers. +* PR #1855: ``@cfunc`` decorator for exporting as ctypes function. +* PR #1862: Add support for ``numpy.linalg.qr``. +* PR #1851: jitclass support for '_' and '__' prefixed attributes. +* PR #1842: Optimize jitclass in Python interpreter. +* Issue #1837: Fix CUDA simulator issues with device function. +* PR #1839: Add support for decompositions from ``numpy.linalg``. +* PR #1829: Support Python enums. +* PR #1828: Add support for ``numpy.random.rand()`` and + ``numpy.random.randn()`` +* Issue #1825: Use of 0-darray in place of scalar index. +* Issue #1824: Scalar arguments to object mode gufuncs. +* Issue #1813: Let bitwise bool operators return booleans, not integers. +* Issue #1760: Optional arguments in generators. +* PR #1780: Numpy 1.11 support. 
+ + +Version 0.25.0 +-------------- + +This release adds support for ``set`` objects in nopython mode. It also +adds support for many missing Numpy features and functions. It improves +Numba's compatibility and performance when using a distributed execution +framework such as dask, distributed or Spark. Finally, it removes +compatibility with Python 2.6, Python 3.3 and Numpy 1.6. + +Improvements: + +* Issue #1800: Add erf(), erfc(), gamma() and lgamma() to CUDA targets. +* PR #1793: Implement more Numpy functions: np.bincount(), np.diff(), + np.digitize(), np.histogram(), np.searchsorted() as well as NaN-aware + reduction functions (np.nansum(), np.nanmedian(), etc.) +* PR #1789: Optimize some reduction functions such as np.sum(), np.prod(), + np.median(), etc. +* PR #1752: Make CUDA features work in dask, distributed and Spark. +* PR #1787: Support np.nditer() for fast multi-array indexing with + broadcasting. +* PR #1799: Report JIT-compiled functions as regular Python functions + when profiling (allowing to see the filename and line number where a + function is defined). +* PR #1782: Support np.any() and np.all(). +* Issue #1788: Support the iter() and next() built-in functions. +* PR #1778: Support array.astype(). +* Issue #1775: Allow the user to set the target CPU model for AOT compilation. +* PR #1758: Support creating random arrays using the ``size`` parameter + to the np.random APIs. +* PR #1757: Support len() on array.flat objects. +* PR #1749: Remove Numpy 1.6 compatibility. +* PR #1748: Remove Python 2.6 and 3.3 compatibility. +* PR #1735: Support the ``not in`` operator as well as operator.contains(). +* PR #1724: Support homogeneous sets in nopython mode. +* Issue #875: make compilation of array constants faster. + +Fixes: + +* PR #1795: Fix a massive performance issue when calling Numba functions + with distributed, Spark or a similar mechanism using serialization. +* Issue #1784: Make jitclasses usable with NUMBA_DISABLE_JIT=1. 
+* Issue #1786: Allow using linear algebra functions when profiling. +* Issue #1796: Fix np.dot() memory leak on non-contiguous inputs. +* PR #1792: Fix static negative indexing of tuples. +* Issue #1771: Use fallback cache directory when __pycache__ isn't writable, + such as when user code is installed in a system location. +* Issue #1223: Use Numpy error model in array expressions (e.g. division + by zero returns ``inf`` or ``nan`` instead of raising an error). +* Issue #1640: Fix np.random.binomial() for large n values. +* Issue #1643: Improve error reporting when passing an invalid spec to + ``@jitclass``. +* PR #1756: Fix slicing with a negative step and an omitted start. + + +Version 0.24.0 +-------------- + +This release introduces several major changes, including the ``@generated_jit`` +decorator for flexible specializations as with Julia's "``@generated``" macro, +or the SmartArray array wrapper type that allows seamless transfer of array +data between the CPU and the GPU. + +This will be the last version to support Python 2.6, Python 3.3 and Numpy 1.6. + +Improvements: + +* PR #1723: Improve compatibility of JIT functions with the Python profiler. +* PR #1509: Support array.ravel() and array.flatten(). +* PR #1676: Add SmartArray type to support transparent data management in + multiple address spaces (host & GPU). +* PR #1689: Reduce startup overhead of importing Numba. +* PR #1705: Support registration of CFFI types as corresponding to known + Numba types. +* PR #1686: Document the extension API. +* PR #1698: Improve warnings raised during type inference. +* PR #1697: Support np.dot() and friends on non-contiguous arrays. +* PR #1692: cffi.from_buffer() improvements (allow more pointer types, + allow non-Numpy buffer objects). +* PR #1648: Add the ``@generated_jit`` decorator. +* PR #1651: Implementation of np.linalg.inv using LAPACK. Thanks to + Matthieu Dartiailh. +* PR #1674: Support np.diag(). 
+* PR #1673: Improve error message when looking up an attribute on an + unknown global. +* Issue #1569: Implement runtime check for the LLVM locale bug. +* PR #1612: Switch to LLVM 3.7 in sync with llvmlite. +* PR #1624: Allow slice assignment of sequence to array. +* PR #1622: Support slicing tuples with a constant slice. + +Fixes: + +* Issue #1722: Fix returning an optional boolean (bool or None). +* Issue #1734: NRT decref bug when variable is del'ed before being defined, + leading to a possible memory leak. +* PR #1732: Fix tuple getitem regression for CUDA target. +* PR #1718: Mishandling of optional to optional casting. +* PR #1714: Fix .compile() on a JIT function not respecting ._can_compile. +* Issue #1667: Fix np.angle() on arrays. +* Issue #1690: Fix slicing with an omitted stop and a negative step value. +* PR #1693: Fix gufunc bug in handling scalar formal arg with non-scalar + input value. +* PR #1683: Fix parallel testing under Windows. +* Issue #1616: Use system-provided versions of C99 math where possible. +* Issue #1652: Reductions of bool arrays (e.g. sum() or mean()) should + return integers or floats, not bools. +* Issue #1664: Fix regression when indexing a record array with a constant + index. +* PR #1661: Disable AVX on old Linux kernels. +* Issue #1636: Allow raising an exception looked up on a module. + + +Version 0.23.1 +-------------- + +This is a bug-fix release to address several regressions introduced +in the 0.23.0 release, and a couple other issues. + +Fixes: + +* Issue #1645: CUDA ufuncs were broken in 0.23.0. +* Issue #1638: Check tuple sizes when passing a list of tuples. +* Issue #1630: Parallel ufunc would keep eating CPU even after finishing + under Windows. +* Issue #1628: Fix ctypes and cffi tests under Windows with Python 3.5. +* Issue #1627: Fix xrange() support. +* PR #1611: Rewrite variable liveness analysis. +* Issue #1610: Allow nested calls between explicitly-typed ufuncs. +* Issue #1593: Fix `*args` in object mode. 
+ + +Version 0.23.0 +-------------- + +This release introduces JIT classes using the new ``@jitclass`` decorator, +allowing user-defined structures for nopython mode. Other improvements +and bug fixes are listed below. + +Improvements: + +* PR #1609: Speed up some simple math functions by inlining them + in their caller +* PR #1571: Implement JIT classes +* PR #1584: Improve typing of array indexing +* PR #1583: Allow printing booleans +* PR #1542: Allow negative values in np.reshape() +* PR #1560: Support vector and matrix dot product, including ``np.dot()`` + and the ``@`` operator in Python 3.5 +* PR #1546: Support field lookup on record arrays and scalars (i.e. + ``array['field']`` in addition to ``array.field``) +* PR #1440: Support the HSA wavebarrier() and activelanepermute_wavewidth() + intrinsics +* PR #1540: Support np.angle() +* PR #1543: Implement CPU multithreaded gufuncs (target="parallel") +* PR #1551: Allow scalar arguments in np.where(), np.empty_like(). +* PR #1516: Add some more examples from NumbaPro +* PR #1517: Support np.sinc() + +Fixes: + +* Issue #1603: Fix calling a non-cached function from a cached function +* Issue #1594: Ensure a list is homogeneous when unboxing +* Issue #1595: Replace deprecated use of get_pointer_to_function() +* Issue #1586: Allow tests to be run by different users on the same machine +* Issue #1587: Make CudaAPIError picklable +* Issue #1568: Fix using Numba from inside Visual Studio 2015 +* Issue #1559: Fix serializing a jit function referring a renamed module +* PR #1508: Let reshape() accept integer argument(s), not just a tuple +* Issue #1545: Improve error checking when unboxing list objects +* Issue #1538: Fix array broadcasting in CUDA gufuncs +* Issue #1526: Fix a reference count handling bug + + +Version 0.22.1 +-------------- + +This is a bug-fix release to resolve some packaging issues and other +problems found in the 0.22.0 release. 
+ +Fixes: + +* PR #1515: Include MANIFEST.in in MANIFEST.in so that sdist still works from + source tar files. +* PR #1518: Fix reference counting bug caused by hidden alias +* PR #1519: Fix erroneous assert when passing nopython=True to guvectorize. +* PR #1521: Fix cuda.test() + +Version 0.22.0 +-------------- + +This release features several highlights: Python 3.5 support, Numpy 1.10 +support, Ahead-of-Time compilation of extension modules, additional +vectorization features that were previously only available with the +proprietary extension NumbaPro, improvements in array indexing. + +Improvements: + +* PR #1497: Allow scalar input type instead of size-1 array to @guvectorize +* PR #1480: Add distutils support for AOT compilation +* PR #1460: Create a new API for Ahead-of-Time (AOT) compilation +* PR #1451: Allow passing Python lists to JIT-compiled functions, and + reflect mutations on function return +* PR #1387: Numpy 1.10 support +* PR #1464: Support cffi.FFI.from_buffer() +* PR #1437: Propagate errors raised from Numba-compiled ufuncs; also, + let "division by zero" and other math errors produce a warning instead + of exiting the function early +* PR #1445: Support a subset of fancy indexing +* PR #1454: Support "out-of-line" CFFI modules +* PR #1442: Improve array indexing to support more kinds of basic slicing +* PR #1409: Support explicit CUDA memory fences +* PR #1435: Add support for vectorize() and guvectorize() with HSA +* PR #1432: Implement numpy.nonzero() and numpy.where() +* PR #1416: Add support for vectorize() and guvectorize() with CUDA, + as originally provided in NumbaPro +* PR #1424: Support in-place array operators +* PR #1414: Python 3.5 support +* PR #1404: Add the parallel ufunc functionality originally provided in + NumbaPro +* PR #1393: Implement sorting on arrays and lists +* PR #1415: Add functions to estimate the occupancy of a CUDA kernel +* PR #1360: The JIT cache now stores the compiled object code, yielding + even larger 
speedups. +* PR #1402: Fixes for the ARMv7 (armv7l) architecture under Linux +* PR #1400: Add the cuda.reduce() decorator originally provided in NumbaPro + +Fixes: + +* PR #1483: Allow np.empty_like() and friends on non-contiguous arrays +* Issue #1471: Allow caching JIT functions defined in IPython +* PR #1457: Fix flat indexing of boolean arrays +* PR #1421: Allow calling Numpy ufuncs, without an explicit output, on + non-contiguous arrays +* Issue #1411: Fix crash when unpacking a tuple containing a Numba-allocated array +* Issue #1394: Allow unifying range_state32 and range_state64 +* Issue #1373: Fix code generation error on lists of bools + + +Version 0.21.0 +-------------- + +This release introduces support for AMD's Heterogeneous System Architecture, +which allows memory to be shared directly between the CPU and the GPU. +Other major enhancements are support for lists and the introduction of +an opt-in compilation cache. + +Improvements: + +* PR #1391: Implement print() for CUDA code +* PR #1366: Implement integer typing enhancement proposal (NBEP 1) +* PR #1380: Support the one-argument type() builtin +* PR #1375: Allow boolean evaluation of lists and tuples +* PR #1371: Support array.view() in CUDA mode +* PR #1369: Support named tuples in nopython mode +* PR #1250: Implement numpy.median(). 
+* PR #1289: Make dispatching faster when calling a JIT-compiled function + from regular Python +* Issue #1226: Improve performance of integer power +* PR #1321: Document features supported with CUDA +* PR #1345: HSA support +* PR #1343: Support lists in nopython mode +* PR #1356: Make Numba-allocated memory visible to tracemalloc +* PR #1363: Add an environment variable NUMBA_DEBUG_TYPEINFER +* PR #1051: Add an opt-in, per-function compilation cache + +Fixes: + +* Issue #1372: Some array expressions would fail rewriting when involved + the same variable more than once, or a unary operator +* Issue #1385: Allow CUDA local arrays to be declared anywhere in a function +* Issue #1285: Support datetime64 and timedelta64 in Numpy reduction functions +* Issue #1332: Handle the EXTENDED_ARG opcode. +* PR #1329: Handle the ``in`` operator in object mode +* Issue #1322: Fix augmented slice assignment on Python 2 +* PR #1357: Fix slicing with some negative bounds or step values. + + +Version 0.20.0 +-------------- + +This release updates Numba to use LLVM 3.6 and CUDA 7 for CUDA support. +Following the platform deprecation in CUDA 7, Numba's CUDA feature is no +longer supported on 32-bit platforms. The oldest supported version of +Windows is Windows 7. 
+ +Improvements: + +* Issue #1203: Support indexing ndarray.flat +* PR #1200: Migrate cgutils to llvmlite +* PR #1190: Support more array methods: .transpose(), .T, .copy(), .reshape(), .view() +* PR #1214: Simplify setup.py and avoid manual maintenance +* PR #1217: Support datetime64 and timedelta64 constants +* PR #1236: Reload environment variables when compiling +* PR #1225: Various speed improvements in generated code +* PR #1252: Support cmath module in CUDA +* PR #1238: Use 32-byte aligned allocator to optimize for AVX +* PR #1258: Support numpy.frombuffer() +* PR #1274: Use TravisCI container infrastructure for lower wait time +* PR #1279: Micro-optimize overload resolution in call dispatch +* Issue #1248: Improve error message when return type unification fails + +Fixes: + +* Issue #1131: Handling of negative zeros in np.conjugate() and np.arccos() +* Issue #1188: Fix slow array return +* Issue #1164: Avoid warnings from CUDA context at shutdown +* Issue #1229: Respect the writeable flag in arrays +* Issue #1244: Fix bug in refcount pruning pass +* Issue #1251: Fix partial left-indexing of Fortran contiguous array +* Issue #1264: Fix compilation error in array expression +* Issue #1254: Fix error when yielding array objects +* Issue #1276: Fix nested generator use + + +Version 0.19.2 +-------------- + +This release fixes the source distribution on pypi. The only change is in the +setup.py file. We do not plan to provide a conda package as this release is +essentially the same as 0.19.1 for conda users. + + +Version 0.19.1 +-------------- + +* Issue #1196: + + * fix double-free segfault due to redundant variable deletion in the + Numba IR (#1195) + * fix use-after-delete in array expression rewrite pass + + +Version 0.19.0 +-------------- + +This version introduces memory management in the Numba runtime, allowing to +allocate new arrays inside Numba-compiled functions. 
There is also a rework +of the ufunc infrastructure, and an optimization pass to collapse cascading +array operations into a single efficient loop. + +.. warning:: + Support for Windows XP and Vista with all compiler targets and support + for 32-bit platforms (Win/Mac/Linux) with the CUDA compiler target are + deprecated. In the next release of Numba, the oldest version of Windows + supported will be Windows 7. CPU compilation will remain supported + on 32-bit Linux and Windows platforms. + +Known issues: + +* There are some performance regressions in very short running ``nopython`` + functions due to the additional overhead incurred by memory management. + We will work to reduce this overhead in future releases. + +Features: + +* Issue #1181: Add a Frequently Asked Questions section to the documentation. +* Issue #1162: Support the ``cumsum()`` and ``cumprod()`` methods on Numpy + arrays. +* Issue #1152: Support the ``*args`` argument-passing style. +* Issue #1147: Allow passing character sequences as arguments to + JIT-compiled functions. +* Issue #1110: Shortcut deforestation and loop fusion for array expressions. +* Issue #1136: Support various Numpy array constructors, for example + numpy.zeros() and numpy.zeros_like(). +* Issue #1127: Add a CUDA simulator running on the CPU, enabled with the + NUMBA_ENABLE_CUDASIM environment variable. +* Issue #1086: Allow calling standard Numpy ufuncs without an explicit + output array from ``nopython`` functions. +* Issue #1113: Support keyword arguments when calling numpy.empty() + and related functions. +* Issue #1108: Support the ``ctypes.data`` attribute of Numpy arrays. +* Issue #1077: Memory management for array allocations in ``nopython`` mode. +* Issue #1105: Support calling a ctypes function that takes ctypes.py_object + parameters. +* Issue #1084: Environment variable NUMBA_DISABLE_JIT disables compilation + of ``@jit`` functions, instead calling into the Python interpreter + when called. 
This allows easier debugging of multiple jitted functions. +* Issue #927: Allow gufuncs with no output array. +* Issue #1097: Support comparisons between tuples. +* Issue #1075: Numba-generated ufuncs can now be called from ``nopython`` + functions. +* Issue #1062: ``@vectorize`` now allows omitting the signatures, and will + compile the required specializations on the fly (like ``@jit`` does). +* Issue #1027: Support numpy.round(). +* Issue #1085: Allow returning a character sequence (as fetched from a + structured array) from a JIT-compiled function. + +Fixes: + +* Issue #1170: Ensure ``ndindex()``, ``ndenumerate()`` and ``ndarray.flat`` + work properly inside generators. +* Issue #1151: Disallow unpacking of tuples with the wrong size. +* Issue #1141: Specify install dependencies in setup.py. +* Issue #1106: Loop-lifting would fail when the lifted loop does not + produce any output values for the function tail. +* Issue #1103: Fix mishandling of some inputs when a JIT-compiled function + is called with multiple array layouts. +* Issue #1089: Fix range() with large unsigned integers. +* Issue #1088: Install entry-point scripts (numba, pycc) from the conda + build recipe. +* Issue #1081: Constant structured scalars now work properly. +* Issue #1080: Fix automatic promotion of booleans to integers. + + +Version 0.18.2 +-------------- + +Bug fixes: + +* Issue #1073: Fixes missing template file for HTML annotation +* Issue #1074: Fixes CUDA support on Windows machine due to NVVM API mismatch + + +Version 0.18.1 +-------------- + +Version 0.18.0 is not officially released. + +This version removes the old deprecated and undocumented ``argtypes`` and +``restype`` arguments to the ``@jit`` decorator. Function signatures +should always be passed as the first argument to ``@jit``. 
+ +Features: + +* Issue #960: Add inspect_llvm() and inspect_asm() methods to JIT-compiled + functions: they output the LLVM IR and the native assembler source of the + compiled function, respectively. +* Issue #990: Allow passing tuples as arguments to JIT-compiled functions + in ``nopython`` mode. +* Issue #774: Support two-argument round() in ``nopython`` mode. +* Issue #987: Support missing functions from the math module in nopython + mode: frexp(), ldexp(), gamma(), lgamma(), erf(), erfc(). +* Issue #995: Improve code generation for round() on Python 3. +* Issue #981: Support functions from the random and numpy.random modules + in ``nopython`` mode. +* Issue #979: Add cuda.atomic.max(). +* Issue #1006: Improve exception raising and reporting. It is now allowed + to raise an exception with an error message in ``nopython`` mode. +* Issue #821: Allow ctypes- and cffi-defined functions as arguments to + ``nopython`` functions. +* Issue #901: Allow multiple explicit signatures with ``@jit``. The + signatures must be passed in a list, as with ``@vectorize``. +* Issue #884: Better error message when a JIT-compiled function is called + with the wrong types. +* Issue #1010: Simpler and faster CUDA argument marshalling thanks to a + refactoring of the data model. +* Issue #1018: Support arrays of scalars inside Numpy structured types. +* Issue #808: Reduce Numba import time by half. +* Issue #1021: Support the buffer protocol in ``nopython`` mode. + Buffer-providing objects, such as ``bytearray``, ``array.array`` or + ``memoryview`` support array-like operations such as indexing and iterating. + Furthermore, some standard attributes on the ``memoryview`` object are + supported. +* Issue #1030: Support nested arrays in Numpy structured arrays. +* Issue #1033: Implement the inspect_types(), inspect_llvm() and inspect_asm() + methods for CUDA kernels. +* Issue #1029: Support Numpy structured arrays with CUDA as well. 
+* Issue #1034: Support for generators in nopython and object mode. +* Issue #1044: Support default argument values when calling Numba-compiled + functions. +* Issue #1048: Allow calling Numpy scalar constructors from CUDA functions. +* Issue #1047: Allow indexing a multi-dimensional array with a single integer, + to take a view. +* Issue #1050: Support len() on tuples. +* Issue #1011: Revive HTML annotation. + +Fixes: + +* Issue #977: Assignment optimization was too aggressive. +* Issue #561: One-argument round() now returns an int on Python 3. +* Issue #1001: Fix an unlikely bug where two closures with the same name + and id() would compile to the same LLVM function name, despite different + closure values. +* Issue #1006: Fix reference leak when a JIT-compiled function is disposed of. +* Issue #1017: Update instructions for CUDA in the README. +* Issue #1008: Generate shorter LLVM type names to avoid segfaults with CUDA. +* Issue #1005: Properly clean up references when raising an exception from + object mode. +* Issue #1041: Fix incompatibility between Numba and the third-party + library "future". +* Issue #1053: Fix the size attribute of CUDA shared arrays. + + +Version 0.17.0 +-------------- + +The major focus in this release has been a rewrite of the documentation. +The new documentation is better structured and has more detailed coverage +of Numba features and APIs. It can be found online at +https://numba.pydata.org/numba-doc/dev/index.html + +Features: + +* Issue #895: LLVM can now inline nested function calls in ``nopython`` mode. +* Issue #863: CUDA kernels can now infer the types of their arguments + ("autojit"-like). +* Issue #833: Support numpy.{min,max,argmin,argmax,sum,mean,var,std} + in ``nopython`` mode. +* Issue #905: Add a ``nogil`` argument to the ``@jit`` decorator, to + release the GIL in ``nopython`` mode. +* Issue #829: Add a ``identity`` argument to ``@vectorize`` and + ``@guvectorize``, to set the identity value of the ufunc. 
+* Issue #843: Allow indexing 0-d arrays with the empty tuple. +* Issue #933: Allow named arguments, not only positional arguments, when + calling a Numba-compiled function. +* Issue #902: Support numpy.ndenumerate() in ``nopython`` mode. +* Issue #950: AVX is now enabled by default except on Sandy Bridge and + Ivy Bridge CPUs, where it can produce slower code than SSE. +* Issue #956: Support constant arrays of structured type. +* Issue #959: Indexing arrays with floating-point numbers isn't allowed + anymore. +* Issue #955: Add support for 3D CUDA grids and thread blocks. +* Issue #902: Support numpy.ndindex() in ``nopython`` mode. +* Issue #951: Numpy number types (``numpy.int8``, etc.) can be used as + constructors for type conversion in ``nopython`` mode. + +Fixes: + +* Issue #889: Fix ``NUMBA_DUMP_ASSEMBLY`` for the CUDA backend. +* Issue #903: Fix calling of stdcall functions with ctypes under Windows. +* Issue #908: Allow lazy-compiling from several threads at once. +* Issue #868: Wrong error message when multiplying a scalar by a non-scalar. +* Issue #917: Allow vectorizing with datetime64 and timedelta64 in the + signature (only with unit-less values, though, because of a Numpy limitation). +* Issue #431: Allow overloading of cuda device function. +* Issue #917: Print out errors occurred in object mode ufuncs. +* Issue #923: Numba-compiled ufuncs now inherit the name and doc of the + original Python function. +* Issue #928: Fix boolean return value in nested calls. +* Issue #915: ``@jit`` called with an explicit signature with a mismatching + type of arguments now raises an error. +* Issue #784: Fix the truth value of NaNs. +* Issue #953: Fix using shared memory in more than one function (kernel or + device). +* Issue #970: Fix an uncommon double to uint64 conversion bug on CentOS5 + 32-bit (C compiler issue). + + +Version 0.16.0 +-------------- + +This release contains a major refactor to switch from llvmpy to `llvmlite `_ +as our code generation backend. 
The switch is necessary to reconcile +different compiler requirements for LLVM 3.5 (needs C++11) and Python +extensions (need specific compiler versions on Windows). As a bonus, we have +found the use of llvmlite speeds up compilation by a factor of 2! + +Other Major Changes: + +* Faster dispatch for numpy structured arrays +* Optimized array.flat() +* Improved CPU feature selection +* Fix constant tuple regression in macro expansion code + +Known Issues: + +* AVX code generation is still disabled by default due to performance + regressions when operating on misaligned NumPy arrays. We hope to have a + workaround in the future. +* In *extremely* rare circumstances, a `known issue with LLVM 3.5 `_ + code generation can cause an ELF relocation error on 64-bit Linux systems. + + +Version 0.15.1 +-------------- + +(This was a bug-fix release that superseded version 0.15 before it was +announced.) + +Fixes: + +* Workaround for missing __ftol2 on Windows XP. +* Do not lift loops for compilation that contain break statements. +* Fix a bug in loop-lifting when multiple values need to be returned to + the enclosing scope. +* Handle the loop-lifting case where an accumulator needs to be updated when + the loop count is zero. + +Version 0.15 +------------ + +Features: + +* Support for the Python ``cmath`` module. (NumPy complex functions were + already supported.) +* Support for ``.real``, ``.imag``, and ``.conjugate()`` on non-complex + numbers. +* Add support for ``math.isfinite()`` and ``math.copysign()``. +* Compatibility mode: If enabled (off by default), a failure to compile in + object mode will fall back to using the pure Python implementation of the + function. +* *Experimental* support for serializing JIT functions with cloudpickle. +* Loop-jitting in object mode now works with loops that modify scalars that + are accessed after the loop, such as accumulators. +* ``@vectorize`` functions can be compiled in object mode. 
+* Numba can now be built using the `Visual C++ Compiler for Python 2.7 `_ + on Windows platforms. +* CUDA JIT functions can be returned by factory functions with variables in + the closure frozen as constants. +* Support for "optional" types in nopython mode, which allow ``None`` to be a + valid value. + +Fixes: + +* If nopython mode compilation fails for any reason, automatically fall back + to object mode (unless nopython=True is passed to @jit) rather than raise + an exception. +* Allow function objects to be returned from a function compiled in object + mode. +* Fix a linking problem that caused slower platform math functions (such as + ``exp()``) to be used on Windows, leading to performance regressions against + NumPy. +* ``min()`` and ``max()`` no longer accept scalar arguments in nopython mode. +* Fix handling of ambiguous type promotion among several compiled versions of a + JIT function. The dispatcher will now compile a new version to resolve the + problem. (issue #776) +* Fix float32 to uint64 casting bug on 32-bit Linux. +* Fix type inference to allow forced casting of return types. +* Allow the shape of a 1D ``cuda.shared.array`` and ``cuda.local.array`` to be + a one-element tuple. +* More correct handling of signed zeros. +* Add custom implementation of ``atan2()`` on Windows to handle special cases + properly. +* Eliminated race condition in the handling of the pagelocked staging area + used when transferring CUDA arrays. +* Fix non-deterministic type unification leading to varying performance. + (issue #797) + + +Version 0.14 +------------ + +Features: + +* Support for nearly all the Numpy math functions (including comparison, + logical, bitwise and some previously missing float functions) in nopython mode. +* The Numpy datetime64 and timedelta64 dtypes are supported in nopython mode + with Numpy 1.7 and later. +* Support for Numpy math functions on complex numbers in nopython mode. +* ndarray.sum() is supported in nopython mode. 
+* Better error messages when unsupported types are used in Numpy math functions. +* Set NUMBA_WARNINGS=1 in the environment to see which functions are compiled + in object mode vs. nopython mode. +* Add support for the two-argument pow() builtin function in nopython mode. +* New developer documentation describing how Numba works, and how to + add new types. +* Support for Numpy record arrays on the GPU. (Note: Improper alignment of dtype + fields will cause an exception to be raised.) +* Slices on GPU device arrays. +* GPU objects can be used as Python context managers to select the active + device in a block. +* GPU device arrays can be bound to a CUDA stream. All subsequent operations + (such as memory copies) will be queued on that stream instead of the default. + This can prevent unnecessary synchronization with other streams. + +Fixes: + +* Generation of AVX instructions has been disabled to avoid performance bugs + when calling external math functions that may use SSE instructions, + especially on OS X. +* JIT functions can be removed by the garbage collector when they are no + longer accessible. +* Various other reference counting fixes to prevent memory leaks. +* Fixed handling of exception when input argument is out of range. +* Prevent autojit functions from making unsafe numeric conversions when + called with different numeric types. +* Fix a compilation error when an unhashable global value is accessed. +* Gracefully handle failure to enable faulthandler in the IPython Notebook. +* Fix a bug that caused loop lifting to fail if the loop was inside an + ``else`` block. +* Fixed a problem with selecting CUDA devices in multithreaded programs on + Linux. +* The ``pow()`` function (and ``**`` operation) applied to two integers now + returns an integer rather than a float. +* Numpy arrays using the object dtype no longer cause an exception in the + autojit. 
+* Attempts to write to a global array will cause compilation to fall back + to object mode, rather than attempt and fail at nopython mode. +* ``range()`` works with all negative arguments (ex: ``range(-10, -12, -1)``) + +Version 0.13.4 +-------------- + +Features: + +* Setting and deleting attributes in object mode +* Added documentation of supported and currently unsupported numpy ufuncs +* Assignment to 1-D numpy array slices +* Closure variables and functions can be used in object mode +* All numeric global values in modules can be used as constants in JIT + compiled code +* Support for the start argument in enumerate() +* Inplace arithmetic operations (+=, -=, etc.) +* Direct iteration over a 1D numpy array (e.g. "for x in array: ...") + in nopython mode + +Fixes: + +* Support for NVIDIA compute capability 5.0 devices (such as the GTX 750) +* Vectorize no longer crashes/gives an error when bool\_ is used as return type +* Return the correct dictionary when globals() is used in JIT functions +* Fix crash bug when creating dictionary literals in object +* Report more informative error message on import if llvmpy is too old +* Temporarily disable pycc --header, which generates incorrect function + signatures. + +Version 0.13.3 +-------------- + +Features: + +* Support for enumerate() and zip() in nopython mode +* Increased LLVM optimization of JIT functions to -O1, enabling automatic + vectorization of compiled code in some cases +* Iteration over tuples and unpacking of tuples in nopython mode +* Support for dict and set (Python >= 2.7) literals in object mode + +Fixes: + +* JIT functions have the same __name__ and __doc__ as the original function. +* Numerous improvements to better match the data types and behavior of Python + math functions in JIT compiled code on different platforms. +* Importing Numba will no longer throw an exception if the CUDA driver is + present, but cannot be initialized. 
+* guvectorize now properly supports functions with scalar arguments. +* CUDA driver is lazily initialized + +Version 0.13.2 +-------------- + +Features: + +* @vectorize ufunc now can generate SIMD fast path for unit strided array +* Added cuda.gridsize +* Added preliminary exception handling (raise exception class) + +Fixes: + +* UNARY_POSITIVE +* Handling of closures and dynamically generated functions +* Global None value + +Version 0.13.1 +-------------- + +Features: + +* Initial support for CUDA array slicing + +Fixes: + +* Indirectly fixes numbapro when the system has an incompatible CUDA driver +* Fix numba.cuda.detect +* Export numba.intp and numba.intc + +Version 0.13 +------------ + +Features: + +* Opensourcing NumbaPro CUDA python support in `numba.cuda` +* Add support for ufunc array broadcasting +* Add support for mixed input types for ufuncs +* Add support for returning tuple from jitted function + +Fixes: + +* Fix store slice bytecode handling for Python2 +* Fix inplace subtract +* Fix pycc so that correct header is emitted +* Allow vectorize to work on functions with jit decorator + + +Version 0.12.2 +-------------- + +Fixes: + +* Improved NumPy ufunc support in nopython mode +* Misc bug fixes + + +Version 0.12.1 +-------------- + +This version fixed many regressions reported by users for the 0.12 release. +This release contains a new loop-lifting mechanism that specializes certain +loop patterns for nopython mode compilation. This avoids direct support +for heap-allocating and other very dynamic operations. + +Improvements: + +* Add loop-lifting--jit-ing loops in nopython for object mode code. This allows + functions to allocate NumPy arrays and use Python objects, while the tight + loops in the function can still be compiled in nopython mode. Any arrays that + the tight loop uses should be created before the loop is entered. 
+ +Fixes: + +* Add support for majority of "math" module functions +* Fix for...else handling +* Add support for builtin round() +* Fix ternary if...else support +* Revive "numba" script +* Fix problems with some boolean expressions +* Add support for more NumPy ufuncs + + +Version 0.12 +------------ + +Version 0.12 contains a big refactor of the compiler. The main objective for +this refactor was to simplify the code base to create a better foundation for +further work. A secondary objective was to improve the worst case performance +to ensure that compiled functions in object mode never run slower than pure +Python code (this was a problem in several cases with the old code base). This +refactor is still a work in progress and further testing is needed. + +Main improvements: + +* Major refactor of compiler for performance and maintenance reasons +* Better fallback to object mode when native mode fails +* Improved worst case performance in object mode + +The public interface of numba has been slightly changed. The idea is to +make it cleaner and more rational: + +* jit decorator has been modified, so that it can be called without a signature. + When called without a signature, it behaves as the old autojit. Autojit + has been deprecated in favour of this approach. +* Jitted functions can now be overloaded. +* Added a "njit" decorator that behaves like "jit" decorator with nopython=True. +* The numba.vectorize namespace is gone. The vectorize decorator will + be in the main numba namespace. +* Added a guvectorize decorator in the main numba namespace. It is + similar to numba.vectorize, but takes a dimension signature. It + generates gufuncs. This is a replacement for the GUVectorize gufunc + factory which has been deprecated. 
+ +Main regressions (will be fixed in a future release): + +* Creating new NumPy arrays is not supported in nopython mode +* Returning NumPy arrays is not supported in nopython mode +* NumPy array slicing is not supported in nopython mode +* lists and tuples are not supported in nopython mode +* string, datetime, cdecimal, and struct types are not implemented yet +* Extension types (classes) are not supported in nopython mode +* Closures are not supported +* Raise keyword is not supported +* Recursion is not supported in nopython mode + +Version 0.11 +------------ +* Experimental support for NumPy datetime type + +Version 0.10 +------------ +* Annotation tool (./bin/numba --annotate --fancy) (thanks to Jay Bourque) +* Open sourced prange +* Support for raise statement +* Pluggable array representation +* Support for enumerate and zip (thanks to Eugene Toder) +* Better string formatting support (thanks to Eugene Toder) +* Builtins min(), max() and bool() (thanks to Eugene Toder) +* Fix some code reloading issues (thanks to Björn Linse) +* Recognize NumPy scalar objects (thanks to Björn Linse) + + +Version 0.9 +----------- +* Improved math support +* Open sourced generalized ufuncs +* Improved array expressions + +Version 0.8 +----------- +* Support for autojit classes + * Inheritance not yet supported +* Python 3 support for pycc +* Allow retrieval of ctypes function wrapper + * And hence support retrieval of a pointer to the function +* Fixed a memory leak of array slicing views + +Version 0.7.2 +------------- +* Official Python 3 support (python 3.2 and 3.3) +* Support for intrinsics and instructions +* Various bug fixes (see https://github.com/numba/numba/issues?milestone=7&state=closed) + +Version 0.7.1 +------------- +* Various bug fixes + +Version 0.7 +----------- +* Open sourced single-threaded ufunc vectorizer +* Open sourced NumPy array expression compilation +* Open sourced fast NumPy array slicing +* Experimental Python 3 support +* Support for typed 
containers + * typed lists and tuples +* Support for iteration over objects +* Support object comparisons +* Preliminary CFFI support + * Jit calls to CFFI functions (passed into autojit functions) + * TODO: Recognize ffi_lib.my_func attributes +* Improved support for ctypes +* Allow declaring extension attribute types as through class attributes +* Support for type casting in Python + * Get the same semantics with or without numba compilation +* Support for recursion + * For jit methods and extension classes +* Allow jit functions as C callbacks +* Friendlier error reporting +* Internal improvements +* A variety of bug fixes + +Version 0.6.1 +-------------- +* Support for bitwise operations + +Version 0.6 +-------------- +* Python 2.6 support +* Programmable typing + * Allow users to add type inference for external code +* Better NumPy type inference + * outer, inner, dot, vdot, tensordot, nonzero, where, + binary ufuncs + methods (reduce, accumulate, reduceat, outer) +* Type based alias analysis + * Support for strict aliasing +* Much faster autojit dispatch when calling from Python +* Faster numerical loops through data and stride pre-loading +* Integral overflow and underflow checking for conversions from objects +* Make Meta dependency optional + +Version 0.5 +-------------- +* SSA-based type inference + * Allows variable reuse + * Allow referring to variables before lexical definition +* Support multiple comparisons +* Support for template types +* List comprehensions +* Support for pointers +* Many bug fixes +* Added user documentation + +Version 0.4 +-------------- + +Version 0.3.2 +-------------- + +* Add support for object arithmetic (issue 56). +* Bug fixes (issue 55). 
+ +Version 0.3 +-------------- +* Changed default compilation approach to ast +* Added support for cross-module linking +* Added support for closures (can jit inner functions and return them) (see examples/closure.py) +* Added support for dtype structures (can access elements of structure with attribute access) (see examples/structures.py) +* Added support for extension types (numba classes) (see examples/numbaclasses.py) +* Added support for general Python code (use nopython to raise an error if Python C-API is used to avoid unexpected slowness because of lack of implementation defaulting to generic Python) +* Fixed many bugs +* Added support to detect math operations. +* Added with python and with nopython contexts +* Added more examples + +Many features need to be documented still. Look at examples and tests for more information. + + +Version 0.2 +-------------- +* Added an ast approach to compilation +* Removed d, f, i, b from numba namespace (use f8, f4, i4, b1) +* Changed function to autojit2 +* Added autojit function to decorate calls to the function and use types of the variable to create compiled versions. +* changed keyword arguments to jit and autojit functions to restype and argtypes to be consistent with ctypes module. +* Added pycc -- a python to shared library compiler diff --git a/cv/3d_detection/centerpoint/pytorch/numba/CONTRIBUTING.md b/cv/3d_detection/centerpoint/pytorch/numba/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..59d6b807c098da986ad2c943d2199c33c0affc83 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/CONTRIBUTING.md @@ -0,0 +1,53 @@ + +We welcome people who want to make contributions to Numba, big or small! +Even simple documentation improvements are encouraged. 
+ +# Asking questions + +Numba has a [discourse forum](https://numba.discourse.group/) for longer/more +involved questions and an IRC channel on +[gitter.im](https://gitter.im/numba/numba) for quick questions and interactive +help. + +# Ways to help: + +There are lots of ways to help improve Numba, some of these require creating code +changes, see **contributing patches** below. + +## Quick things: + +* Answer a question asked on [discourse](https://numba.discourse.group/) or + [gitter.im](https://gitter.im/numba/numba). +* Review a page of documentation, check it makes sense, that it's clear and + still relevant, that the examples are present, good and working. Fix anything + that needs updating in a pull request. +* Make a file that is not `flake8` compliant meet the standard, a list of all + failing files is in the `exclude` section of the [`.flake8` config](https://github.com/numba/numba/blob/main/.flake8), + then create a pull request with the change. + +## More involved things: + +* Review a pull request, you don't need to be a compiler engineer to do an + initial review of a pull request. It's incredibly helpful to have pull + requests go through a review to just make sure the code change is well formed, + documented, efficient and clear. Further, if the code is fixing a bug, making + sure that tests are present demonstrating it is fixed! Look out for PRs with + the [`needs initial review`](https://github.com/numba/numba/labels/needs%20initial%20review) + label. +* Work on fixing or implementing something in the code base, there are a lot of + [`good first issue's`](https://github.com/numba/numba/labels/good%20first%20issue) + and [`good second issue's`](https://github.com/numba/numba/labels/good%20second%20issue). 
+ For implementing new features/functionality, the extension API is the best + thing to use and a guide to using `@overload` in particular is + [here](https://numba.pydata.org/numba-doc/dev/extending/overloading-guide.html) + and the API documentation is [here](https://numba.pydata.org/numba-doc/latest/extending/high-level.html#implementing-functions). + +## Contributing patches + +Please fork the Numba repository on Github, and create a new branch +containing your work. When you are done, open a pull request. + +# Further reading + +Please read the [contributing guide]( +https://numba.pydata.org/numba-doc/dev/developer/contributing.html). diff --git a/cv/3d_detection/centerpoint/pytorch/numba/LICENSE b/cv/3d_detection/centerpoint/pytorch/numba/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..7d19426e7a09d04cef6dbd2a2857434036362ee1 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/LICENSE @@ -0,0 +1,24 @@ +Copyright (c) 2012, Anaconda, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/LICENSES.third-party b/cv/3d_detection/centerpoint/pytorch/numba/LICENSES.third-party new file mode 100644 index 0000000000000000000000000000000000000000..056142b01e67ea0f3accc504908dda7fb325af61 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/LICENSES.third-party @@ -0,0 +1,493 @@ +The Numba source tree includes vendored libraries governed by the following +licenses. + + +appdirs +------- + +# This is the MIT license + +Copyright (c) 2010 ActiveState Software Inc. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +NetworkX +-------- +The dominance frontier algorithm is from a pull request +https://github.com/numba/numba/pull/4149/files which is based +on the implementation of NetworkX of dominance. NetworkX has the following +license: + +NetworkX is distributed with the 3-clause BSD license. + +:: + + Copyright (C) 2004-2019, NetworkX Developers + Aric Hagberg + Dan Schult + Pieter Swart + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the NetworkX Developers nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +jquery.graphviz.svg (https://github.com/mountainstorm/jquery.graphviz.svg/) +--------------------------------------------------------------------------- +The DAG roadmap rendering code in docs/dagmap/ uses Javascript from this +package to draw graphs in HTML. + +Copyright (c) 2015 Mountainstorm +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ + + +CPython (https://github.com/python/cpython) +------------------------------------------- +Numba source code that references URLs starting with: + +https://github.com/python/cpython/ + +relates to use/inclusion of CPython source code which has the following license: + +A. HISTORY OF THE SOFTWARE +========================== + +Python was created in the early 1990s by Guido van Rossum at Stichting +Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands +as a successor of a language called ABC. Guido remains Python's +principal author, although it includes many contributions from others. + +In 1995, Guido continued his work on Python at the Corporation for +National Research Initiatives (CNRI, see http://www.cnri.reston.va.us) +in Reston, Virginia where he released several versions of the +software. + +In May 2000, Guido and the Python core development team moved to +BeOpen.com to form the BeOpen PythonLabs team. In October of the same +year, the PythonLabs team moved to Digital Creations, which became +Zope Corporation. In 2001, the Python Software Foundation (PSF, see +https://www.python.org/psf/) was formed, a non-profit organization +created specifically to own Python-related Intellectual Property. +Zope Corporation was a sponsoring member of the PSF. + +All Python releases are Open Source (see http://www.opensource.org for +the Open Source Definition). Historically, most, but not all, Python +releases have also been GPL-compatible; the table below summarizes +the various releases. + + Release Derived Year Owner GPL- + from compatible? 
(1) + + 0.9.0 thru 1.2 1991-1995 CWI yes + 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes + 1.6 1.5.2 2000 CNRI no + 2.0 1.6 2000 BeOpen.com no + 1.6.1 1.6 2001 CNRI yes (2) + 2.1 2.0+1.6.1 2001 PSF no + 2.0.1 2.0+1.6.1 2001 PSF yes + 2.1.1 2.1+2.0.1 2001 PSF yes + 2.1.2 2.1.1 2002 PSF yes + 2.1.3 2.1.2 2002 PSF yes + 2.2 and above 2.1.1 2001-now PSF yes + +Footnotes: + +(1) GPL-compatible doesn't mean that we're distributing Python under + the GPL. All Python licenses, unlike the GPL, let you distribute + a modified version without making your changes open source. The + GPL-compatible licenses make it possible to combine Python with + other software that is released under the GPL; the others don't. + +(2) According to Richard Stallman, 1.6.1 is not GPL-compatible, + because its license has a choice of law clause. According to + CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 + is "not incompatible" with the GPL. + +Thanks to the many outside volunteers who have worked under Guido's +direction to make these releases possible. + + +B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON +=============================================================== + +PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +-------------------------------------------- + +1. This LICENSE AGREEMENT is between the Python Software Foundation +("PSF"), and the Individual or Organization ("Licensee") accessing and +otherwise using this software ("Python") in source or binary form and +its associated documentation. + +2. 
Subject to the terms and conditions of this License Agreement, PSF hereby +grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, +analyze, test, perform and/or display publicly, prepare derivative works, +distribute, and otherwise use Python alone or in any derivative version, +provided, however, that PSF's License Agreement and PSF's notice of copyright, +i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019 Python Software Foundation; +All Rights Reserved" are retained in Python alone or in any derivative version +prepared by Licensee. + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python. + +4. PSF is making Python available to Licensee on an "AS IS" +basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. Nothing in this License Agreement shall be deemed to create any +relationship of agency, partnership, or joint venture between PSF and +Licensee. 
This License Agreement does not grant permission to use PSF +trademarks or trade name in a trademark sense to endorse or promote +products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using Python, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 +------------------------------------------- + +BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 + +1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an +office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the +Individual or Organization ("Licensee") accessing and otherwise using +this software in source or binary form and its associated +documentation ("the Software"). + +2. Subject to the terms and conditions of this BeOpen Python License +Agreement, BeOpen hereby grants Licensee a non-exclusive, +royalty-free, world-wide license to reproduce, analyze, test, perform +and/or display publicly, prepare derivative works, distribute, and +otherwise use the Software alone or in any derivative version, +provided, however, that the BeOpen Python License is retained in the +Software, alone or in any derivative version prepared by Licensee. + +3. BeOpen is making the Software available to Licensee on an "AS IS" +basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE +SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS +AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY +DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +5. 
This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +6. This License Agreement shall be governed by and interpreted in all +respects by the law of the State of California, excluding conflict of +law provisions. Nothing in this License Agreement shall be deemed to +create any relationship of agency, partnership, or joint venture +between BeOpen and Licensee. This License Agreement does not grant +permission to use BeOpen trademarks or trade names in a trademark +sense to endorse or promote products or services of Licensee, or any +third party. As an exception, the "BeOpen Python" logos available at +http://www.pythonlabs.com/logos.html may be used according to the +permissions granted on that web page. + +7. By copying, installing or otherwise using the software, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 +--------------------------------------- + +1. This LICENSE AGREEMENT is between the Corporation for National +Research Initiatives, having an office at 1895 Preston White Drive, +Reston, VA 20191 ("CNRI"), and the Individual or Organization +("Licensee") accessing and otherwise using Python 1.6.1 software in +source or binary form and its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, CNRI +hereby grants Licensee a nonexclusive, royalty-free, world-wide +license to reproduce, analyze, test, perform and/or display publicly, +prepare derivative works, distribute, and otherwise use Python 1.6.1 +alone or in any derivative version, provided, however, that CNRI's +License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) +1995-2001 Corporation for National Research Initiatives; All Rights +Reserved" are retained in Python 1.6.1 alone or in any derivative +version prepared by Licensee. 
Alternately, in lieu of CNRI's License +Agreement, Licensee may substitute the following text (omitting the +quotes): "Python 1.6.1 is made available subject to the terms and +conditions in CNRI's License Agreement. This Agreement together with +Python 1.6.1 may be located on the Internet using the following +unique, persistent identifier (known as a handle): 1895.22/1013. This +Agreement may also be obtained from a proxy server on the Internet +using the following URL: http://hdl.handle.net/1895.22/1013". + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python 1.6.1 or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python 1.6.1. + +4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" +basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. This License Agreement shall be governed by the federal +intellectual property law of the United States, including without +limitation the federal copyright law, and, to the extent such +U.S. federal law does not apply, by the law of the Commonwealth of +Virginia, excluding Virginia's conflict of law provisions. 
+Notwithstanding the foregoing, with regard to derivative works based +on Python 1.6.1 that incorporate non-separable material that was +previously distributed under the GNU General Public License (GPL), the +law of the Commonwealth of Virginia shall govern this License +Agreement only as to issues arising under or with respect to +Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this +License Agreement shall be deemed to create any relationship of +agency, partnership, or joint venture between CNRI and Licensee. This +License Agreement does not grant permission to use CNRI trademarks or +trade name in a trademark sense to endorse or promote products or +services of Licensee, or any third party. + +8. By clicking on the "ACCEPT" button where indicated, or by copying, +installing or otherwise using Python 1.6.1, Licensee agrees to be +bound by the terms and conditions of this License Agreement. + + ACCEPT + + +CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 +-------------------------------------------------- + +Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, +The Netherlands. All rights reserved. + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of Stichting Mathematisch +Centrum or CWI not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. 
+ +STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO +THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE +FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + +CPython unicode (https://github.com/python/cpython) +--------------------------------------------------- +Numba's unicode support includes source code/algorithms from CPython's unicode +implementation, Numba source code that has a reference starting with: + +https://github.com/python/cpython/ + +and contains in the path "Objects/unicodeobject.c" relates to use/inclusion of +CPython source code which has the following license along with the standard +CPython license: + + +Unicode implementation based on original code by Fredrik Lundh, +modified by Marc-Andre Lemburg . + +Major speed upgrades to the method implementations at the Reykjavik +NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. + +Copyright (c) Corporation for National Research Initiatives. 
+ +-------------------------------------------------------------------- +The original string type implementation is: + + Copyright (c) 1999 by Secret Labs AB + Copyright (c) 1999 by Fredrik Lundh + +By obtaining, using, and/or copying this software and/or its +associated documentation, you agree that you have read, understood, +and will comply with the following terms and conditions: + +Permission to use, copy, modify, and distribute this software and its +associated documentation for any purpose and without fee is hereby +granted, provided that the above copyright notice appears in all +copies, and that both that copyright notice and this permission notice +appear in supporting documentation, and that the name of Secret Labs +AB or the author not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. + +SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO +THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR +ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +-------------------------------------------------------------------- + + +cloudpickle +----------- + +This module was extracted from the `cloud` package, developed by +PiCloud, Inc. + +Copyright (c) 2015, Cloudpickle contributors. +Copyright (c) 2012, Regents of the University of California. +Copyright (c) 2009 PiCloud, Inc. http://www.picloud.com. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the University of California, Berkeley nor the + names of its contributors may be used to endorse or promote + products derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +© 2020 GitHub, Inc. + + +NumPy (https://github.com/numpy/numpy) +-------------------------------------- +Numba source code that references URLs starting with: + +https://github.com/numpy/numpy + +relates to use of/inclusion of/derivate work based on NumPy source code which +has the following license: + + +Copyright (c) 2005-2021, NumPy Developers. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the NumPy Developers nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/MANIFEST.in b/cv/3d_detection/centerpoint/pytorch/numba/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..449dab31c5c798c11952781b4ae158cf9f7c607a --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/MANIFEST.in @@ -0,0 +1,10 @@ +include MANIFEST.in +include README.rst setup.py runtests.py versioneer.py CHANGE_LOG LICENSE + +recursive-include numba *.c *.cpp *.h *.hpp *.inc +recursive-include docs *.ipynb *.txt *.py Makefile *.rst +recursive-include examples *.py + +prune docs/_build +prune docs/gh-pages +include numba/_version.py diff --git a/cv/3d_detection/centerpoint/pytorch/numba/README.rst b/cv/3d_detection/centerpoint/pytorch/numba/README.rst new file mode 100644 index 0000000000000000000000000000000000000000..48b2855b6d25b90b4e192aa7afae3f3a1820efc1 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/README.rst @@ -0,0 +1,61 @@ +***** +Numba +***** + +.. image:: https://badges.gitter.im/numba/numba.svg + :target: https://gitter.im/numba/numba?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge + :alt: Gitter + +.. image:: https://img.shields.io/badge/discuss-on%20discourse-blue + :target: https://numba.discourse.group/ + :alt: Discourse + +.. image:: https://zenodo.org/badge/3659275.svg + :target: https://zenodo.org/badge/latestdoi/3659275 + :alt: Zenodo DOI + +.. image:: https://img.shields.io/pypi/v/numba.svg + :target: https://pypi.python.org/pypi/numba/ + :alt: PyPI + +.. image:: https://dev.azure.com/numba/numba/_apis/build/status/numba.numba?branchName=main + :target: https://dev.azure.com/numba/numba/_build/latest?definitionId=1?branchName=main + :alt: Azure Pipelines + +A Just-In-Time Compiler for Numerical Functions in Python +######################################################### + +Numba is an open source, NumPy-aware optimizing compiler for Python sponsored +by Anaconda, Inc. 
It uses the LLVM compiler project to generate machine code +from Python syntax. + +Numba can compile a large subset of numerically-focused Python, including many +NumPy functions. Additionally, Numba has support for automatic +parallelization of loops, generation of GPU-accelerated code, and creation of +ufuncs and C callbacks. + +For more information about Numba, see the Numba homepage: +https://numba.pydata.org and the online documentation: +https://numba.readthedocs.io/en/stable/index.html + +Installation +============ + +Please follow the instructions: + +https://numba.readthedocs.io/en/stable/user/installing.html + +Demo +==== + +Please have a look at the demo notebooks via the mybinder service: + +https://mybinder.org/v2/gh/numba/numba-examples/master?filepath=notebooks + +Contact +======= + +Numba has a discourse forum for discussions: + +* https://numba.discourse.group + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/azure-pipelines.yml b/cv/3d_detection/centerpoint/pytorch/numba/azure-pipelines.yml new file mode 100644 index 0000000000000000000000000000000000000000..79980d31b71c221cb3b6644ec7daf7c4ea5653a1 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/azure-pipelines.yml @@ -0,0 +1,129 @@ +trigger: + batch: true + +variables: + # Change the following along with adding new TEST_START_INDEX. 
+ TEST_COUNT: 20 + +jobs: +# Mac and Linux use the same template with different matrixes +- template: buildscripts/azure/azure-linux-macos.yml + parameters: + name: macOS + vmImage: macos-11 + matrix: + py37_np118: + PYTHON: '3.7' + NUMPY: '1.18' + CONDA_ENV: 'azure_ci' + TEST_START_INDEX: 0 + py310_np123: + PYTHON: '3.10' + NUMPY: '1.23' + CONDA_ENV: 'azure_ci' + TEST_THREADING: 'tbb' + TEST_START_INDEX: 1 + +- template: buildscripts/azure/azure-linux-macos.yml + parameters: + name: Linux + vmImage: ubuntu-20.04 + matrix: + py37_np118_32bit: + # 32 bit linux only has np 1.15 + PYTHON: '3.7' + NUMPY: '1.18' + CONDA_ENV: azure_ci + BITS32: yes + TEST_START_INDEX: 2 + py37_np118_vanilla: + PYTHON: '3.7' + NUMPY: '1.18' + CONDA_ENV: azure_ci + VANILLA_INSTALL: yes + TEST_START_INDEX: 3 + py38_np118_cov: + PYTHON: '3.8' + NUMPY: '1.18' + CONDA_ENV: azure_ci + RUN_COVERAGE: yes + RUN_FLAKE8: yes + RUN_MYPY: yes + TEST_START_INDEX: 4 + py38_np119_tbb: + PYTHON: '3.8' + NUMPY: '1.19.2=*_0' + CONDA_ENV: azure_ci + TEST_THREADING: 'tbb' + TEST_START_INDEX: 5 + py38_np119_omp: + PYTHON: '3.8' + NUMPY: '1.19.2=*_0' + CONDA_ENV: azure_ci + TEST_THREADING: omp + TEST_START_INDEX: 6 + py38_np119_workqueue: + PYTHON: '3.8' + NUMPY: '1.19.2=*_0' + CONDA_ENV: azure_ci + TEST_THREADING: workqueue + TEST_START_INDEX: 7 + py38_np120_doc: + PYTHON: '3.8' + NUMPY: '1.20' + CONDA_ENV: azure_ci + BUILD_DOC: yes + TEST_START_INDEX: 8 + py38_np120_pickle5: + PYTHON: '3.8' + NUMPY: '1.20' + CONDA_ENV: azure_ci + TEST_PICKLE5: yes + TEST_START_INDEX: 9 + py38_np120_svml: + PYTHON: '3.8' + NUMPY: '1.20' + CONDA_ENV: azure_ci + TEST_SVML: yes + TEST_START_INDEX: 10 + py38_np122: + PYTHON: '3.8' + NUMPY: '1.22' + CONDA_ENV: azure_ci + TEST_START_INDEX: 11 + py39_np119: + PYTHON: '3.9' + NUMPY: '1.19.2=*_0' + CONDA_ENV: azure_ci + TEST_START_INDEX: 12 + py39_np120_typeguard: + PYTHON: '3.9' + NUMPY: '1.20' + CONDA_ENV: azure_ci + RUN_TYPEGUARD: yes + TEST_START_INDEX: 13 + py39_np121: + PYTHON: 
'3.9' + NUMPY: '1.21' + CONDA_ENV: azure_ci + TEST_START_INDEX: 14 + py39_np123: + PYTHON: '3.9' + NUMPY: '1.23' + CONDA_ENV: azure_ci + TEST_START_INDEX: 15 + py310_np121: + PYTHON: '3.10' + NUMPY: '1.21' + CONDA_ENV: azure_ci + TEST_START_INDEX: 16 + py310_np123: + PYTHON: '3.10' + NUMPY: '1.23' + CONDA_ENV: azure_ci + TEST_START_INDEX: 17 + +- template: buildscripts/azure/azure-windows.yml + parameters: + name: Windows + vmImage: windows-2019 diff --git a/cv/3d_detection/centerpoint/pytorch/numba/bin/numba b/cv/3d_detection/centerpoint/pytorch/numba/bin/numba new file mode 100755 index 0000000000000000000000000000000000000000..32e7180df65620d842c786cffaa40520d77d0e87 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/bin/numba @@ -0,0 +1,8 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +from __future__ import print_function, division, absolute_import + +from numba.misc.numba_entry import main + +if __name__ == "__main__": + main() diff --git a/cv/3d_detection/centerpoint/pytorch/numba/build_numba.sh b/cv/3d_detection/centerpoint/pytorch/numba/build_numba.sh new file mode 100644 index 0000000000000000000000000000000000000000..2e4dda01e0c85731d710b0ebf24a30f91c879671 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/build_numba.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +clang_version=`clang --version | grep "clang version 16."` +if [[ "${clang_version}" != "" ]]; then + echo "Not support LLVM16 now!" 
+ exit 0 +fi + +COREX_VERSION=${COREX_VERSION:-latest} + +PYTHON_PATH=$(which python3) + +if [[ "${COREX_VERSION}" == "latest" ]]; then + COREX_VERSION=`date --utc +%Y%m%d%H%M%S` +fi +export NUMBA_LOCAL_IDENTIFIER="corex.${COREX_VERSION}" + +${PYTHON_PATH} setup.py bdist_wheel -d build_pip 2>&1 | tee compile.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + +# Return 0 status if all finished +exit 0 diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/appveyor/run_with_env.cmd b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/appveyor/run_with_env.cmd new file mode 100644 index 0000000000000000000000000000000000000000..3a56e3e840e6feee53b14cc745320b7ccdd72914 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/appveyor/run_with_env.cmd @@ -0,0 +1,90 @@ +:: From https://github.com/ogrisel/python-appveyor-demo +:: +:: To build extensions for 64 bit Python 3, we need to configure environment +:: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: +:: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) +:: +:: To build extensions for 64 bit Python 2, we need to configure environment +:: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: +:: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) +:: +:: 32 bit builds, and 64-bit builds for 3.5 and beyond, do not require specific +:: environment configurations. 
+:: +:: Note: this script needs to be run with the /E:ON and /V:ON flags for the +:: cmd interpreter, at least for (SDK v7.0) +:: +:: More details at: +:: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows +:: http://stackoverflow.com/a/13751649/163740 +:: +:: Author: Olivier Grisel +:: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ +:: +:: Notes about batch files for Python people: +:: +:: Quotes in values are literally part of the values: +:: SET FOO="bar" +:: FOO is now five characters long: " b a r " +:: If you don't want quotes, don't include them on the right-hand side. +:: +:: The CALL lines at the end of this file look redundant, but if you move them +:: outside of the IF clauses, they do not run properly in the SET_SDK_64==Y +:: case, I don't know why. +@ECHO OFF + +SET COMMAND_TO_RUN=%* +SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows +SET WIN_WDK=c:\Program Files (x86)\Windows Kits\10\Include\wdf + +:: Extract the major and minor versions, and allow for the minor version to be +:: more than 9. This requires the version number to have two dots in it. +SET MAJOR_PYTHON_VERSION=%PYTHON:~0,1% +IF "%PYTHON:~3,1%" == "." ( + SET MINOR_PYTHON_VERSION=%PYTHON:~2,1% +) ELSE ( + SET MINOR_PYTHON_VERSION=%PYTHON:~2,2% +) + +:: Based on the Python version, determine what SDK version to use, and whether +:: to set the SDK for 64-bit. 
+IF %MAJOR_PYTHON_VERSION% == 2 ( + SET WINDOWS_SDK_VERSION="v7.0" + SET SET_SDK_64=Y +) ELSE ( + IF %MAJOR_PYTHON_VERSION% == 3 ( + SET WINDOWS_SDK_VERSION="v7.1" + IF %MINOR_PYTHON_VERSION% LEQ 4 ( + SET SET_SDK_64=Y + ) ELSE ( + SET SET_SDK_64=N + IF EXIST "%WIN_WDK%" ( + :: See: https://connect.microsoft.com/VisualStudio/feedback/details/1610302/ + REN "%WIN_WDK%" 0wdf + ) + ) + ) ELSE ( + ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" + EXIT 1 + ) +) + +IF %ARCH% == 64 ( + IF %SET_SDK_64% == Y ( + ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture + SET DISTUTILS_USE_SDK=1 + SET MSSdk=1 + "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% + "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT 1 + ) ELSE ( + ECHO Using default MSVC build environment for 64 bit architecture + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT 1 + ) +) ELSE ( + ECHO Using default MSVC build environment for 32 bit architecture + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT 1 +) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/azure/azure-linux-macos.yml b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/azure/azure-linux-macos.yml new file mode 100644 index 0000000000000000000000000000000000000000..74cd37f411e4d9c8a6a271a62f988d5d95912b99 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/azure/azure-linux-macos.yml @@ -0,0 +1,47 @@ +parameters: + name: '' + vmImage: '' + matrix: [] + +jobs: +- job: ${{ parameters.name }} + pool: + vmImage: ${{ parameters.vmImage }} + strategy: + matrix: + ${{ insert }}: ${{ parameters.matrix }} + + steps: + - script: | + if [ "$(uname)" == "Linux" ] && [[ "$CONDA_SUBDIR" == "linux-32" || "$BITS32" == "yes" ]]; then sudo apt-get install -y libc6-dev-i386; fi + if [ 
"$(uname)" == "Linux" ] && [[ "$CONDA_SUBDIR" != "linux-32" && "$BITS32" != "yes" ]]; then sudo apt-get install -y gdb; fi + echo "Installing Miniconda" + buildscripts/incremental/install_miniconda.sh + export PATH=$HOME/miniconda3/bin:$PATH + echo "Setting up Conda environment" + buildscripts/incremental/setup_conda_environment.sh + displayName: 'Before Install' + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + buildscripts/incremental/build.sh + displayName: 'Build' + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + conda install -y flake8 + flake8 numba + displayName: 'Flake8' + condition: eq(variables['RUN_FLAKE8'], 'yes') + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + conda install -y mypy + mypy + displayName: 'Mypy' + condition: eq(variables['RUN_MYPY'], 'yes') + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + buildscripts/incremental/test.sh + displayName: 'Test' diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/azure/azure-windows.yml b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/azure/azure-windows.yml new file mode 100644 index 0000000000000000000000000000000000000000..aa38d8b32730d9d8fc25e4d78e6d75ce42c0afde --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/azure/azure-windows.yml @@ -0,0 +1,62 @@ +parameters: + name: '' + vmImage: '' + +jobs: +- job: ${{ parameters.name }} + pool: + vmImage: ${{ parameters.vmImage }} + strategy: + matrix: + py310_np123: + PYTHON: '3.10' + NUMPY: '1.23' + CONDA_ENV: 'testenv' + TEST_START_INDEX: 18 + py37_np118: + PYTHON: '3.7' + NUMPY: '1.18' + CONDA_ENV: 'testenv' + TEST_START_INDEX: 19 + + steps: + - task: CondaEnvironment@1 + inputs: + updateConda: no + packageSpecs: '' + + - script: | + buildscripts\\incremental\\setup_conda_environment.cmd + displayName: 'Before Install' + + - script: | + buildscripts\\incremental\\build.cmd + displayName: 'Build' + + - script: | + call activate %CONDA_ENV% + python -m numba -s + 
displayName: 'Display numba system information' + + - script: | + call activate %CONDA_ENV% + python -m numba.tests.test_runtests + displayName: 'Verify runtests' + + - script: | + call activate %CONDA_ENV% + python -m numba.runtests -l + displayName: 'List discovered tests' + + - script: | + call activate %CONDA_ENV% + set NUMBA_CAPTURED_ERRORS=new_style + echo "Running slice of discovered tests: %TEST_START_INDEX%,None,%TEST_COUNT%" + python -m numba.runtests -b -v -g -m 2 -- numba.tests + displayName: 'Test modified test files' + + - script: | + call activate %CONDA_ENV% + set NUMBA_CAPTURED_ERRORS=new_style + python runtests.py -m 2 -b -j "%TEST_START_INDEX%,None,%TEST_COUNT%" --exclude-tags='long_running' -- numba.tests + displayName: 'Test slice of test files' diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/bld.bat b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/bld.bat new file mode 100644 index 0000000000000000000000000000000000000000..6372f3a4d2e935b7e19b6decad2771060552cbb2 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/bld.bat @@ -0,0 +1,3 @@ +%PYTHON% setup.py build install --single-version-externally-managed --record=record.txt + +exit /b %errorlevel% diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/build.sh b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..f08a733916bdee855ead0dc0b5327dfd9fdbeca3 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/build.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +if [[ "$(uname -s)" == *"Linux"* ]] && [[ "$(uname -p)" == *"86"* ]]; then + EXTRA_BUILD_EXT_FLAGS="--werror --wall" +else + EXTRA_BUILD_EXT_FLAGS="" +fi + +if [[ "$(uname -s)" == *"Linux"* ]] && [[ "$(uname -p)" == *"ppc64le"* ]]; then + # To workaround 
https://github.com/numba/numba/issues/7302 + # because of a python build problem that the -pthread could be stripped. + export CC="$CC -pthread" + export CXX="$CXX -pthread" +fi + +MACOSX_DEPLOYMENT_TARGET=10.10 $PYTHON setup.py build_ext $EXTRA_BUILD_EXT_FLAGS build install --single-version-externally-managed --record=record.txt diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/conda_build_config.yaml b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/conda_build_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9798e4b695fa9dfa89ae59a16b496aba0aed50ab --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/conda_build_config.yaml @@ -0,0 +1,12 @@ +# Numba/llvmlite stack needs an older compiler for backwards compatibility. +c_compiler_version: # [linux] + - 7 # [linux and (x86_64 or ppc64le)] + - 9 # [linux and aarch64] + +cxx_compiler_version: # [linux] + - 7 # [linux and (x86_64 or ppc64le)] + - 9 # [linux and aarch64] + +fortran_compiler_version: # [linux] + - 7 # [linux and (x86_64 or ppc64le)] + - 9 # [linux and aarch64] diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/license.txt b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/license.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d19426e7a09d04cef6dbd2a2857434036362ee1 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/license.txt @@ -0,0 +1,24 @@ +Copyright (c) 2012, Anaconda, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+ +Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/meta.yaml b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad94a68964ffdd69fc463abf3a71afa1bd116b11 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/meta.yaml @@ -0,0 +1,94 @@ +package: + name: numba + version: {{ GIT_DESCRIBE_TAG }} + +source: + path: ../.. + +build: + number: {{ GIT_DESCRIBE_NUMBER|int }} + string: np{{ NPY_VER }}py{{ PY_VER }}h{{ PKG_HASH }}_{{GIT_DESCRIBE_HASH}}_{{ GIT_DESCRIBE_NUMBER }} + entry_points: + - pycc = numba.pycc:main + - numba = numba.misc.numba_entry:main + script_env: + - PY_VCRUNTIME_REDIST + missing_dso_whitelist: # [osx] + # optional dependency: required only when omp is chosen as the backend for + # the threading layer + - lib/libiomp5.dylib # [osx] + ignore_run_exports: + # tbb-devel triggers hard dependency on tbb, this is not the case. 
+ - tbb # [not (armv6l or armv7l or aarch64 or linux32)] + +requirements: + # build and run dependencies are duplicated to avoid setuptools issues + # when we also set install_requires in setup.py + build: + - {{ compiler('c') }} # [not (armv6l or armv7l or aarch64)] + - {{ compiler('cxx') }} # [not (armv6l or armv7l or aarch64)] + # OpenMP headers from llvm needed for OSX. + - llvm-openmp # [osx] + host: + - python + - numpy + - setuptools + - importlib_metadata # [py<39] + # On channel https://anaconda.org/numba/ + - llvmlite 0.39.* + # TBB devel version is to match TBB libs. + # 2020.3 is the last version with the "old" ABI + # NOTE: 2021.1..2021.5 are API compatible for Numba's purposes. + # NOTE: ppc64le exclusion is temporary until packages are more generally + # available. + - tbb-devel >=2021,<2021.6 # [not (armv6l or armv7l or aarch64 or linux32 or ppc64le)] + run: + - python >=3.7 + # NumPy 1.22.0, 1.22.1, 1.22.2 are all broken for ufuncs, see #7756 + - numpy >=1.18, !=1.22.0, !=1.22.1, !=1.22.2, <1.24 + - setuptools + - importlib_metadata # [py<39] + # On channel https://anaconda.org/numba/ + - llvmlite 0.39.* + run_constrained: + # If TBB is present it must be at least version 2021 + - tbb >=2021 # [not (armv6l or armv7l or aarch64 or linux32 or ppc64le)] + # avoid confusion from openblas bugs + - libopenblas !=0.3.6 # [x86_64] + # 0.3.17 buggy on M1 silicon + # https://github.com/xianyi/OpenBLAS/blob/v0.3.20/Changelog.txt#L118 + # https://github.com/numba/numba/issues/7822#issuecomment-1063229855 + # Exclude 0.3.20 too + # https://github.com/numba/numba/issues/8096 + - libopenblas >=0.3.18, !=0.3.20 # [arm64] + # CUDA 10.2 or later is required for CUDA support + - cudatoolkit >=10.2 + # scipy 1.0 or later + - scipy >=1.0 + # CUDA Python 11.6 or later + - cuda-python >=11.6 + +test: + requires: + - jinja2 + # Required to test optional Numba features + - cffi + # temporarily disable scipy testing on ARM, need to build out more packages + - scipy # [not 
(armv6l or armv7l)] + - ipython # [not (armv6l or armv7l or aarch64)] + - setuptools + - tbb >=2021 # [not (armv6l or armv7l or aarch64 or linux32 or ppc64le)] + - llvm-openmp # [osx] + # This is for driving gdb tests + - pexpect # [linux64] + # For testing ipython + - ipykernel + # Need these for AOT. Do not init msvc as it may not be present + - {{ compiler('c') }} # [not (win or armv6l or armv7l or aarch64)] + - {{ compiler('cxx') }} # [not (win or armv6l or armv7l or aarch64)] + +about: + home: https://numba.pydata.org/ + license: BSD + license_file: LICENSE + summary: a just-in-time Python function compiler based on LLVM diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/run_test.bat b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/run_test.bat new file mode 100644 index 0000000000000000000000000000000000000000..077067e06b88e90178d099d1edf2fdc65e96b65c --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/run_test.bat @@ -0,0 +1,19 @@ +set NUMBA_DEVELOPER_MODE=1 +set NUMBA_DISABLE_ERROR_MESSAGE_HIGHLIGHTING=1 +set NUMBA_CAPTURED_ERRORS=new_style +set PYTHONFAULTHANDLER=1 + +@rem Check Numba executables are there +pycc -h +numba -h + +@rem Run system info tool +numba -s + +@rem Check test discovery works +python -m numba.tests.test_runtests + +@rem Run the whole test suite +python -m numba.runtests -b -m -- %TESTS_TO_RUN% + +if errorlevel 1 exit 1 diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/run_test.sh b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/run_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..69cc460084d0038f657c7170094afeb6572b80ef --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe.local/run_test.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +set -e + +export NUMBA_DEVELOPER_MODE=1 +export NUMBA_DISABLE_ERROR_MESSAGE_HIGHLIGHTING=1 
+export NUMBA_CAPTURED_ERRORS="new_style" +export PYTHONFAULTHANDLER=1 + +# Disable NumPy dispatching to AVX512_SKX feature extensions if the chip is +# reported to support the feature and NumPy >= 1.22 as this results in the use +# of low accuracy SVML libm replacements in ufunc loops. +_NPY_CMD='from numba.misc import numba_sysinfo;\ + sysinfo=numba_sysinfo.get_sysinfo();\ + print(sysinfo["NumPy AVX512_SKX detected"] and + sysinfo["NumPy Version"]>="1.22")' +NUMPY_DETECTS_AVX512_SKX_NP_GT_122=$(python -c "$_NPY_CMD") +echo "NumPy >= 1.22 with AVX512_SKX detected: $NUMPY_DETECTS_AVX512_SKX_NP_GT_122" + +if [[ "$NUMPY_DETECTS_AVX512_SKX_NP_GT_122" == "True" ]]; then + export NPY_DISABLE_CPU_FEATURES="AVX512_SKX" +fi + + +unamestr=`uname` +if [[ "$unamestr" == 'Linux' ]]; then + SEGVCATCH=catchsegv +elif [[ "$unamestr" == 'Darwin' ]]; then + SEGVCATCH="" +else + echo Error +fi + +# limit CPUs in use on PPC64LE, fork() issues +# occur on high core count systems +archstr=`uname -m` +if [[ "$archstr" == 'ppc64le' ]]; then + TEST_NPROCS=16 +fi + +# Check Numba executables are there +pycc -h +numba -h + +# run system info tool +numba -s + +# Check test discovery works +python -m numba.tests.test_runtests + +# Run the whole test suite +echo "Running: $SEGVCATCH python -m numba.runtests -b -m $TEST_NPROCS -- $TESTS_TO_RUN" +$SEGVCATCH python -m numba.runtests -b -m $TEST_NPROCS -- $TESTS_TO_RUN diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe_clone_icc_rt/bld.bat b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe_clone_icc_rt/bld.bat new file mode 100644 index 0000000000000000000000000000000000000000..e2fd587c5d71916d5769f58c4335252b4308cd73 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe_clone_icc_rt/bld.bat @@ -0,0 +1,3 @@ +%PYTHON% build.py + +exit /b %errorlevel% diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe_clone_icc_rt/build.sh 
b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe_clone_icc_rt/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..9ed12875c46e8e5361cb553552a559ae87db04e9 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe_clone_icc_rt/build.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -x + +${PYTHON} build.py diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe_clone_icc_rt/meta.yaml b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe_clone_icc_rt/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..76f64a6b10ac47ebbaff3f7a50e832504eed7551 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe_clone_icc_rt/meta.yaml @@ -0,0 +1,38 @@ +{% set version = "2019.3" %} # this is the intel version to get +{% set win_build_number = "203" %} # the build number from the intel windows version +{% set osx_build_number = "199" %} # the build number from the intel osx version +{% set lnx_build_number = "199" %} # the build number from the intel linux version + +package: + name: icc_rt + version: {{ version }} + +build: + number: {{ win_build_number }} # [win] + number: {{ osx_build_number }} # [osx] + number: {{ lnx_build_number }} # [linux] + +source: + - url: https://anaconda.org/intel/icc_rt/{{ version }}/download/win-64/icc_rt-{{version}}-intel_{{win_build_number}}.tar.bz2 # [win] + - md5: d39bae3218457a4ea045763fdcfc1562 # [win] + - sha256: 2c55b8af1dea35ee4648b671050899a93b7eba1b26acad019bf569ca777a944e # [win] + + - url: https://anaconda.org/intel/icc_rt/{{ version }}/download/osx-64/icc_rt-{{version}}-intel_{{osx_build_number}}.tar.bz2 # [osx] + - md5: 064566ac53e729d3f008e32b1f73d1fa # [osx] + - sha256: 54a372b0d8d5b4d750c28ea122851b52ec9aa3cccb8d4cf4a2999494dfda6656 # [osx] + + - url: https://anaconda.org/intel/icc_rt/{{ version }}/download/linux-64/icc_rt-{{version}}-intel_{{lnx_build_number}}.tar.bz2 # 
[linux] + - md5: 306c3ee9491577715dbd76c838147078 # [linux] + - sha256: 4cedd10343d1ab4403af2ff080b47afe5399be550f1c215e5a7c7eceec672516 # [linux] + + - path: scripts + +requirements: + build: + - python>=3 + +about: + license: "Intel" + license_family: "Proprietary" + license_file: LICENSE.txt +summary: Intel ICC runtime. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe_clone_icc_rt/scripts/build.py b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe_clone_icc_rt/scripts/build.py new file mode 100644 index 0000000000000000000000000000000000000000..11eac62822f8087191428544c04c5dc97282b6b8 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/condarecipe_clone_icc_rt/scripts/build.py @@ -0,0 +1,33 @@ +import sys +import os +import shutil + +libdir = {'w': 'Library', + 'l': 'lib', + 'd': 'lib'} + + +def run(): + src_dir = os.environ.get('SRC_DIR') + prefix = os.environ.get('PREFIX') + + libd = libdir.get(sys.platform[0], None) + assert libd is not None + + # remove 'lib' from the prefix so a direct copy from the original + # package can be made + lib_dir = os.path.join(prefix, libd) + shutil.rmtree(lib_dir) + # copy in the original package lib dir + shutil.copytree(os.path.join(src_dir, libd), lib_dir) + + # and copy the license + info_dir = os.path.join(src_dir, 'info') + shutil.copy(os.path.join(info_dir, 'LICENSE.txt'), src_dir) + shutil.rmtree(info_dir) + + +if __name__ == "__main__": + args = sys.argv + assert len(args) == 1 + run() diff --git a/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/gpuci/axis.yaml b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/gpuci/axis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..417971411ca34ebbbbb30f85aa8893b47a5537d3 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/buildscripts/gpuci/axis.yaml @@ -0,0 +1,19 @@ +PYTHON_VER: +- "3.8" + +CUDA_VER: +- "11.2" + +CUDA_TOOLKIT_VER: +- "10.2" +- "11.1" +- "11.2" 
##############################################
# Numba GPU build and test script for CI     #
##############################################
# Expects WORKSPACE, CUDA_VERSION, CUDA_TOOLKIT_VER and PYTHON_VER in the
# environment, and the gpuci_logger / gpuci_mamba_retry helpers on PATH.
set -e

# Set path and build parallel level
export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}

# Set home to the job's workspace
export HOME="$WORKSPACE"

# Switch to project root; also root of repo checkout
cd "$WORKSPACE"

# Determine CUDA release version (strip the trailing ".patch" component)
export CUDA_REL=${CUDA_VERSION%.*}

# Test with NVIDIA Bindings on CUDA 11.5
# The operands are quoted so that an empty/unset CUDA_TOOLKIT_VER selects the
# "else" branch cleanly instead of producing a "[: ==: unary operator
# expected" error.
if [ "$CUDA_TOOLKIT_VER" = "11.5" ]
then
  export NUMBA_CUDA_USE_NVIDIA_BINDING=1;
else
  export NUMBA_CUDA_USE_NVIDIA_BINDING=0;
fi;

################################################################################
# SETUP - Check environment
################################################################################

gpuci_logger "Check environment variables"
# NOTE(review): this prints every environment variable into the CI log;
# confirm that no secrets are exported into this job.
env

gpuci_logger "Check GPU usage"
nvidia-smi

gpuci_logger "Create testing env"
. /opt/conda/etc/profile.d/conda.sh
gpuci_mamba_retry create -n numba_ci -y \
                  "python=${PYTHON_VER}" \
                  "cudatoolkit=${CUDA_TOOLKIT_VER}" \
                  "numba/label/dev::llvmlite" \
                  "numpy=1.21" \
                  "scipy" \
                  "cffi" \
                  "psutil" \
                  "gcc_linux-64=7" \
                  "gxx_linux-64=7" \
                  "setuptools"

conda activate numba_ci

if [ "$NUMBA_CUDA_USE_NVIDIA_BINDING" = "1" ]
then
  gpuci_logger "Install NVIDIA CUDA Python bindings";
  gpuci_mamba_retry install nvidia::cuda-python=11.7.0;
fi;

gpuci_logger "Install numba"
python setup.py develop

gpuci_logger "Check Compiler versions"
# CC / CXX are expected to be exported by the activated compiler packages.
$CC --version
$CXX --version

gpuci_logger "Check conda environment"
conda info
conda config --show-sources

gpuci_logger "Dump system information from Numba"
python -m numba -s

gpuci_logger "Run tests in numba.cuda.tests"
python -m numba.runtests numba.cuda.tests -v -m
#!/bin/bash
# Build the numba extensions in place inside the $CONDA_ENV conda environment
# and install numba in development mode.

source activate
# Quoted so an environment name is never word-split or glob-expanded.
conda activate "$CONDA_ENV"

# Make sure any error below is reported as such
set -v -e

# Query the platform once; the value is reused below.
os_name="$(uname -s)"

# Build numba extensions without silencing compile errors
if [[ "$os_name" == *"Linux"* ]] && [[ "$(uname -p)" == *"86"* ]]; then
    EXTRA_BUILD_EXT_FLAGS="--werror --wall"
else
    EXTRA_BUILD_EXT_FLAGS=""
fi

if [[ "$os_name" == "Darwin" ]]; then
    # The following is suggested in https://docs.conda.io/projects/conda-build/en/latest/resources/compiler-tools.html?highlight=SDK#macos-sdk
    wget -q https://github.com/phracker/MacOSX-SDKs/releases/download/11.3/MacOSX10.10.sdk.tar.xz
    shasum -c ./buildscripts/incremental/MacOSX10.10.sdk.checksum
    tar -xf ./MacOSX10.10.sdk.tar.xz
    # Assign first, export second: `export VAR=$(cmd)` would hide a failing
    # command from `set -e`.
    SDKROOT="$(pwd)/MacOSX10.10.sdk"
    export SDKROOT
fi
# EXTRA_BUILD_EXT_FLAGS is intentionally unquoted: it holds zero or more
# separate command-line flags.
python setup.py build_ext -q --inplace --debug $EXTRA_BUILD_EXT_FLAGS --verbose
# (note we don't install to avoid problems with extra long Windows paths
#  during distutils-dependent tests -- e.g. test_pycc)

# Install numba locally for use in `numba -s` sys info tool at test time
python -m pip install --no-deps -e .
#!/bin/bash
# Create and populate the conda environment used by the incremental CI build.
# Driven by environment variables: CONDA_ENV, PYTHON, NUMPY, and the optional
# switches CONDA_SUBDIR, BITS32, USE_C3I_TEST_CHANNEL, VANILLA_INSTALL,
# BUILD_DOC, RUN_COVERAGE, TEST_SVML, TEST_THREADING, TEST_PICKLE5 and
# RUN_TYPEGUARD.

set -v -e

# first configure conda to have more tolerance of network problems, these
# numbers are not scientifically chosen, just merely larger than defaults
conda config --write-default
conda config --set remote_connect_timeout_secs 30.15
conda config --set remote_max_retries 10
conda config --set remote_read_timeout_secs 120.2
conda config --set show_channel_urls true
if [[ $(uname) == Linux ]]; then
    if [[ "$CONDA_SUBDIR" != "linux-32" && "$BITS32" != "yes" ]] ; then
        conda config --set restore_free_channel true
    fi
fi
conda info
conda config --show

CONDA_INSTALL="conda install -q -y"
PIP_INSTALL="pip install -q"


EXTRA_CHANNELS=""
if [ "${USE_C3I_TEST_CHANNEL}" == "yes" ]; then
    EXTRA_CHANNELS="${EXTRA_CHANNELS} -c c3i_test"
fi


# Deactivate any environment
source deactivate
# Display root environment (for debugging)
conda list

# If VANILLA_INSTALL is yes, then only Python, NumPy and pip are installed, this
# is to catch tests/code paths that require an optional package and are not
# guarding against the possibility that it does not exist in the environment.
# Create a base env first and then add to it...
# NOTE: gitpython is needed for CI testing to do the test slicing
# NOTE: pyyaml is used to ensure that the Azure CI config is valid
# NOTE: 32 bit linux... do not install NumPy, there's no conda package for >1.15
# so it has to come from pip later
if [[ "$CONDA_SUBDIR" == "linux-32" || "$BITS32" == "yes" ]]; then
    conda create -n $CONDA_ENV -q -y ${EXTRA_CHANNELS} python=$PYTHON pip gitpython pyyaml
else
    conda create -n $CONDA_ENV -q -y ${EXTRA_CHANNELS} python=$PYTHON numpy=$NUMPY pip gitpython pyyaml
fi

# Activate first
set +v
source activate $CONDA_ENV
set -v

# Install optional packages into activated env
echo "PYTHON=$PYTHON"
echo "VANILLA_INSTALL=$VANILLA_INSTALL"
if [ "${VANILLA_INSTALL}" != "yes" ]; then
    # Scipy, CFFI, jinja2, IPython and pygments are optional
    # dependencies, but exercised in the test suite
    # pexpect is used to run the gdb tests.
    # ipykernel is used for testing ipython behaviours.
    $CONDA_INSTALL ${EXTRA_CHANNELS} cffi jinja2 ipython ipykernel pygments pexpect
    # Only install scipy on 64bit, else it'll pull in NumPy, 32bit linux needs
    # to get scipy from pip
    if [[ "$CONDA_SUBDIR" != "linux-32" && "$BITS32" != "yes" ]] ; then
        if [[ "$NUMPY" == "1.23" ]] ; then
            $CONDA_INSTALL ${EXTRA_CHANNELS} conda-forge::scipy
        else
            $CONDA_INSTALL ${EXTRA_CHANNELS} scipy
        fi
    fi
fi

# Install the compiler toolchain
if [[ $(uname) == Linux ]]; then
    if [[ "$CONDA_SUBDIR" == "linux-32" || "$BITS32" == "yes" ]] ; then
        $CONDA_INSTALL gcc_linux-32 gxx_linux-32
    else
        $CONDA_INSTALL gcc_linux-64 gxx_linux-64
    fi
elif  [[ $(uname) == Darwin ]]; then
    $CONDA_INSTALL clang_osx-64 clangxx_osx-64
    # Install llvm-openmp on OSX for headers during build and runtime during
    # testing
    $CONDA_INSTALL llvm-openmp
fi

# If on 32bit linux, now pip install NumPy (no conda package), SciPy is broken?!
if [[ "$CONDA_SUBDIR" == "linux-32" || "$BITS32" == "yes" ]] ; then
    $PIP_INSTALL numpy==$NUMPY
fi

# Install latest correct build
$CONDA_INSTALL -c numba/label/dev llvmlite=0.39

# Install importlib-metadata for Python < 3.9
# The interpreter of the freshly activated env (created above with
# python=$PYTHON) does the comparison numerically.  A shell string comparison
# such as [ $PYTHON \< "3.9" ] is lexicographic and would wrongly treat
# "3.10" and later as older than "3.9".
if python -c 'import sys; sys.exit(0 if sys.version_info < (3, 9) else 1)'; then
    $CONDA_INSTALL importlib_metadata
fi

# Install dependencies for building the documentation
if [ "$BUILD_DOC" == "yes" ]; then $CONDA_INSTALL sphinx=2.4.4 docutils=0.17 sphinx_rtd_theme pygments numpydoc; fi
if [ "$BUILD_DOC" == "yes" ]; then $PIP_INSTALL rstcheck; fi
# Install dependencies for code coverage (codecov.io)
if [ "$RUN_COVERAGE" == "yes" ]; then $PIP_INSTALL codecov; fi
# Install SVML
if [ "$TEST_SVML" == "yes" ]; then $CONDA_INSTALL -c numba icc_rt; fi
# Install Intel TBB parallel backend
if [ "$TEST_THREADING" == "tbb" ]; then $CONDA_INSTALL -c numba tbb=2021 "tbb-devel>=2021,<2021.6"; fi
# Install pickle5
if [ "$TEST_PICKLE5" == "yes" ]; then $PIP_INSTALL pickle5; fi
# Install typeguard
if [ "$RUN_TYPEGUARD" == "yes" ]; then $CONDA_INSTALL conda-forge::typeguard; fi

# environment dump for debug
# echo "DEBUG ENV:"
# echo "-------------------------------------------------------------------------"
# conda env export
# echo "-------------------------------------------------------------------------"
#!/bin/bash
# Run the documentation checks and the numba test suite inside the
# $CONDA_ENV conda environment.  Optional switches read from the environment:
# BUILD_DOC, TEST_THREADING, BITS32, TEST_NPROCS, TEST_START_INDEX,
# TEST_COUNT, RUN_COVERAGE and RUN_TYPEGUARD.

source activate $CONDA_ENV

# Make sure any error below is reported as such
set -v -e

# Ensure the README is correctly formatted
if [ "$BUILD_DOC" == "yes" ]; then rstcheck README.rst; fi
# Ensure that the documentation builds without warnings
pushd docs
if [ "$BUILD_DOC" == "yes" ]; then make SPHINXOPTS=-W clean html; fi
popd
# Run system info tool
pushd bin
numba -s
popd

# switch off color messages
export NUMBA_DISABLE_ERROR_MESSAGE_HIGHLIGHTING=1
# switch on developer mode
export NUMBA_DEVELOPER_MODE=1
# enable the fault handler
export PYTHONFAULTHANDLER=1

# enable new style error handling
export NUMBA_CAPTURED_ERRORS="new_style"

# Disable NumPy dispatching to AVX512_SKX feature extensions if the chip is
# reported to support the feature and NumPy >= 1.22 as this results in the use
# of low accuracy SVML libm replacements in ufunc loops.
_NPY_CMD='from numba.misc import numba_sysinfo;\
          sysinfo=numba_sysinfo.get_sysinfo();\
          print(sysinfo["NumPy AVX512_SKX detected"] and
                sysinfo["NumPy Version"]>="1.22")'
NUMPY_DETECTS_AVX512_SKX_NP_GT_122=$(python -c "$_NPY_CMD")
echo "NumPy >= 1.22 with AVX512_SKX detected: $NUMPY_DETECTS_AVX512_SKX_NP_GT_122"

if [[ "$NUMPY_DETECTS_AVX512_SKX_NP_GT_122" == "True" ]]; then
    export NPY_DISABLE_CPU_FEATURES="AVX512_SKX"
fi

# deal with threading layers
# ${TEST_THREADING+x} expands to "x" only when the variable is set (even if
# empty), so a set-but-invalid value is rejected below rather than ignored.
if [ -z "${TEST_THREADING+x}" ]; then
    echo "INFO: Threading layer not explicitly set."
else
    case "${TEST_THREADING}" in "workqueue"|"omp"|"tbb")
        export NUMBA_THREADING_LAYER="$TEST_THREADING"
        echo "INFO: Threading layer set as: $TEST_THREADING"
        ;;
    *)
        echo "INFO: Threading layer explicitly set to bad value: $TEST_THREADING."
        exit 1
        ;;
    esac
fi

# If TEST_THREADING is set in the env, then check that Numba agrees that the
# environment can support the requested threading.
# $1 is the layer name as it appears in the sysinfo key "<name> Threading".
function check_sysinfo() {
    cmd="import os;\
         from numba.misc.numba_sysinfo import get_sysinfo;\
         assert get_sysinfo()['$1 Threading'] is True, 'Threading layer $1 '\
         'is not supported';\
         print('Threading layer $1 is supported')"
    python -c "$cmd"
}

if [[ "$TEST_THREADING" ]]; then
    if [[ "$TEST_THREADING" == "tbb" ]]; then
        check_sysinfo "TBB"
    elif [[ "$TEST_THREADING" == "omp" ]]; then
        check_sysinfo "OpenMP"
    elif [[ "$TEST_THREADING" == "workqueue" ]]; then
        check_sysinfo "Workqueue"
    else
        echo "Unknown threading layer requested: $TEST_THREADING"
        exit 1
    fi
fi

# Find catchsegv
# SEGVCATCH is prefixed to the test commands below; an empty value means the
# tests run without a wrapper.
SEGVCATCH=""
unamestr="$(uname)"
if [[ "$unamestr" == 'Linux' ]]; then
    if [[ "${BITS32}" != "yes" ]]; then
        # catchsegv was removed from glibc in 2.35, so only use it when it is
        # actually installed; otherwise every test command below would fail
        # with "command not found".
        if command -v catchsegv >/dev/null 2>&1; then
            SEGVCATCH=catchsegv
        else
            echo "INFO: catchsegv not found, running tests without it"
        fi
    fi
elif [[ "$unamestr" == 'Darwin' ]]; then
    SEGVCATCH=""
else
    echo Error
fi

# limit CPUs in use on PPC64LE, fork() issues
# occur on high core count systems
archstr="$(uname -m)"
if [[ "$archstr" == 'ppc64le' ]]; then
    TEST_NPROCS=16
fi

# setup SDKROOT on Mac
if [[ "$unamestr" == "Darwin" ]]; then
    # Assign first, export second, so a failing command is not hidden from
    # `set -e`.
    SDKROOT="$(pwd)/MacOSX10.10.sdk"
    export SDKROOT
fi

# First check that the test discovery works
python -m numba.tests.test_runtests

# Now run tests based on the changes identified via git
# ($SEGVCATCH and $TEST_NPROCS are intentionally unquoted: each may be empty
# and must then vanish from the command line.)
NUMBA_ENABLE_CUDASIM=1 $SEGVCATCH python -m numba.runtests -b -v -g -m $TEST_NPROCS -- numba.tests

# List the tests found
echo "INFO: All discovered tests:"
python -m numba.runtests -l

# Now run the Numba test suite with slicing
# Note that coverage is run from the checkout dir to match the "source"
# directive in .coveragerc
echo "INFO: Running slice of discovered tests: ($TEST_START_INDEX,None,$TEST_COUNT)"
if [ "$RUN_COVERAGE" == "yes" ]; then
    export PYTHONPATH=.
    coverage erase
    $SEGVCATCH coverage run runtests.py -b -j "$TEST_START_INDEX,None,$TEST_COUNT" --exclude-tags='long_running' -m $TEST_NPROCS -- numba.tests
elif [ "$RUN_TYPEGUARD" == "yes" ]; then
    echo "INFO: Running with typeguard"
    NUMBA_USE_TYPEGUARD=1 NUMBA_ENABLE_CUDASIM=1 PYTHONWARNINGS="ignore:::typeguard" $SEGVCATCH python runtests.py -b -j "$TEST_START_INDEX,None,$TEST_COUNT" --exclude-tags='long_running' -m $TEST_NPROCS -- numba.tests
else
    NUMBA_ENABLE_CUDASIM=1 $SEGVCATCH python -m numba.runtests -b -j "$TEST_START_INDEX,None,$TEST_COUNT" --exclude-tags='long_running' -m $TEST_NPROCS -- numba.tests
fi
0000000000000000000000000000000000000000..c293219ba7df26774ef2db93da1384a4c533ad3e --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/codecov.yml @@ -0,0 +1,22 @@ +# Configuration for codecov.io +# When editing this file, please validate its contents using: +# curl -X POST --data-binary @- https://codecov.io/validate < codecov.yml + +comment: + layout: "header, diff, changes, uncovered" + +coverage: + ignore: + - "numba/cuda/.*" + - "numba/hsa/.*" + + status: + project: + default: + # The build fails if total project coverage drops by more than 3% + target: auto + threshold: "3%" + # These checks can mark a build failed if too much new code + # is not covered (which happens often with JITted functions). + changes: false + patch: false diff --git a/cv/3d_detection/centerpoint/pytorch/numba/compile.log b/cv/3d_detection/centerpoint/pytorch/numba/compile.log new file mode 100644 index 0000000000000000000000000000000000000000..960598af76f8fb1b4f10396914e19af9d6803083 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/compile.log @@ -0,0 +1,2493 @@ +TBB not found +Using OpenMP from: True +running bdist_wheel +running build +got version from VCS {'version': 'daily_2024-01-04_1035', 'full': '263889a8357fea48211481210dc524080b04ae6a'} +running build_py +creating build +creating build/lib.linux-x86_64-3.7 +creating build/lib.linux-x86_64-3.7/numba +copying numba/__init__.py -> build/lib.linux-x86_64-3.7/numba +copying numba/__main__.py -> build/lib.linux-x86_64-3.7/numba +copying numba/_version.py -> build/lib.linux-x86_64-3.7/numba +copying numba/extending.py -> build/lib.linux-x86_64-3.7/numba +copying numba/runtests.py -> build/lib.linux-x86_64-3.7/numba +creating build/lib.linux-x86_64-3.7/numba/cext +copying numba/cext/__init__.py -> build/lib.linux-x86_64-3.7/numba/cext +creating build/lib.linux-x86_64-3.7/numba/cloudpickle +copying numba/cloudpickle/__init__.py -> build/lib.linux-x86_64-3.7/numba/cloudpickle +copying 
numba/cloudpickle/cloudpickle.py -> build/lib.linux-x86_64-3.7/numba/cloudpickle +copying numba/cloudpickle/cloudpickle_fast.py -> build/lib.linux-x86_64-3.7/numba/cloudpickle +copying numba/cloudpickle/compat.py -> build/lib.linux-x86_64-3.7/numba/cloudpickle +creating build/lib.linux-x86_64-3.7/numba/core +copying numba/core/__init__.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/analysis.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/base.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/boxing.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/bytecode.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/byteflow.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/caching.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/callconv.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/callwrapper.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/ccallback.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/cgutils.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/codegen.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/compiler.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/compiler_lock.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/compiler_machinery.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/config.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/consts.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/controlflow.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/cpu.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/cpu_options.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/dataflow.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/debuginfo.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/decorators.py -> build/lib.linux-x86_64-3.7/numba/core 
+copying numba/core/descriptors.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/dispatcher.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/entrypoints.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/environment.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/errors.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/event.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/extending.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/externals.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/fastmathpass.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/funcdesc.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/generators.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/imputils.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/inline_closurecall.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/interpreter.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/intrinsics.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/ir.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/ir_utils.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/itanium_mangler.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/llvm_bindings.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/lowering.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/object_mode_passes.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/optional.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/options.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/overload_glue.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/postproc.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/pylowering.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/pythonapi.py -> 
build/lib.linux-x86_64-3.7/numba/core +copying numba/core/registry.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/removerefctpass.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/retarget.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/serialize.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/sigutils.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/ssa.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/target_extension.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/targetconfig.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/tracing.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/transforms.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/typed_passes.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/typeinfer.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/untyped_passes.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/utils.py -> build/lib.linux-x86_64-3.7/numba/core +copying numba/core/withcontexts.py -> build/lib.linux-x86_64-3.7/numba/core +creating build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/__init__.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/builtins.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/charseq.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/cmathimpl.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/enumimpl.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/hashing.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/heapq.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/iterators.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/listobj.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/mathimpl.py -> build/lib.linux-x86_64-3.7/numba/cpython 
+copying numba/cpython/numbers.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/printimpl.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/randomimpl.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/rangeobj.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/setobj.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/slicing.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/tupleobj.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/unicode.py -> build/lib.linux-x86_64-3.7/numba/cpython +copying numba/cpython/unicode_support.py -> build/lib.linux-x86_64-3.7/numba/cpython +creating build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/__init__.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/api.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/api_util.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/args.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/codegen.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/compiler.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/cuda_paths.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/cudadecl.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/cudaimpl.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/cudamath.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/decorators.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/descriptor.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/device_init.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/dispatcher.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/errors.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/initialize.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/intrinsic_wrapper.py -> 
build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/libdevice.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/libdevicedecl.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/libdevicefuncs.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/libdeviceimpl.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/mathimpl.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/models.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/nvvmutils.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/printimpl.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/random.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/simulator_init.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/stubs.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/target.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/testing.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/types.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/vector_types.py -> build/lib.linux-x86_64-3.7/numba/cuda +copying numba/cuda/vectorizers.py -> build/lib.linux-x86_64-3.7/numba/cuda +creating build/lib.linux-x86_64-3.7/numba/experimental +copying numba/experimental/__init__.py -> build/lib.linux-x86_64-3.7/numba/experimental +copying numba/experimental/function_type.py -> build/lib.linux-x86_64-3.7/numba/experimental +copying numba/experimental/structref.py -> build/lib.linux-x86_64-3.7/numba/experimental +creating build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/__init__.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/appdirs.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/cffiimpl.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/dummyarray.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/dump_style.py -> build/lib.linux-x86_64-3.7/numba/misc +copying 
numba/misc/findlib.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/firstlinefinder.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/gdb_hook.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/gdb_print_extension.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/init_utils.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/inspection.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/literal.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/llvm_pass_timings.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/mergesort.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/numba_entry.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/numba_gdbinfo.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/numba_sysinfo.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/quicksort.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/special.py -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/misc/timsort.py -> build/lib.linux-x86_64-3.7/numba/misc +creating build/lib.linux-x86_64-3.7/numba/np +copying numba/np/__init__.py -> build/lib.linux-x86_64-3.7/numba/np +copying numba/np/arraymath.py -> build/lib.linux-x86_64-3.7/numba/np +copying numba/np/arrayobj.py -> build/lib.linux-x86_64-3.7/numba/np +copying numba/np/extensions.py -> build/lib.linux-x86_64-3.7/numba/np +copying numba/np/linalg.py -> build/lib.linux-x86_64-3.7/numba/np +copying numba/np/npdatetime.py -> build/lib.linux-x86_64-3.7/numba/np +copying numba/np/npdatetime_helpers.py -> build/lib.linux-x86_64-3.7/numba/np +copying numba/np/npyfuncs.py -> build/lib.linux-x86_64-3.7/numba/np +copying numba/np/npyimpl.py -> build/lib.linux-x86_64-3.7/numba/np +copying numba/np/numpy_support.py -> build/lib.linux-x86_64-3.7/numba/np +copying numba/np/polynomial.py -> build/lib.linux-x86_64-3.7/numba/np +copying numba/np/ufunc_db.py -> 
build/lib.linux-x86_64-3.7/numba/np +creating build/lib.linux-x86_64-3.7/numba/parfors +copying numba/parfors/__init__.py -> build/lib.linux-x86_64-3.7/numba/parfors +copying numba/parfors/array_analysis.py -> build/lib.linux-x86_64-3.7/numba/parfors +copying numba/parfors/parfor.py -> build/lib.linux-x86_64-3.7/numba/parfors +copying numba/parfors/parfor_lowering.py -> build/lib.linux-x86_64-3.7/numba/parfors +copying numba/parfors/parfor_lowering_utils.py -> build/lib.linux-x86_64-3.7/numba/parfors +creating build/lib.linux-x86_64-3.7/numba/pycc +copying numba/pycc/__init__.py -> build/lib.linux-x86_64-3.7/numba/pycc +copying numba/pycc/cc.py -> build/lib.linux-x86_64-3.7/numba/pycc +copying numba/pycc/compiler.py -> build/lib.linux-x86_64-3.7/numba/pycc +copying numba/pycc/decorators.py -> build/lib.linux-x86_64-3.7/numba/pycc +copying numba/pycc/llvm_types.py -> build/lib.linux-x86_64-3.7/numba/pycc +copying numba/pycc/platform.py -> build/lib.linux-x86_64-3.7/numba/pycc +creating build/lib.linux-x86_64-3.7/numba/scripts +copying numba/scripts/__init__.py -> build/lib.linux-x86_64-3.7/numba/scripts +copying numba/scripts/generate_lower_listing.py -> build/lib.linux-x86_64-3.7/numba/scripts +creating build/lib.linux-x86_64-3.7/numba/stencils +copying numba/stencils/__init__.py -> build/lib.linux-x86_64-3.7/numba/stencils +copying numba/stencils/stencil.py -> build/lib.linux-x86_64-3.7/numba/stencils +copying numba/stencils/stencilparfor.py -> build/lib.linux-x86_64-3.7/numba/stencils +creating build/lib.linux-x86_64-3.7/numba/testing +copying numba/testing/__init__.py -> build/lib.linux-x86_64-3.7/numba/testing +copying numba/testing/__main__.py -> build/lib.linux-x86_64-3.7/numba/testing +copying numba/testing/_runtests.py -> build/lib.linux-x86_64-3.7/numba/testing +copying numba/testing/loader.py -> build/lib.linux-x86_64-3.7/numba/testing +copying numba/testing/main.py -> build/lib.linux-x86_64-3.7/numba/testing +copying numba/testing/notebook.py -> 
build/lib.linux-x86_64-3.7/numba/testing +creating build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/__init__.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/annotation_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/cache_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/cffi_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/cfunc_cache_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/cloudpickle_main_class.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/compile_with_pycc.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/complex_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/ctypes_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/dummy_module.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/enum_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/error_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/gdb_support.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/inlining_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/matmul_usecase.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/orphaned_semaphore_usecase.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/overload_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/parfors_cache_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/parfors_max_label_error.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/pdlike_usecase.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/recursion_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/serialize_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/support.py -> 
build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_alignment.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_analysis.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_annotations.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_api.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_array_analysis.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_array_attr.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_array_constants.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_array_exprs.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_array_iterators.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_array_manipulation.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_array_methods.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_array_reductions.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_array_return.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_asnumbatype.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_auto_constants.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_blackscholes.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_boundscheck.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_buffer_protocol.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_builtins.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_byteflow.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_caching.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_casting.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_cffi.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_cfunc.py -> 
build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_cgutils.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_chained_assign.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_chrome_trace.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_cli.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_closure.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_codegen.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_compile_cache.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_compiler_flags.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_compiler_lock.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_complex.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_comprehension.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_conditions_as_predicates.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_config.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_conversion.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_copy_propagate.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_ctypes.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_dataflow.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_datamodel.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_debug.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_debuginfo.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_deprecations.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_dictimpl.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_dictobject.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_dicts.py -> 
build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_dispatcher.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_dummyarray.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_dyn_array.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_dyn_func.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_entrypoints.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_enums.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_errorhandling.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_errormodels.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_event.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_exceptions.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_extended_arg.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_extending.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_extending_types.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_fancy_indexing.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_fastmath.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_firstlinefinder.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_flow_control.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_func_interface.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_func_lifetime.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_funcdesc.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_function_type.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_gdb_bindings.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_gdb_dwarf.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_generators.py -> 
build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_gil.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_globals.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_hashing.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_heapq.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_help.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_import.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_indexing.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_init_utils.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_inlining.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_interpreter.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_interproc.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_intwidth.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_ir.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_ir_inlining.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_ir_utils.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_itanium_mangler.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_iteration.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_jit_module.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_jitclasses.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_jitmethod.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_linalg.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_listimpl.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_listobject.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_lists.py -> build/lib.linux-x86_64-3.7/numba/tests +copying 
numba/tests/test_literal_dispatch.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_llvm_pass_timings.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_llvm_version_check.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_locals.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_looplifting.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_make_function_to_jit_function.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_mandelbrot.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_mangling.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_map_filter_reduce.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_mathlib.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_maxmin.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_mixed_tuple_unroller.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_moved_modules.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_multi3.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_nan.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_ndarray_subclasses.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_nested_calls.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_np_functions.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_np_randomgen.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_npdatetime.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_nrt.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_nrt_refct.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_num_threads.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_numberctor.py -> 
build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_numbers.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_numconv.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_numpy_support.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_numpyadapt.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_obj_lifetime.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_object_mode.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_objects.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_operators.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_optional.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_overlap.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_parallel_backend.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_parfors.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_parfors_caching.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_parfors_passes.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_pipeline.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_polynomial.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_practical_lowering_issues.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_print.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_profiler.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_pycc.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_python_int.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_random.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_range.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_recarray_usecases.py -> 
build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_record_dtype.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_recursion.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_refop_pruning.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_remove_dead.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_retargeting.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_return_values.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_runtests.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_serialize.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_sets.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_slices.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_sort.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_ssa.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_stencils.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_storeslice.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_struct_ref.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_support.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_svml.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_sys_stdin_assignment.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_sysinfo.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_target_extension.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_target_overloadselector.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_threadsafety.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_tracing.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_try_except.py -> 
build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_tuples.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_typeconv.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_typedlist.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_typedobjectutils.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_typeguard.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_typeinfer.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_typenames.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_typeof.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_types.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_typingerror.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_ufuncs.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_unicode.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_unicode_array.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_unicode_names.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_unpack_sequence.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_unpickle_without_module.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_unsafe_intrinsics.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_vectorization.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_vectorization_type_inference.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_warnings.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_withlifting.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/test_wrapper.py -> build/lib.linux-x86_64-3.7/numba/tests +copying 
numba/tests/threading_backend_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +copying numba/tests/usecases.py -> build/lib.linux-x86_64-3.7/numba/tests +creating build/lib.linux-x86_64-3.7/numba/typed +copying numba/typed/__init__.py -> build/lib.linux-x86_64-3.7/numba/typed +copying numba/typed/dictimpl.py -> build/lib.linux-x86_64-3.7/numba/typed +copying numba/typed/dictobject.py -> build/lib.linux-x86_64-3.7/numba/typed +copying numba/typed/listobject.py -> build/lib.linux-x86_64-3.7/numba/typed +copying numba/typed/typeddict.py -> build/lib.linux-x86_64-3.7/numba/typed +copying numba/typed/typedlist.py -> build/lib.linux-x86_64-3.7/numba/typed +copying numba/typed/typedobjectutils.py -> build/lib.linux-x86_64-3.7/numba/typed +creating build/lib.linux-x86_64-3.7/numba/types +copying numba/types/__init__.py -> build/lib.linux-x86_64-3.7/numba/types +creating build/lib.linux-x86_64-3.7/numba/core/annotations +copying numba/core/annotations/__init__.py -> build/lib.linux-x86_64-3.7/numba/core/annotations +copying numba/core/annotations/pretty_annotate.py -> build/lib.linux-x86_64-3.7/numba/core/annotations +copying numba/core/annotations/type_annotations.py -> build/lib.linux-x86_64-3.7/numba/core/annotations +creating build/lib.linux-x86_64-3.7/numba/core/datamodel +copying numba/core/datamodel/__init__.py -> build/lib.linux-x86_64-3.7/numba/core/datamodel +copying numba/core/datamodel/manager.py -> build/lib.linux-x86_64-3.7/numba/core/datamodel +copying numba/core/datamodel/models.py -> build/lib.linux-x86_64-3.7/numba/core/datamodel +copying numba/core/datamodel/packer.py -> build/lib.linux-x86_64-3.7/numba/core/datamodel +copying numba/core/datamodel/registry.py -> build/lib.linux-x86_64-3.7/numba/core/datamodel +copying numba/core/datamodel/testing.py -> build/lib.linux-x86_64-3.7/numba/core/datamodel +creating build/lib.linux-x86_64-3.7/numba/core/rewrites +copying numba/core/rewrites/__init__.py -> build/lib.linux-x86_64-3.7/numba/core/rewrites 
+copying numba/core/rewrites/ir_print.py -> build/lib.linux-x86_64-3.7/numba/core/rewrites +copying numba/core/rewrites/registry.py -> build/lib.linux-x86_64-3.7/numba/core/rewrites +copying numba/core/rewrites/static_binop.py -> build/lib.linux-x86_64-3.7/numba/core/rewrites +copying numba/core/rewrites/static_getitem.py -> build/lib.linux-x86_64-3.7/numba/core/rewrites +copying numba/core/rewrites/static_raise.py -> build/lib.linux-x86_64-3.7/numba/core/rewrites +creating build/lib.linux-x86_64-3.7/numba/core/runtime +copying numba/core/runtime/__init__.py -> build/lib.linux-x86_64-3.7/numba/core/runtime +copying numba/core/runtime/context.py -> build/lib.linux-x86_64-3.7/numba/core/runtime +copying numba/core/runtime/nrt.py -> build/lib.linux-x86_64-3.7/numba/core/runtime +copying numba/core/runtime/nrtdynmod.py -> build/lib.linux-x86_64-3.7/numba/core/runtime +copying numba/core/runtime/nrtopt.py -> build/lib.linux-x86_64-3.7/numba/core/runtime +creating build/lib.linux-x86_64-3.7/numba/core/typeconv +copying numba/core/typeconv/__init__.py -> build/lib.linux-x86_64-3.7/numba/core/typeconv +copying numba/core/typeconv/castgraph.py -> build/lib.linux-x86_64-3.7/numba/core/typeconv +copying numba/core/typeconv/rules.py -> build/lib.linux-x86_64-3.7/numba/core/typeconv +copying numba/core/typeconv/typeconv.py -> build/lib.linux-x86_64-3.7/numba/core/typeconv +creating build/lib.linux-x86_64-3.7/numba/core/types +copying numba/core/types/__init__.py -> build/lib.linux-x86_64-3.7/numba/core/types +copying numba/core/types/abstract.py -> build/lib.linux-x86_64-3.7/numba/core/types +copying numba/core/types/common.py -> build/lib.linux-x86_64-3.7/numba/core/types +copying numba/core/types/containers.py -> build/lib.linux-x86_64-3.7/numba/core/types +copying numba/core/types/function_type.py -> build/lib.linux-x86_64-3.7/numba/core/types +copying numba/core/types/functions.py -> build/lib.linux-x86_64-3.7/numba/core/types +copying numba/core/types/iterators.py -> 
build/lib.linux-x86_64-3.7/numba/core/types +copying numba/core/types/misc.py -> build/lib.linux-x86_64-3.7/numba/core/types +copying numba/core/types/npytypes.py -> build/lib.linux-x86_64-3.7/numba/core/types +copying numba/core/types/scalars.py -> build/lib.linux-x86_64-3.7/numba/core/types +creating build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/__init__.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/arraydecl.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/asnumbatype.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/bufproto.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/builtins.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/cffi_utils.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/cmathdecl.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/collections.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/context.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/ctypes_utils.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/dictdecl.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/enumdecl.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/listdecl.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/mathdecl.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/npdatetime.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/npydecl.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/randomdecl.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/setdecl.py -> build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/templates.py -> 
build/lib.linux-x86_64-3.7/numba/core/typing +copying numba/core/typing/typeof.py -> build/lib.linux-x86_64-3.7/numba/core/typing +creating build/lib.linux-x86_64-3.7/numba/core/unsafe +copying numba/core/unsafe/__init__.py -> build/lib.linux-x86_64-3.7/numba/core/unsafe +copying numba/core/unsafe/bytes.py -> build/lib.linux-x86_64-3.7/numba/core/unsafe +copying numba/core/unsafe/eh.py -> build/lib.linux-x86_64-3.7/numba/core/unsafe +copying numba/core/unsafe/nrt.py -> build/lib.linux-x86_64-3.7/numba/core/unsafe +copying numba/core/unsafe/refcount.py -> build/lib.linux-x86_64-3.7/numba/core/unsafe +creating build/lib.linux-x86_64-3.7/numba/cpython/unsafe +copying numba/cpython/unsafe/__init__.py -> build/lib.linux-x86_64-3.7/numba/cpython/unsafe +copying numba/cpython/unsafe/numbers.py -> build/lib.linux-x86_64-3.7/numba/cpython/unsafe +copying numba/cpython/unsafe/tuple.py -> build/lib.linux-x86_64-3.7/numba/cpython/unsafe +creating build/lib.linux-x86_64-3.7/numba/cuda/cudadrv +copying numba/cuda/cudadrv/__init__.py -> build/lib.linux-x86_64-3.7/numba/cuda/cudadrv +copying numba/cuda/cudadrv/devicearray.py -> build/lib.linux-x86_64-3.7/numba/cuda/cudadrv +copying numba/cuda/cudadrv/devices.py -> build/lib.linux-x86_64-3.7/numba/cuda/cudadrv +copying numba/cuda/cudadrv/driver.py -> build/lib.linux-x86_64-3.7/numba/cuda/cudadrv +copying numba/cuda/cudadrv/drvapi.py -> build/lib.linux-x86_64-3.7/numba/cuda/cudadrv +copying numba/cuda/cudadrv/enums.py -> build/lib.linux-x86_64-3.7/numba/cuda/cudadrv +copying numba/cuda/cudadrv/error.py -> build/lib.linux-x86_64-3.7/numba/cuda/cudadrv +copying numba/cuda/cudadrv/libs.py -> build/lib.linux-x86_64-3.7/numba/cuda/cudadrv +copying numba/cuda/cudadrv/ndarray.py -> build/lib.linux-x86_64-3.7/numba/cuda/cudadrv +copying numba/cuda/cudadrv/nvvm.py -> build/lib.linux-x86_64-3.7/numba/cuda/cudadrv +copying numba/cuda/cudadrv/rtapi.py -> build/lib.linux-x86_64-3.7/numba/cuda/cudadrv +copying numba/cuda/cudadrv/runtime.py -> 
build/lib.linux-x86_64-3.7/numba/cuda/cudadrv +creating build/lib.linux-x86_64-3.7/numba/cuda/kernels +copying numba/cuda/kernels/__init__.py -> build/lib.linux-x86_64-3.7/numba/cuda/kernels +copying numba/cuda/kernels/reduction.py -> build/lib.linux-x86_64-3.7/numba/cuda/kernels +copying numba/cuda/kernels/transpose.py -> build/lib.linux-x86_64-3.7/numba/cuda/kernels +creating build/lib.linux-x86_64-3.7/numba/cuda/simulator +copying numba/cuda/simulator/__init__.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator +copying numba/cuda/simulator/api.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator +copying numba/cuda/simulator/compiler.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator +copying numba/cuda/simulator/kernel.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator +copying numba/cuda/simulator/kernelapi.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator +copying numba/cuda/simulator/reduction.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator +copying numba/cuda/simulator/vector_types.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator +creating build/lib.linux-x86_64-3.7/numba/cuda/tests +copying numba/cuda/tests/__init__.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests +creating build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv +copying numba/cuda/simulator/cudadrv/__init__.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv +copying numba/cuda/simulator/cudadrv/devicearray.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv +copying numba/cuda/simulator/cudadrv/devices.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv +copying numba/cuda/simulator/cudadrv/driver.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv +copying numba/cuda/simulator/cudadrv/drvapi.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv +copying numba/cuda/simulator/cudadrv/error.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv +copying numba/cuda/simulator/cudadrv/libs.py -> 
build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv +copying numba/cuda/simulator/cudadrv/nvvm.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv +copying numba/cuda/simulator/cudadrv/runtime.py -> build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv +creating build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/__init__.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_array_attr.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_context_stack.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_cuda_array_slicing.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_cuda_auto_context.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_cuda_devicerecord.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_cuda_driver.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_cuda_libraries.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_cuda_memory.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_cuda_ndarray.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_deallocations.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_detect.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_emm_plugins.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_events.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_host_alloc.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying 
numba/cuda/tests/cudadrv/test_init.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_inline_ptx.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_ir_patch.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_linker.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_managed_alloc.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_nvvm_driver.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_pinned.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_profiler.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_ptds.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_reset_device.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_runtime.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_select_device.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +copying numba/cuda/tests/cudadrv/test_streams.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv +creating build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/__init__.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/cache_usecases.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/cache_with_cpu_usecases.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/extensions_usecases.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/recursion_usecases.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying 
numba/cuda/tests/cudapy/test_alignment.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_array.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_array_args.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_array_methods.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_atomics.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_blackscholes.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_boolean.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_caching.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_casting.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_compiler.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_complex.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_complex_kernel.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_const_string.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_constmem.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_cooperative_groups.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_cuda_array_interface.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_cuda_jit_no_types.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_datetime.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_debug.py -> 
build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_debuginfo.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_device_func.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_dispatcher.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_enums.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_errors.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_exception.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_extending.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_fastmath.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_forall.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_freevar.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_frexp_ldexp.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_globals.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_gufunc.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_gufunc_scalar.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_gufunc_scheduling.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_idiv.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_inspect.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_intrinsics.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_ipc.py -> 
build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_iterators.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_lang.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_laplace.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_libdevice.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_lineinfo.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_localmem.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_mandel.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_math.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_matmul.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_minmax.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_montecarlo.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_multigpu.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_multiprocessing.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_multithreads.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_nondet.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_operator.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_optimization.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_overload.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_powi.py -> 
build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_print.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_py2_div_issue.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_random.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_record_dtype.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_recursion.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_reduction.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_serialize.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_slicing.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_sm.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_sm_creation.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_sync.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_transpose.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_userexc.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_vector_type.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_vectorize.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_vectorize_complex.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_vectorize_decor.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying 
numba/cuda/tests/cudapy/test_vectorize_device.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_warning.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +copying numba/cuda/tests/cudapy/test_warp_ops.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy +creating build/lib.linux-x86_64-3.7/numba/cuda/tests/cudasim +copying numba/cuda/tests/cudasim/__init__.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudasim +copying numba/cuda/tests/cudasim/support.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudasim +copying numba/cuda/tests/cudasim/test_cudasim_issues.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudasim +creating build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples +copying numba/cuda/tests/doc_examples/__init__.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples +copying numba/cuda/tests/doc_examples/test_cg.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples +copying numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples +copying numba/cuda/tests/doc_examples/test_ffi.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples +copying numba/cuda/tests/doc_examples/test_laplace.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples +copying numba/cuda/tests/doc_examples/test_matmul.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples +copying numba/cuda/tests/doc_examples/test_montecarlo.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples +copying numba/cuda/tests/doc_examples/test_random.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples +copying numba/cuda/tests/doc_examples/test_reduction.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples +copying numba/cuda/tests/doc_examples/test_sessionize.py -> 
build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples +copying numba/cuda/tests/doc_examples/test_vecadd.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples +creating build/lib.linux-x86_64-3.7/numba/cuda/tests/nocuda +copying numba/cuda/tests/nocuda/__init__.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/nocuda +copying numba/cuda/tests/nocuda/test_import.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/nocuda +copying numba/cuda/tests/nocuda/test_library_lookup.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/nocuda +copying numba/cuda/tests/nocuda/test_nvvm.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/nocuda +creating build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/data +copying numba/cuda/tests/cudadrv/data/__init__.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/data +creating build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/ffi +copying numba/cuda/tests/doc_examples/ffi/__init__.py -> build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/ffi +creating build/lib.linux-x86_64-3.7/numba/experimental/jitclass +copying numba/experimental/jitclass/__init__.py -> build/lib.linux-x86_64-3.7/numba/experimental/jitclass +copying numba/experimental/jitclass/base.py -> build/lib.linux-x86_64-3.7/numba/experimental/jitclass +copying numba/experimental/jitclass/boxing.py -> build/lib.linux-x86_64-3.7/numba/experimental/jitclass +copying numba/experimental/jitclass/decorators.py -> build/lib.linux-x86_64-3.7/numba/experimental/jitclass +copying numba/experimental/jitclass/overloads.py -> build/lib.linux-x86_64-3.7/numba/experimental/jitclass +creating build/lib.linux-x86_64-3.7/numba/misc/help +copying numba/misc/help/__init__.py -> build/lib.linux-x86_64-3.7/numba/misc/help +copying numba/misc/help/inspector.py -> build/lib.linux-x86_64-3.7/numba/misc/help +creating build/lib.linux-x86_64-3.7/numba/np/random +copying numba/np/random/__init__.py -> build/lib.linux-x86_64-3.7/numba/np/random +copying 
numba/np/random/generator_core.py -> build/lib.linux-x86_64-3.7/numba/np/random +copying numba/np/random/generator_methods.py -> build/lib.linux-x86_64-3.7/numba/np/random +creating build/lib.linux-x86_64-3.7/numba/np/ufunc +copying numba/np/ufunc/__init__.py -> build/lib.linux-x86_64-3.7/numba/np/ufunc +copying numba/np/ufunc/array_exprs.py -> build/lib.linux-x86_64-3.7/numba/np/ufunc +copying numba/np/ufunc/decorators.py -> build/lib.linux-x86_64-3.7/numba/np/ufunc +copying numba/np/ufunc/deviceufunc.py -> build/lib.linux-x86_64-3.7/numba/np/ufunc +copying numba/np/ufunc/dufunc.py -> build/lib.linux-x86_64-3.7/numba/np/ufunc +copying numba/np/ufunc/gufunc.py -> build/lib.linux-x86_64-3.7/numba/np/ufunc +copying numba/np/ufunc/parallel.py -> build/lib.linux-x86_64-3.7/numba/np/ufunc +copying numba/np/ufunc/sigparse.py -> build/lib.linux-x86_64-3.7/numba/np/ufunc +copying numba/np/ufunc/ufuncbuilder.py -> build/lib.linux-x86_64-3.7/numba/np/ufunc +copying numba/np/ufunc/wrappers.py -> build/lib.linux-x86_64-3.7/numba/np/ufunc +creating build/lib.linux-x86_64-3.7/numba/np/unsafe +copying numba/np/unsafe/__init__.py -> build/lib.linux-x86_64-3.7/numba/np/unsafe +copying numba/np/unsafe/ndarray.py -> build/lib.linux-x86_64-3.7/numba/np/unsafe +creating build/lib.linux-x86_64-3.7/numba/tests/doc_examples +copying numba/tests/doc_examples/__init__.py -> build/lib.linux-x86_64-3.7/numba/tests/doc_examples +copying numba/tests/doc_examples/test_examples.py -> build/lib.linux-x86_64-3.7/numba/tests/doc_examples +copying numba/tests/doc_examples/test_jitclass.py -> build/lib.linux-x86_64-3.7/numba/tests/doc_examples +copying numba/tests/doc_examples/test_literal_container_usage.py -> build/lib.linux-x86_64-3.7/numba/tests/doc_examples +copying numba/tests/doc_examples/test_literally_usage.py -> build/lib.linux-x86_64-3.7/numba/tests/doc_examples +copying numba/tests/doc_examples/test_llvm_pass_timings.py -> build/lib.linux-x86_64-3.7/numba/tests/doc_examples +copying 
numba/tests/doc_examples/test_numpy_generators.py -> build/lib.linux-x86_64-3.7/numba/tests/doc_examples +copying numba/tests/doc_examples/test_parallel_chunksize.py -> build/lib.linux-x86_64-3.7/numba/tests/doc_examples +copying numba/tests/doc_examples/test_rec_array.py -> build/lib.linux-x86_64-3.7/numba/tests/doc_examples +copying numba/tests/doc_examples/test_structref_usage.py -> build/lib.linux-x86_64-3.7/numba/tests/doc_examples +copying numba/tests/doc_examples/test_typed_dict_usage.py -> build/lib.linux-x86_64-3.7/numba/tests/doc_examples +copying numba/tests/doc_examples/test_typed_list_usage.py -> build/lib.linux-x86_64-3.7/numba/tests/doc_examples +creating build/lib.linux-x86_64-3.7/numba/tests/gdb +copying numba/tests/gdb/__init__.py -> build/lib.linux-x86_64-3.7/numba/tests/gdb +copying numba/tests/gdb/test_array_arg.py -> build/lib.linux-x86_64-3.7/numba/tests/gdb +copying numba/tests/gdb/test_basic.py -> build/lib.linux-x86_64-3.7/numba/tests/gdb +copying numba/tests/gdb/test_break_on_symbol.py -> build/lib.linux-x86_64-3.7/numba/tests/gdb +copying numba/tests/gdb/test_break_on_symbol_version.py -> build/lib.linux-x86_64-3.7/numba/tests/gdb +copying numba/tests/gdb/test_conditional_breakpoint.py -> build/lib.linux-x86_64-3.7/numba/tests/gdb +copying numba/tests/gdb/test_pretty_print.py -> build/lib.linux-x86_64-3.7/numba/tests/gdb +creating build/lib.linux-x86_64-3.7/numba/tests/npyufunc +copying numba/tests/npyufunc/__init__.py -> build/lib.linux-x86_64-3.7/numba/tests/npyufunc +copying numba/tests/npyufunc/cache_usecases.py -> build/lib.linux-x86_64-3.7/numba/tests/npyufunc +copying numba/tests/npyufunc/test_caching.py -> build/lib.linux-x86_64-3.7/numba/tests/npyufunc +copying numba/tests/npyufunc/test_dufunc.py -> build/lib.linux-x86_64-3.7/numba/tests/npyufunc +copying numba/tests/npyufunc/test_errors.py -> build/lib.linux-x86_64-3.7/numba/tests/npyufunc +copying numba/tests/npyufunc/test_gufunc.py -> 
build/lib.linux-x86_64-3.7/numba/tests/npyufunc +copying numba/tests/npyufunc/test_parallel_env_variable.py -> build/lib.linux-x86_64-3.7/numba/tests/npyufunc +copying numba/tests/npyufunc/test_parallel_low_work.py -> build/lib.linux-x86_64-3.7/numba/tests/npyufunc +copying numba/tests/npyufunc/test_parallel_ufunc_issues.py -> build/lib.linux-x86_64-3.7/numba/tests/npyufunc +copying numba/tests/npyufunc/test_ufunc.py -> build/lib.linux-x86_64-3.7/numba/tests/npyufunc +copying numba/tests/npyufunc/test_ufuncbuilding.py -> build/lib.linux-x86_64-3.7/numba/tests/npyufunc +copying numba/tests/npyufunc/test_vectorize_decor.py -> build/lib.linux-x86_64-3.7/numba/tests/npyufunc +creating build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase +copying numba/tests/pycc_distutils_usecase/__init__.py -> build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase +copying numba/tests/pycc_distutils_usecase/setup_distutils.py -> build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase +copying numba/tests/pycc_distutils_usecase/setup_distutils_nested.py -> build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase +copying numba/tests/pycc_distutils_usecase/setup_setuptools.py -> build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase +copying numba/tests/pycc_distutils_usecase/setup_setuptools_nested.py -> build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase +copying numba/tests/pycc_distutils_usecase/source_module.py -> build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase +creating build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase/nested +copying numba/tests/pycc_distutils_usecase/nested/__init__.py -> build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase/nested +copying numba/tests/pycc_distutils_usecase/nested/source_module.py -> build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase/nested +copying numba/_dynfunc.c -> build/lib.linux-x86_64-3.7/numba +copying numba/_dynfuncmod.c -> 
build/lib.linux-x86_64-3.7/numba +copying numba/_hashtable.c -> build/lib.linux-x86_64-3.7/numba +copying numba/_helperlib.c -> build/lib.linux-x86_64-3.7/numba +copying numba/_helpermod.c -> build/lib.linux-x86_64-3.7/numba +copying numba/_lapack.c -> build/lib.linux-x86_64-3.7/numba +copying numba/_npymath_exports.c -> build/lib.linux-x86_64-3.7/numba +copying numba/_random.c -> build/lib.linux-x86_64-3.7/numba +copying numba/_typeof.c -> build/lib.linux-x86_64-3.7/numba +copying numba/mviewbuf.c -> build/lib.linux-x86_64-3.7/numba +copying numba/_arraystruct.h -> build/lib.linux-x86_64-3.7/numba +copying numba/_devicearray.h -> build/lib.linux-x86_64-3.7/numba +copying numba/_hashtable.h -> build/lib.linux-x86_64-3.7/numba +copying numba/_numba_common.h -> build/lib.linux-x86_64-3.7/numba +copying numba/_pymodule.h -> build/lib.linux-x86_64-3.7/numba +copying numba/_typeof.h -> build/lib.linux-x86_64-3.7/numba +copying numba/_unicodetype_db.h -> build/lib.linux-x86_64-3.7/numba +copying numba/capsulethunk.h -> build/lib.linux-x86_64-3.7/numba +copying numba/mathnames.h -> build/lib.linux-x86_64-3.7/numba +copying numba/cext/dictobject.c -> build/lib.linux-x86_64-3.7/numba/cext +copying numba/cext/listobject.c -> build/lib.linux-x86_64-3.7/numba/cext +copying numba/cext/utils.c -> build/lib.linux-x86_64-3.7/numba/cext +copying numba/cext/cext.h -> build/lib.linux-x86_64-3.7/numba/cext +copying numba/cext/dictobject.h -> build/lib.linux-x86_64-3.7/numba/cext +copying numba/cext/listobject.h -> build/lib.linux-x86_64-3.7/numba/cext +copying numba/misc/cmdlang.gdb -> build/lib.linux-x86_64-3.7/numba/misc +copying numba/pycc/modulemixin.c -> build/lib.linux-x86_64-3.7/numba/pycc +copying numba/typed/py.typed -> build/lib.linux-x86_64-3.7/numba/typed +copying numba/core/annotations/template.html -> build/lib.linux-x86_64-3.7/numba/core/annotations +copying numba/core/runtime/_nrt_python.c -> build/lib.linux-x86_64-3.7/numba/core/runtime +copying 
numba/core/runtime/_nrt_pythonmod.c -> build/lib.linux-x86_64-3.7/numba/core/runtime +copying numba/core/runtime/nrt.c -> build/lib.linux-x86_64-3.7/numba/core/runtime +copying numba/core/runtime/nrt.h -> build/lib.linux-x86_64-3.7/numba/core/runtime +copying numba/core/runtime/nrt_external.h -> build/lib.linux-x86_64-3.7/numba/core/runtime +copying numba/cuda/tests/cudadrv/data/jitlink.ptx -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/data +copying numba/cuda/tests/cudadrv/data/cuda_include.cu -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/data +copying numba/cuda/tests/cudadrv/data/error.cu -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/data +copying numba/cuda/tests/cudadrv/data/jitlink.cu -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/data +copying numba/cuda/tests/cudadrv/data/warn.cu -> build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/data +copying numba/cuda/tests/doc_examples/ffi/functions.cu -> build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/ffi +running build_ext +building 'numba._dynfunc' extension +Warning: Can't read registry to find the necessary compiler setting +Make sure that Python modules winreg, win32api or win32con are installed. 
+C compiler: gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -fPIC + +creating build/temp.linux-x86_64-3.7 +creating build/temp.linux-x86_64-3.7/numba +compile options: '-I/usr/local/include/python3.7m -c' +gcc: numba/_dynfuncmod.c +gcc -pthread -shared build/temp.linux-x86_64-3.7/numba/_dynfuncmod.o -o build/lib.linux-x86_64-3.7/numba/_dynfunc.cpython-37m-x86_64-linux-gnu.so +building 'numba._dispatcher' extension +C compiler: gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -fPIC + +creating build/temp.linux-x86_64-3.7/numba/core +creating build/temp.linux-x86_64-3.7/numba/core/typeconv +compile options: '-I/usr/local/lib/python3.7/site-packages/numpy/core/include -I/usr/local/include/python3.7m -c' +gcc: numba/_dispatcher.cpp +gcc: numba/_typeof.c +gcc: numba/_hashtable.c +gcc: numba/core/typeconv/typeconv.cpp +g++ -pthread -shared build/temp.linux-x86_64-3.7/numba/_dispatcher.o build/temp.linux-x86_64-3.7/numba/_typeof.o build/temp.linux-x86_64-3.7/numba/_hashtable.o build/temp.linux-x86_64-3.7/numba/core/typeconv/typeconv.o -L/usr/local/lib/python3.7/site-packages/numpy/core/lib -lnpymath -lm -o build/lib.linux-x86_64-3.7/numba/_dispatcher.cpython-37m-x86_64-linux-gnu.so +building 'numba._helperlib' extension +C compiler: gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -fPIC + +creating build/temp.linux-x86_64-3.7/numba/cext +compile options: '-I/usr/local/lib/python3.7/site-packages/numpy/core/include -I/usr/local/include/python3.7m -c' +gcc: numba/_helpermod.c +gcc: numba/cext/utils.c +gcc: numba/cext/listobject.c +gcc: numba/cext/dictobject.c +gcc -pthread -shared build/temp.linux-x86_64-3.7/numba/_helpermod.o build/temp.linux-x86_64-3.7/numba/cext/utils.o build/temp.linux-x86_64-3.7/numba/cext/dictobject.o build/temp.linux-x86_64-3.7/numba/cext/listobject.o -L/usr/local/lib/python3.7/site-packages/numpy/core/lib -lnpymath -lm -o 
build/lib.linux-x86_64-3.7/numba/_helperlib.cpython-37m-x86_64-linux-gnu.so +building 'numba.core.typeconv._typeconv' extension +C compiler: gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -fPIC + +compile options: '-I/usr/local/include/python3.7m -c' +gcc: numba/core/typeconv/typeconv.cpp +gcc: numba/core/typeconv/_typeconv.cpp +g++ -pthread -shared build/temp.linux-x86_64-3.7/numba/core/typeconv/typeconv.o build/temp.linux-x86_64-3.7/numba/core/typeconv/_typeconv.o -o build/lib.linux-x86_64-3.7/numba/core/typeconv/_typeconv.cpython-37m-x86_64-linux-gnu.so +building 'numba.np.ufunc._internal' extension +C compiler: gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -fPIC + +creating build/temp.linux-x86_64-3.7/numba/np +creating build/temp.linux-x86_64-3.7/numba/np/ufunc +compile options: '-I/usr/local/lib/python3.7/site-packages/numpy/core/include -I/usr/local/include/python3.7m -c' +gcc: numba/np/ufunc/_internal.c +gcc -pthread -shared build/temp.linux-x86_64-3.7/numba/np/ufunc/_internal.o -L/usr/local/lib/python3.7/site-packages/numpy/core/lib -lnpymath -lm -o build/lib.linux-x86_64-3.7/numba/np/ufunc/_internal.cpython-37m-x86_64-linux-gnu.so +building 'numba.np.ufunc._num_threads' extension +C compiler: gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -fPIC + +compile options: '-I/usr/local/include/python3.7m -c' +gcc: numba/np/ufunc/_num_threads.c +gcc -pthread -shared build/temp.linux-x86_64-3.7/numba/np/ufunc/_num_threads.o -o build/lib.linux-x86_64-3.7/numba/np/ufunc/_num_threads.cpython-37m-x86_64-linux-gnu.so +building 'numba.mviewbuf' extension +C compiler: gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -fPIC + +compile options: '-I/usr/local/include/python3.7m -c' +gcc: numba/mviewbuf.c +gcc -pthread -shared build/temp.linux-x86_64-3.7/numba/mviewbuf.o -o build/lib.linux-x86_64-3.7/numba/mviewbuf.cpython-37m-x86_64-linux-gnu.so +building 
'numba.core.runtime._nrt_python' extension +C compiler: gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -fPIC + +creating build/temp.linux-x86_64-3.7/numba/core/runtime +compile options: '-I/usr/local/lib/python3.7/site-packages/numpy/core/include -I/usr/local/include/python3.7m -c' +gcc: numba/core/runtime/_nrt_pythonmod.c +gcc: numba/core/runtime/nrt.c +gcc -pthread -shared build/temp.linux-x86_64-3.7/numba/core/runtime/_nrt_pythonmod.o build/temp.linux-x86_64-3.7/numba/core/runtime/nrt.o -L/usr/local/lib/python3.7/site-packages/numpy/core/lib -lnpymath -lm -o build/lib.linux-x86_64-3.7/numba/core/runtime/_nrt_python.cpython-37m-x86_64-linux-gnu.so +building 'numba.experimental.jitclass._box' extension +C compiler: gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -fPIC + +creating build/temp.linux-x86_64-3.7/numba/experimental +creating build/temp.linux-x86_64-3.7/numba/experimental/jitclass +compile options: '-I/usr/local/include/python3.7m -c' +gcc: numba/experimental/jitclass/_box.c +gcc -pthread -shared build/temp.linux-x86_64-3.7/numba/experimental/jitclass/_box.o -o build/lib.linux-x86_64-3.7/numba/experimental/jitclass/_box.cpython-37m-x86_64-linux-gnu.so +building 'numba.cuda.cudadrv._extras' extension +C compiler: gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -fPIC + +creating build/temp.linux-x86_64-3.7/numba/cuda +creating build/temp.linux-x86_64-3.7/numba/cuda/cudadrv +compile options: '-Inumba -I/usr/local/include/python3.7m -c' +gcc: numba/cuda/cudadrv/_extras.c +gcc -pthread -shared build/temp.linux-x86_64-3.7/numba/cuda/cudadrv/_extras.o -o build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/_extras.cpython-37m-x86_64-linux-gnu.so +building 'numba._devicearray' extension +C compiler: gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -fPIC + +compile options: '-Inumba -I/usr/local/include/python3.7m -c' +extra options: '-std=c++11' +gcc: 
numba/_devicearray.cpp +g++ -pthread -shared build/temp.linux-x86_64-3.7/numba/_devicearray.o -o build/lib.linux-x86_64-3.7/numba/_devicearray.cpython-37m-x86_64-linux-gnu.so +building 'numba.np.ufunc.omppool' extension +C compiler: gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -fPIC + +compile options: '-I/usr/local/include/python3.7m -c' +extra options: '-fopenmp -std=c++11' +gcc: numba/np/ufunc/omppool.cpp +gcc: numba/np/ufunc/gufunc_scheduler.cpp +g++ -pthread -shared build/temp.linux-x86_64-3.7/numba/np/ufunc/omppool.o build/temp.linux-x86_64-3.7/numba/np/ufunc/gufunc_scheduler.o -o build/lib.linux-x86_64-3.7/numba/np/ufunc/omppool.cpython-37m-x86_64-linux-gnu.so -fopenmp +building 'numba.np.ufunc.workqueue' extension +C compiler: gcc -pthread -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -fPIC + +compile options: '-I/usr/local/include/python3.7m -c' +gcc: numba/np/ufunc/gufunc_scheduler.cpp +gcc: numba/np/ufunc/workqueue.c +g++ -pthread -shared build/temp.linux-x86_64-3.7/numba/np/ufunc/workqueue.o build/temp.linux-x86_64-3.7/numba/np/ufunc/gufunc_scheduler.o -o build/lib.linux-x86_64-3.7/numba/np/ufunc/workqueue.cpython-37m-x86_64-linux-gnu.so +running build_scripts +creating build/scripts-3.7 +copying and adjusting numba/pycc/pycc -> build/scripts-3.7 +copying and adjusting bin/numba -> build/scripts-3.7 +changing mode of build/scripts-3.7/pycc from 644 to 755 +changing mode of build/scripts-3.7/numba from 644 to 755 +UPDATING build/lib.linux-x86_64-3.7/numba/_version.py +installing to build/bdist.linux-x86_64/wheel +running install +running install_lib +creating build/bdist.linux-x86_64 +creating build/bdist.linux-x86_64/wheel +creating build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/__init__.py -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/__main__.py -> build/bdist.linux-x86_64/wheel/numba +copying 
build/lib.linux-x86_64-3.7/numba/extending.py -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/runtests.py -> build/bdist.linux-x86_64/wheel/numba +creating build/bdist.linux-x86_64/wheel/numba/cext +copying build/lib.linux-x86_64-3.7/numba/cext/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cext +copying build/lib.linux-x86_64-3.7/numba/cext/dictobject.c -> build/bdist.linux-x86_64/wheel/numba/cext +copying build/lib.linux-x86_64-3.7/numba/cext/listobject.c -> build/bdist.linux-x86_64/wheel/numba/cext +copying build/lib.linux-x86_64-3.7/numba/cext/utils.c -> build/bdist.linux-x86_64/wheel/numba/cext +copying build/lib.linux-x86_64-3.7/numba/cext/cext.h -> build/bdist.linux-x86_64/wheel/numba/cext +copying build/lib.linux-x86_64-3.7/numba/cext/dictobject.h -> build/bdist.linux-x86_64/wheel/numba/cext +copying build/lib.linux-x86_64-3.7/numba/cext/listobject.h -> build/bdist.linux-x86_64/wheel/numba/cext +creating build/bdist.linux-x86_64/wheel/numba/cloudpickle +copying build/lib.linux-x86_64-3.7/numba/cloudpickle/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cloudpickle +copying build/lib.linux-x86_64-3.7/numba/cloudpickle/cloudpickle.py -> build/bdist.linux-x86_64/wheel/numba/cloudpickle +copying build/lib.linux-x86_64-3.7/numba/cloudpickle/cloudpickle_fast.py -> build/bdist.linux-x86_64/wheel/numba/cloudpickle +copying build/lib.linux-x86_64-3.7/numba/cloudpickle/compat.py -> build/bdist.linux-x86_64/wheel/numba/cloudpickle +creating build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/__init__.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/analysis.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/base.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/boxing.py -> build/bdist.linux-x86_64/wheel/numba/core +copying 
build/lib.linux-x86_64-3.7/numba/core/bytecode.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/byteflow.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/caching.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/callconv.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/callwrapper.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/ccallback.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/cgutils.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/codegen.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/compiler.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/compiler_lock.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/compiler_machinery.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/config.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/consts.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/controlflow.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/cpu.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/cpu_options.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/dataflow.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/debuginfo.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/decorators.py -> build/bdist.linux-x86_64/wheel/numba/core +copying 
build/lib.linux-x86_64-3.7/numba/core/descriptors.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/dispatcher.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/entrypoints.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/environment.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/errors.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/event.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/extending.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/externals.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/fastmathpass.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/funcdesc.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/generators.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/imputils.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/inline_closurecall.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/interpreter.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/intrinsics.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/ir.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/ir_utils.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/itanium_mangler.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/llvm_bindings.py -> 
build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/lowering.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/object_mode_passes.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/optional.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/options.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/overload_glue.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/postproc.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/pylowering.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/pythonapi.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/registry.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/removerefctpass.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/retarget.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/serialize.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/sigutils.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/ssa.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/target_extension.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/targetconfig.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/tracing.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/transforms.py -> build/bdist.linux-x86_64/wheel/numba/core +copying 
build/lib.linux-x86_64-3.7/numba/core/typed_passes.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/typeinfer.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/untyped_passes.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/utils.py -> build/bdist.linux-x86_64/wheel/numba/core +copying build/lib.linux-x86_64-3.7/numba/core/withcontexts.py -> build/bdist.linux-x86_64/wheel/numba/core +creating build/bdist.linux-x86_64/wheel/numba/core/annotations +copying build/lib.linux-x86_64-3.7/numba/core/annotations/__init__.py -> build/bdist.linux-x86_64/wheel/numba/core/annotations +copying build/lib.linux-x86_64-3.7/numba/core/annotations/pretty_annotate.py -> build/bdist.linux-x86_64/wheel/numba/core/annotations +copying build/lib.linux-x86_64-3.7/numba/core/annotations/type_annotations.py -> build/bdist.linux-x86_64/wheel/numba/core/annotations +copying build/lib.linux-x86_64-3.7/numba/core/annotations/template.html -> build/bdist.linux-x86_64/wheel/numba/core/annotations +creating build/bdist.linux-x86_64/wheel/numba/core/datamodel +copying build/lib.linux-x86_64-3.7/numba/core/datamodel/__init__.py -> build/bdist.linux-x86_64/wheel/numba/core/datamodel +copying build/lib.linux-x86_64-3.7/numba/core/datamodel/manager.py -> build/bdist.linux-x86_64/wheel/numba/core/datamodel +copying build/lib.linux-x86_64-3.7/numba/core/datamodel/models.py -> build/bdist.linux-x86_64/wheel/numba/core/datamodel +copying build/lib.linux-x86_64-3.7/numba/core/datamodel/packer.py -> build/bdist.linux-x86_64/wheel/numba/core/datamodel +copying build/lib.linux-x86_64-3.7/numba/core/datamodel/registry.py -> build/bdist.linux-x86_64/wheel/numba/core/datamodel +copying build/lib.linux-x86_64-3.7/numba/core/datamodel/testing.py -> build/bdist.linux-x86_64/wheel/numba/core/datamodel +creating build/bdist.linux-x86_64/wheel/numba/core/rewrites +copying 
build/lib.linux-x86_64-3.7/numba/core/rewrites/__init__.py -> build/bdist.linux-x86_64/wheel/numba/core/rewrites +copying build/lib.linux-x86_64-3.7/numba/core/rewrites/ir_print.py -> build/bdist.linux-x86_64/wheel/numba/core/rewrites +copying build/lib.linux-x86_64-3.7/numba/core/rewrites/registry.py -> build/bdist.linux-x86_64/wheel/numba/core/rewrites +copying build/lib.linux-x86_64-3.7/numba/core/rewrites/static_binop.py -> build/bdist.linux-x86_64/wheel/numba/core/rewrites +copying build/lib.linux-x86_64-3.7/numba/core/rewrites/static_getitem.py -> build/bdist.linux-x86_64/wheel/numba/core/rewrites +copying build/lib.linux-x86_64-3.7/numba/core/rewrites/static_raise.py -> build/bdist.linux-x86_64/wheel/numba/core/rewrites +creating build/bdist.linux-x86_64/wheel/numba/core/runtime +copying build/lib.linux-x86_64-3.7/numba/core/runtime/__init__.py -> build/bdist.linux-x86_64/wheel/numba/core/runtime +copying build/lib.linux-x86_64-3.7/numba/core/runtime/context.py -> build/bdist.linux-x86_64/wheel/numba/core/runtime +copying build/lib.linux-x86_64-3.7/numba/core/runtime/nrt.py -> build/bdist.linux-x86_64/wheel/numba/core/runtime +copying build/lib.linux-x86_64-3.7/numba/core/runtime/nrtdynmod.py -> build/bdist.linux-x86_64/wheel/numba/core/runtime +copying build/lib.linux-x86_64-3.7/numba/core/runtime/nrtopt.py -> build/bdist.linux-x86_64/wheel/numba/core/runtime +copying build/lib.linux-x86_64-3.7/numba/core/runtime/_nrt_python.c -> build/bdist.linux-x86_64/wheel/numba/core/runtime +copying build/lib.linux-x86_64-3.7/numba/core/runtime/_nrt_pythonmod.c -> build/bdist.linux-x86_64/wheel/numba/core/runtime +copying build/lib.linux-x86_64-3.7/numba/core/runtime/nrt.c -> build/bdist.linux-x86_64/wheel/numba/core/runtime +copying build/lib.linux-x86_64-3.7/numba/core/runtime/nrt.h -> build/bdist.linux-x86_64/wheel/numba/core/runtime +copying build/lib.linux-x86_64-3.7/numba/core/runtime/nrt_external.h -> build/bdist.linux-x86_64/wheel/numba/core/runtime +copying 
build/lib.linux-x86_64-3.7/numba/core/runtime/_nrt_python.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/wheel/numba/core/runtime +creating build/bdist.linux-x86_64/wheel/numba/core/typeconv +copying build/lib.linux-x86_64-3.7/numba/core/typeconv/__init__.py -> build/bdist.linux-x86_64/wheel/numba/core/typeconv +copying build/lib.linux-x86_64-3.7/numba/core/typeconv/castgraph.py -> build/bdist.linux-x86_64/wheel/numba/core/typeconv +copying build/lib.linux-x86_64-3.7/numba/core/typeconv/rules.py -> build/bdist.linux-x86_64/wheel/numba/core/typeconv +copying build/lib.linux-x86_64-3.7/numba/core/typeconv/typeconv.py -> build/bdist.linux-x86_64/wheel/numba/core/typeconv +copying build/lib.linux-x86_64-3.7/numba/core/typeconv/_typeconv.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/wheel/numba/core/typeconv +creating build/bdist.linux-x86_64/wheel/numba/core/types +copying build/lib.linux-x86_64-3.7/numba/core/types/__init__.py -> build/bdist.linux-x86_64/wheel/numba/core/types +copying build/lib.linux-x86_64-3.7/numba/core/types/abstract.py -> build/bdist.linux-x86_64/wheel/numba/core/types +copying build/lib.linux-x86_64-3.7/numba/core/types/common.py -> build/bdist.linux-x86_64/wheel/numba/core/types +copying build/lib.linux-x86_64-3.7/numba/core/types/containers.py -> build/bdist.linux-x86_64/wheel/numba/core/types +copying build/lib.linux-x86_64-3.7/numba/core/types/function_type.py -> build/bdist.linux-x86_64/wheel/numba/core/types +copying build/lib.linux-x86_64-3.7/numba/core/types/functions.py -> build/bdist.linux-x86_64/wheel/numba/core/types +copying build/lib.linux-x86_64-3.7/numba/core/types/iterators.py -> build/bdist.linux-x86_64/wheel/numba/core/types +copying build/lib.linux-x86_64-3.7/numba/core/types/misc.py -> build/bdist.linux-x86_64/wheel/numba/core/types +copying build/lib.linux-x86_64-3.7/numba/core/types/npytypes.py -> build/bdist.linux-x86_64/wheel/numba/core/types +copying 
build/lib.linux-x86_64-3.7/numba/core/types/scalars.py -> build/bdist.linux-x86_64/wheel/numba/core/types +creating build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/__init__.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/arraydecl.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/asnumbatype.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/bufproto.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/builtins.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/cffi_utils.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/cmathdecl.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/collections.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/context.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/ctypes_utils.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/dictdecl.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/enumdecl.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/listdecl.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/mathdecl.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/npdatetime.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying 
build/lib.linux-x86_64-3.7/numba/core/typing/npydecl.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/randomdecl.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/setdecl.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/templates.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +copying build/lib.linux-x86_64-3.7/numba/core/typing/typeof.py -> build/bdist.linux-x86_64/wheel/numba/core/typing +creating build/bdist.linux-x86_64/wheel/numba/core/unsafe +copying build/lib.linux-x86_64-3.7/numba/core/unsafe/__init__.py -> build/bdist.linux-x86_64/wheel/numba/core/unsafe +copying build/lib.linux-x86_64-3.7/numba/core/unsafe/bytes.py -> build/bdist.linux-x86_64/wheel/numba/core/unsafe +copying build/lib.linux-x86_64-3.7/numba/core/unsafe/eh.py -> build/bdist.linux-x86_64/wheel/numba/core/unsafe +copying build/lib.linux-x86_64-3.7/numba/core/unsafe/nrt.py -> build/bdist.linux-x86_64/wheel/numba/core/unsafe +copying build/lib.linux-x86_64-3.7/numba/core/unsafe/refcount.py -> build/bdist.linux-x86_64/wheel/numba/core/unsafe +creating build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/builtins.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/charseq.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/cmathimpl.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/enumimpl.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/hashing.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/heapq.py -> 
build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/iterators.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/listobj.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/mathimpl.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/numbers.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/printimpl.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/randomimpl.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/rangeobj.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/setobj.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/slicing.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/tupleobj.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/unicode.py -> build/bdist.linux-x86_64/wheel/numba/cpython +copying build/lib.linux-x86_64-3.7/numba/cpython/unicode_support.py -> build/bdist.linux-x86_64/wheel/numba/cpython +creating build/bdist.linux-x86_64/wheel/numba/cpython/unsafe +copying build/lib.linux-x86_64-3.7/numba/cpython/unsafe/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cpython/unsafe +copying build/lib.linux-x86_64-3.7/numba/cpython/unsafe/numbers.py -> build/bdist.linux-x86_64/wheel/numba/cpython/unsafe +copying build/lib.linux-x86_64-3.7/numba/cpython/unsafe/tuple.py -> build/bdist.linux-x86_64/wheel/numba/cpython/unsafe +creating build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying 
build/lib.linux-x86_64-3.7/numba/cuda/api.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/api_util.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/args.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/codegen.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/compiler.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/cuda_paths.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadecl.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/cudaimpl.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/cudamath.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/decorators.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/descriptor.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/device_init.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/dispatcher.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/errors.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/initialize.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/intrinsic_wrapper.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/libdevice.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/libdevicedecl.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/libdevicefuncs.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying 
build/lib.linux-x86_64-3.7/numba/cuda/libdeviceimpl.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/mathimpl.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/models.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/nvvmutils.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/printimpl.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/random.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator_init.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/stubs.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/target.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/testing.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/types.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/vector_types.py -> build/bdist.linux-x86_64/wheel/numba/cuda +copying build/lib.linux-x86_64-3.7/numba/cuda/vectorizers.py -> build/bdist.linux-x86_64/wheel/numba/cuda +creating build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/devicearray.py -> build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/devices.py -> build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/driver.py -> build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/drvapi.py -> 
build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/enums.py -> build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/error.py -> build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/libs.py -> build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/ndarray.py -> build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/nvvm.py -> build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/rtapi.py -> build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/runtime.py -> build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/cudadrv/_extras.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/wheel/numba/cuda/cudadrv +creating build/bdist.linux-x86_64/wheel/numba/cuda/kernels +copying build/lib.linux-x86_64-3.7/numba/cuda/kernels/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cuda/kernels +copying build/lib.linux-x86_64-3.7/numba/cuda/kernels/reduction.py -> build/bdist.linux-x86_64/wheel/numba/cuda/kernels +copying build/lib.linux-x86_64-3.7/numba/cuda/kernels/transpose.py -> build/bdist.linux-x86_64/wheel/numba/cuda/kernels +creating build/bdist.linux-x86_64/wheel/numba/cuda/simulator +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/api.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/compiler.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/kernel.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator 
+copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/kernelapi.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/reduction.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/vector_types.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator +creating build/bdist.linux-x86_64/wheel/numba/cuda/simulator/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv/devicearray.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv/devices.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv/driver.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv/drvapi.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv/error.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv/libs.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv/nvvm.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/simulator/cudadrv/runtime.py -> build/bdist.linux-x86_64/wheel/numba/cuda/simulator/cudadrv +creating build/bdist.linux-x86_64/wheel/numba/cuda/tests +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests +creating build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying 
build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_array_attr.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_context_stack.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_cuda_auto_context.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_cuda_driver.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_cuda_libraries.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_cuda_memory.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_cuda_ndarray.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_deallocations.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_detect.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_emm_plugins.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_events.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying 
build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_host_alloc.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_init.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_inline_ptx.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_ir_patch.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_linker.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_managed_alloc.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_nvvm_driver.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_pinned.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_profiler.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_ptds.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_reset_device.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_runtime.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_select_device.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/test_streams.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv +creating build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv/data +copying 
build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/data/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv/data +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/data/jitlink.ptx -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv/data +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/data/cuda_include.cu -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv/data +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/data/error.cu -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv/data +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/data/jitlink.cu -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv/data +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudadrv/data/warn.cu -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudadrv/data +creating build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/cache_usecases.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/extensions_usecases.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/recursion_usecases.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_alignment.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_array.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_array_args.py -> 
build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_array_methods.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_atomics.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_blackscholes.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_boolean.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_caching.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_casting.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_compiler.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_complex.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_complex_kernel.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_const_string.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_constmem.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_cooperative_groups.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_cuda_array_interface.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying 
build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_datetime.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_debug.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_debuginfo.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_device_func.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_dispatcher.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_enums.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_errors.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_exception.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_extending.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_fastmath.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_forall.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_freevar.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_frexp_ldexp.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_globals.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_gufunc.py -> 
build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_gufunc_scalar.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_gufunc_scheduling.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_idiv.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_inspect.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_intrinsics.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_ipc.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_iterators.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_lang.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_laplace.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_libdevice.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_lineinfo.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_localmem.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_mandel.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_math.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying 
build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_matmul.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_minmax.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_montecarlo.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_multigpu.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_multiprocessing.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_multithreads.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_nondet.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_operator.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_optimization.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_overload.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_powi.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_print.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_py2_div_issue.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_random.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_record_dtype.py -> 
build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_recursion.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_reduction.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_serialize.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_slicing.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_sm.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_sm_creation.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_sync.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_transpose.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_userexc.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_vector_type.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_vectorize.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_vectorize_complex.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_vectorize_decor.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying 
build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_vectorize_device.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_warning.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudapy/test_warp_ops.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudapy +creating build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudasim +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudasim/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudasim +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudasim/support.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudasim +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/cudasim/test_cudasim_issues.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/cudasim +creating build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/test_cg.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/test_ffi.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/test_laplace.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/test_matmul.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples +copying 
build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/test_montecarlo.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/test_random.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/test_reduction.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/test_sessionize.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/test_vecadd.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples +creating build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples/ffi +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/ffi/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples/ffi +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/doc_examples/ffi/functions.cu -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/doc_examples/ffi +creating build/bdist.linux-x86_64/wheel/numba/cuda/tests/nocuda +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/nocuda/__init__.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/nocuda +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/nocuda/test_import.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/nocuda +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/nocuda/test_library_lookup.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/nocuda +copying build/lib.linux-x86_64-3.7/numba/cuda/tests/nocuda/test_nvvm.py -> build/bdist.linux-x86_64/wheel/numba/cuda/tests/nocuda +creating build/bdist.linux-x86_64/wheel/numba/experimental +copying build/lib.linux-x86_64-3.7/numba/experimental/__init__.py -> build/bdist.linux-x86_64/wheel/numba/experimental +copying build/lib.linux-x86_64-3.7/numba/experimental/function_type.py -> 
build/bdist.linux-x86_64/wheel/numba/experimental +copying build/lib.linux-x86_64-3.7/numba/experimental/structref.py -> build/bdist.linux-x86_64/wheel/numba/experimental +creating build/bdist.linux-x86_64/wheel/numba/experimental/jitclass +copying build/lib.linux-x86_64-3.7/numba/experimental/jitclass/__init__.py -> build/bdist.linux-x86_64/wheel/numba/experimental/jitclass +copying build/lib.linux-x86_64-3.7/numba/experimental/jitclass/base.py -> build/bdist.linux-x86_64/wheel/numba/experimental/jitclass +copying build/lib.linux-x86_64-3.7/numba/experimental/jitclass/boxing.py -> build/bdist.linux-x86_64/wheel/numba/experimental/jitclass +copying build/lib.linux-x86_64-3.7/numba/experimental/jitclass/decorators.py -> build/bdist.linux-x86_64/wheel/numba/experimental/jitclass +copying build/lib.linux-x86_64-3.7/numba/experimental/jitclass/overloads.py -> build/bdist.linux-x86_64/wheel/numba/experimental/jitclass +copying build/lib.linux-x86_64-3.7/numba/experimental/jitclass/_box.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/wheel/numba/experimental/jitclass +creating build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/__init__.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/appdirs.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/cffiimpl.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/dummyarray.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/dump_style.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/findlib.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/firstlinefinder.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/gdb_hook.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying 
build/lib.linux-x86_64-3.7/numba/misc/gdb_print_extension.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/init_utils.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/inspection.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/literal.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/llvm_pass_timings.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/mergesort.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/numba_entry.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/numba_gdbinfo.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/numba_sysinfo.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/quicksort.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/special.py -> build/bdist.linux-x86_64/wheel/numba/misc +copying build/lib.linux-x86_64-3.7/numba/misc/timsort.py -> build/bdist.linux-x86_64/wheel/numba/misc +creating build/bdist.linux-x86_64/wheel/numba/misc/help +copying build/lib.linux-x86_64-3.7/numba/misc/help/__init__.py -> build/bdist.linux-x86_64/wheel/numba/misc/help +copying build/lib.linux-x86_64-3.7/numba/misc/help/inspector.py -> build/bdist.linux-x86_64/wheel/numba/misc/help +copying build/lib.linux-x86_64-3.7/numba/misc/cmdlang.gdb -> build/bdist.linux-x86_64/wheel/numba/misc +creating build/bdist.linux-x86_64/wheel/numba/np +copying build/lib.linux-x86_64-3.7/numba/np/__init__.py -> build/bdist.linux-x86_64/wheel/numba/np +copying build/lib.linux-x86_64-3.7/numba/np/arraymath.py -> build/bdist.linux-x86_64/wheel/numba/np +copying build/lib.linux-x86_64-3.7/numba/np/arrayobj.py -> 
build/bdist.linux-x86_64/wheel/numba/np +copying build/lib.linux-x86_64-3.7/numba/np/extensions.py -> build/bdist.linux-x86_64/wheel/numba/np +copying build/lib.linux-x86_64-3.7/numba/np/linalg.py -> build/bdist.linux-x86_64/wheel/numba/np +copying build/lib.linux-x86_64-3.7/numba/np/npdatetime.py -> build/bdist.linux-x86_64/wheel/numba/np +copying build/lib.linux-x86_64-3.7/numba/np/npdatetime_helpers.py -> build/bdist.linux-x86_64/wheel/numba/np +copying build/lib.linux-x86_64-3.7/numba/np/npyfuncs.py -> build/bdist.linux-x86_64/wheel/numba/np +copying build/lib.linux-x86_64-3.7/numba/np/npyimpl.py -> build/bdist.linux-x86_64/wheel/numba/np +copying build/lib.linux-x86_64-3.7/numba/np/numpy_support.py -> build/bdist.linux-x86_64/wheel/numba/np +copying build/lib.linux-x86_64-3.7/numba/np/polynomial.py -> build/bdist.linux-x86_64/wheel/numba/np +copying build/lib.linux-x86_64-3.7/numba/np/ufunc_db.py -> build/bdist.linux-x86_64/wheel/numba/np +creating build/bdist.linux-x86_64/wheel/numba/np/random +copying build/lib.linux-x86_64-3.7/numba/np/random/__init__.py -> build/bdist.linux-x86_64/wheel/numba/np/random +copying build/lib.linux-x86_64-3.7/numba/np/random/generator_core.py -> build/bdist.linux-x86_64/wheel/numba/np/random +copying build/lib.linux-x86_64-3.7/numba/np/random/generator_methods.py -> build/bdist.linux-x86_64/wheel/numba/np/random +creating build/bdist.linux-x86_64/wheel/numba/np/ufunc +copying build/lib.linux-x86_64-3.7/numba/np/ufunc/__init__.py -> build/bdist.linux-x86_64/wheel/numba/np/ufunc +copying build/lib.linux-x86_64-3.7/numba/np/ufunc/array_exprs.py -> build/bdist.linux-x86_64/wheel/numba/np/ufunc +copying build/lib.linux-x86_64-3.7/numba/np/ufunc/decorators.py -> build/bdist.linux-x86_64/wheel/numba/np/ufunc +copying build/lib.linux-x86_64-3.7/numba/np/ufunc/deviceufunc.py -> build/bdist.linux-x86_64/wheel/numba/np/ufunc +copying build/lib.linux-x86_64-3.7/numba/np/ufunc/dufunc.py -> build/bdist.linux-x86_64/wheel/numba/np/ufunc 
+copying build/lib.linux-x86_64-3.7/numba/np/ufunc/gufunc.py -> build/bdist.linux-x86_64/wheel/numba/np/ufunc +copying build/lib.linux-x86_64-3.7/numba/np/ufunc/parallel.py -> build/bdist.linux-x86_64/wheel/numba/np/ufunc +copying build/lib.linux-x86_64-3.7/numba/np/ufunc/sigparse.py -> build/bdist.linux-x86_64/wheel/numba/np/ufunc +copying build/lib.linux-x86_64-3.7/numba/np/ufunc/ufuncbuilder.py -> build/bdist.linux-x86_64/wheel/numba/np/ufunc +copying build/lib.linux-x86_64-3.7/numba/np/ufunc/wrappers.py -> build/bdist.linux-x86_64/wheel/numba/np/ufunc +copying build/lib.linux-x86_64-3.7/numba/np/ufunc/_internal.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/wheel/numba/np/ufunc +copying build/lib.linux-x86_64-3.7/numba/np/ufunc/_num_threads.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/wheel/numba/np/ufunc +copying build/lib.linux-x86_64-3.7/numba/np/ufunc/omppool.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/wheel/numba/np/ufunc +copying build/lib.linux-x86_64-3.7/numba/np/ufunc/workqueue.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/wheel/numba/np/ufunc +creating build/bdist.linux-x86_64/wheel/numba/np/unsafe +copying build/lib.linux-x86_64-3.7/numba/np/unsafe/__init__.py -> build/bdist.linux-x86_64/wheel/numba/np/unsafe +copying build/lib.linux-x86_64-3.7/numba/np/unsafe/ndarray.py -> build/bdist.linux-x86_64/wheel/numba/np/unsafe +creating build/bdist.linux-x86_64/wheel/numba/parfors +copying build/lib.linux-x86_64-3.7/numba/parfors/__init__.py -> build/bdist.linux-x86_64/wheel/numba/parfors +copying build/lib.linux-x86_64-3.7/numba/parfors/array_analysis.py -> build/bdist.linux-x86_64/wheel/numba/parfors +copying build/lib.linux-x86_64-3.7/numba/parfors/parfor.py -> build/bdist.linux-x86_64/wheel/numba/parfors +copying build/lib.linux-x86_64-3.7/numba/parfors/parfor_lowering.py -> build/bdist.linux-x86_64/wheel/numba/parfors +copying build/lib.linux-x86_64-3.7/numba/parfors/parfor_lowering_utils.py -> 
build/bdist.linux-x86_64/wheel/numba/parfors +creating build/bdist.linux-x86_64/wheel/numba/pycc +copying build/lib.linux-x86_64-3.7/numba/pycc/__init__.py -> build/bdist.linux-x86_64/wheel/numba/pycc +copying build/lib.linux-x86_64-3.7/numba/pycc/cc.py -> build/bdist.linux-x86_64/wheel/numba/pycc +copying build/lib.linux-x86_64-3.7/numba/pycc/compiler.py -> build/bdist.linux-x86_64/wheel/numba/pycc +copying build/lib.linux-x86_64-3.7/numba/pycc/decorators.py -> build/bdist.linux-x86_64/wheel/numba/pycc +copying build/lib.linux-x86_64-3.7/numba/pycc/llvm_types.py -> build/bdist.linux-x86_64/wheel/numba/pycc +copying build/lib.linux-x86_64-3.7/numba/pycc/platform.py -> build/bdist.linux-x86_64/wheel/numba/pycc +copying build/lib.linux-x86_64-3.7/numba/pycc/modulemixin.c -> build/bdist.linux-x86_64/wheel/numba/pycc +creating build/bdist.linux-x86_64/wheel/numba/scripts +copying build/lib.linux-x86_64-3.7/numba/scripts/__init__.py -> build/bdist.linux-x86_64/wheel/numba/scripts +copying build/lib.linux-x86_64-3.7/numba/scripts/generate_lower_listing.py -> build/bdist.linux-x86_64/wheel/numba/scripts +creating build/bdist.linux-x86_64/wheel/numba/stencils +copying build/lib.linux-x86_64-3.7/numba/stencils/__init__.py -> build/bdist.linux-x86_64/wheel/numba/stencils +copying build/lib.linux-x86_64-3.7/numba/stencils/stencil.py -> build/bdist.linux-x86_64/wheel/numba/stencils +copying build/lib.linux-x86_64-3.7/numba/stencils/stencilparfor.py -> build/bdist.linux-x86_64/wheel/numba/stencils +creating build/bdist.linux-x86_64/wheel/numba/testing +copying build/lib.linux-x86_64-3.7/numba/testing/__init__.py -> build/bdist.linux-x86_64/wheel/numba/testing +copying build/lib.linux-x86_64-3.7/numba/testing/__main__.py -> build/bdist.linux-x86_64/wheel/numba/testing +copying build/lib.linux-x86_64-3.7/numba/testing/_runtests.py -> build/bdist.linux-x86_64/wheel/numba/testing +copying build/lib.linux-x86_64-3.7/numba/testing/loader.py -> 
build/bdist.linux-x86_64/wheel/numba/testing +copying build/lib.linux-x86_64-3.7/numba/testing/main.py -> build/bdist.linux-x86_64/wheel/numba/testing +copying build/lib.linux-x86_64-3.7/numba/testing/notebook.py -> build/bdist.linux-x86_64/wheel/numba/testing +creating build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/__init__.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/annotation_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/cache_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/cffi_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/cfunc_cache_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/cloudpickle_main_class.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/compile_with_pycc.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/complex_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/ctypes_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/dummy_module.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/enum_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/error_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/gdb_support.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/inlining_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/matmul_usecase.py -> 
build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/orphaned_semaphore_usecase.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/overload_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/parfors_cache_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/parfors_max_label_error.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/pdlike_usecase.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/recursion_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/serialize_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/support.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_alignment.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_analysis.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_annotations.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_api.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_array_analysis.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_array_attr.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_array_constants.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_array_exprs.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_array_iterators.py -> build/bdist.linux-x86_64/wheel/numba/tests 
+copying build/lib.linux-x86_64-3.7/numba/tests/test_array_manipulation.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_array_methods.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_array_reductions.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_array_return.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_asnumbatype.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_auto_constants.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_blackscholes.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_boundscheck.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_buffer_protocol.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_builtins.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_byteflow.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_caching.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_casting.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_cffi.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_cfunc.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_cgutils.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_chained_assign.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying 
build/lib.linux-x86_64-3.7/numba/tests/test_chrome_trace.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_cli.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_closure.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_codegen.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_compile_cache.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_compiler_flags.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_compiler_lock.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_complex.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_comprehension.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_conditions_as_predicates.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_config.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_conversion.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_copy_propagate.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_ctypes.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_dataflow.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_datamodel.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_debug.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_debuginfo.py -> 
build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_deprecations.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_dictimpl.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_dictobject.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_dicts.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_dispatcher.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_dummyarray.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_dyn_array.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_dyn_func.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_entrypoints.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_enums.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_errorhandling.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_errormodels.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_event.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_exceptions.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_extended_arg.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_extending.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_extending_types.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying 
build/lib.linux-x86_64-3.7/numba/tests/test_fancy_indexing.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_fastmath.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_firstlinefinder.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_flow_control.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_func_interface.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_func_lifetime.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_funcdesc.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_function_type.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_gdb_bindings.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_gdb_dwarf.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_generators.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_gil.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_globals.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_hashing.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_heapq.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_help.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_import.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_indexing.py -> 
build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_init_utils.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_inlining.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_interpreter.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_interproc.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_intwidth.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_ir.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_ir_inlining.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_ir_utils.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_itanium_mangler.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_iteration.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_jit_module.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_jitclasses.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_jitmethod.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_linalg.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_listimpl.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_listobject.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_nan.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying 
build/lib.linux-x86_64-3.7/numba/tests/test_lists.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_literal_dispatch.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_llvm_pass_timings.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_llvm_version_check.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_locals.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_looplifting.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_make_function_to_jit_function.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_mandelbrot.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_mangling.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_map_filter_reduce.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_mathlib.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_maxmin.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_mixed_tuple_unroller.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_moved_modules.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_multi3.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_ndarray_subclasses.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_nested_calls.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying 
build/lib.linux-x86_64-3.7/numba/tests/test_np_functions.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_np_randomgen.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_npdatetime.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_nrt.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_nrt_refct.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_num_threads.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_numberctor.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_numbers.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_numconv.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_numpy_support.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_numpyadapt.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_obj_lifetime.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_object_mode.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_objects.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_operators.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_optional.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_overlap.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_parallel_backend.py -> 
build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_parfors.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_parfors_caching.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_parfors_passes.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_pipeline.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_polynomial.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_practical_lowering_issues.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_print.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_profiler.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_pycc.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_python_int.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_random.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_range.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_recarray_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_record_dtype.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_recursion.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_refop_pruning.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_remove_dead.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying 
build/lib.linux-x86_64-3.7/numba/tests/test_retargeting.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_return_values.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_runtests.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_serialize.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_sets.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_slices.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_sort.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_ssa.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_stencils.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_storeslice.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_struct_ref.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_support.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_svml.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_sys_stdin_assignment.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_sysinfo.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_target_extension.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_target_overloadselector.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_threadsafety.py -> 
build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_tracing.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_try_except.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_tuples.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_typeconv.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_typedlist.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_typedobjectutils.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_typeguard.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_typeinfer.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_typenames.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_typeof.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_types.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_typingerror.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_ufuncs.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_unicode.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_unicode_array.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_unicode_names.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_unpack_sequence.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying 
build/lib.linux-x86_64-3.7/numba/tests/test_unpickle_without_module.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_unsafe_intrinsics.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_vectorization.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_vectorization_type_inference.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_warnings.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_withlifting.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/test_wrapper.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/threading_backend_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +copying build/lib.linux-x86_64-3.7/numba/tests/usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests +creating build/bdist.linux-x86_64/wheel/numba/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/tests/doc_examples/__init__.py -> build/bdist.linux-x86_64/wheel/numba/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/tests/doc_examples/test_examples.py -> build/bdist.linux-x86_64/wheel/numba/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/tests/doc_examples/test_jitclass.py -> build/bdist.linux-x86_64/wheel/numba/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/tests/doc_examples/test_literal_container_usage.py -> build/bdist.linux-x86_64/wheel/numba/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/tests/doc_examples/test_literally_usage.py -> build/bdist.linux-x86_64/wheel/numba/tests/doc_examples +copying 
build/lib.linux-x86_64-3.7/numba/tests/doc_examples/test_llvm_pass_timings.py -> build/bdist.linux-x86_64/wheel/numba/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/tests/doc_examples/test_numpy_generators.py -> build/bdist.linux-x86_64/wheel/numba/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/tests/doc_examples/test_parallel_chunksize.py -> build/bdist.linux-x86_64/wheel/numba/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/tests/doc_examples/test_rec_array.py -> build/bdist.linux-x86_64/wheel/numba/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/tests/doc_examples/test_structref_usage.py -> build/bdist.linux-x86_64/wheel/numba/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/tests/doc_examples/test_typed_dict_usage.py -> build/bdist.linux-x86_64/wheel/numba/tests/doc_examples +copying build/lib.linux-x86_64-3.7/numba/tests/doc_examples/test_typed_list_usage.py -> build/bdist.linux-x86_64/wheel/numba/tests/doc_examples +creating build/bdist.linux-x86_64/wheel/numba/tests/gdb +copying build/lib.linux-x86_64-3.7/numba/tests/gdb/__init__.py -> build/bdist.linux-x86_64/wheel/numba/tests/gdb +copying build/lib.linux-x86_64-3.7/numba/tests/gdb/test_array_arg.py -> build/bdist.linux-x86_64/wheel/numba/tests/gdb +copying build/lib.linux-x86_64-3.7/numba/tests/gdb/test_basic.py -> build/bdist.linux-x86_64/wheel/numba/tests/gdb +copying build/lib.linux-x86_64-3.7/numba/tests/gdb/test_break_on_symbol.py -> build/bdist.linux-x86_64/wheel/numba/tests/gdb +copying build/lib.linux-x86_64-3.7/numba/tests/gdb/test_break_on_symbol_version.py -> build/bdist.linux-x86_64/wheel/numba/tests/gdb +copying build/lib.linux-x86_64-3.7/numba/tests/gdb/test_conditional_breakpoint.py -> build/bdist.linux-x86_64/wheel/numba/tests/gdb +copying build/lib.linux-x86_64-3.7/numba/tests/gdb/test_pretty_print.py -> build/bdist.linux-x86_64/wheel/numba/tests/gdb +creating build/bdist.linux-x86_64/wheel/numba/tests/npyufunc +copying 
build/lib.linux-x86_64-3.7/numba/tests/npyufunc/__init__.py -> build/bdist.linux-x86_64/wheel/numba/tests/npyufunc +copying build/lib.linux-x86_64-3.7/numba/tests/npyufunc/cache_usecases.py -> build/bdist.linux-x86_64/wheel/numba/tests/npyufunc +copying build/lib.linux-x86_64-3.7/numba/tests/npyufunc/test_caching.py -> build/bdist.linux-x86_64/wheel/numba/tests/npyufunc +copying build/lib.linux-x86_64-3.7/numba/tests/npyufunc/test_dufunc.py -> build/bdist.linux-x86_64/wheel/numba/tests/npyufunc +copying build/lib.linux-x86_64-3.7/numba/tests/npyufunc/test_errors.py -> build/bdist.linux-x86_64/wheel/numba/tests/npyufunc +copying build/lib.linux-x86_64-3.7/numba/tests/npyufunc/test_gufunc.py -> build/bdist.linux-x86_64/wheel/numba/tests/npyufunc +copying build/lib.linux-x86_64-3.7/numba/tests/npyufunc/test_parallel_env_variable.py -> build/bdist.linux-x86_64/wheel/numba/tests/npyufunc +copying build/lib.linux-x86_64-3.7/numba/tests/npyufunc/test_parallel_low_work.py -> build/bdist.linux-x86_64/wheel/numba/tests/npyufunc +copying build/lib.linux-x86_64-3.7/numba/tests/npyufunc/test_parallel_ufunc_issues.py -> build/bdist.linux-x86_64/wheel/numba/tests/npyufunc +copying build/lib.linux-x86_64-3.7/numba/tests/npyufunc/test_ufunc.py -> build/bdist.linux-x86_64/wheel/numba/tests/npyufunc +copying build/lib.linux-x86_64-3.7/numba/tests/npyufunc/test_ufuncbuilding.py -> build/bdist.linux-x86_64/wheel/numba/tests/npyufunc +copying build/lib.linux-x86_64-3.7/numba/tests/npyufunc/test_vectorize_decor.py -> build/bdist.linux-x86_64/wheel/numba/tests/npyufunc +creating build/bdist.linux-x86_64/wheel/numba/tests/pycc_distutils_usecase +copying build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase/__init__.py -> build/bdist.linux-x86_64/wheel/numba/tests/pycc_distutils_usecase +copying build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase/setup_distutils.py -> build/bdist.linux-x86_64/wheel/numba/tests/pycc_distutils_usecase +copying 
build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase/setup_distutils_nested.py -> build/bdist.linux-x86_64/wheel/numba/tests/pycc_distutils_usecase +copying build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase/setup_setuptools.py -> build/bdist.linux-x86_64/wheel/numba/tests/pycc_distutils_usecase +copying build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase/setup_setuptools_nested.py -> build/bdist.linux-x86_64/wheel/numba/tests/pycc_distutils_usecase +copying build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase/source_module.py -> build/bdist.linux-x86_64/wheel/numba/tests/pycc_distutils_usecase +creating build/bdist.linux-x86_64/wheel/numba/tests/pycc_distutils_usecase/nested +copying build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase/nested/__init__.py -> build/bdist.linux-x86_64/wheel/numba/tests/pycc_distutils_usecase/nested +copying build/lib.linux-x86_64-3.7/numba/tests/pycc_distutils_usecase/nested/source_module.py -> build/bdist.linux-x86_64/wheel/numba/tests/pycc_distutils_usecase/nested +creating build/bdist.linux-x86_64/wheel/numba/typed +copying build/lib.linux-x86_64-3.7/numba/typed/__init__.py -> build/bdist.linux-x86_64/wheel/numba/typed +copying build/lib.linux-x86_64-3.7/numba/typed/dictimpl.py -> build/bdist.linux-x86_64/wheel/numba/typed +copying build/lib.linux-x86_64-3.7/numba/typed/dictobject.py -> build/bdist.linux-x86_64/wheel/numba/typed +copying build/lib.linux-x86_64-3.7/numba/typed/listobject.py -> build/bdist.linux-x86_64/wheel/numba/typed +copying build/lib.linux-x86_64-3.7/numba/typed/typeddict.py -> build/bdist.linux-x86_64/wheel/numba/typed +copying build/lib.linux-x86_64-3.7/numba/typed/typedlist.py -> build/bdist.linux-x86_64/wheel/numba/typed +copying build/lib.linux-x86_64-3.7/numba/typed/typedobjectutils.py -> build/bdist.linux-x86_64/wheel/numba/typed +copying build/lib.linux-x86_64-3.7/numba/typed/py.typed -> build/bdist.linux-x86_64/wheel/numba/typed +creating 
build/bdist.linux-x86_64/wheel/numba/types +copying build/lib.linux-x86_64-3.7/numba/types/__init__.py -> build/bdist.linux-x86_64/wheel/numba/types +copying build/lib.linux-x86_64-3.7/numba/_dynfunc.c -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_dynfuncmod.c -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_hashtable.c -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_helperlib.c -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_helpermod.c -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_lapack.c -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_npymath_exports.c -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_random.c -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_typeof.c -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/mviewbuf.c -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_arraystruct.h -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_devicearray.h -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_hashtable.h -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_numba_common.h -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_pymodule.h -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_typeof.h -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_unicodetype_db.h -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/capsulethunk.h -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/mathnames.h -> build/bdist.linux-x86_64/wheel/numba +copying 
build/lib.linux-x86_64-3.7/numba/_dynfunc.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_dispatcher.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_helperlib.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/mviewbuf.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_devicearray.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/wheel/numba +copying build/lib.linux-x86_64-3.7/numba/_version.py -> build/bdist.linux-x86_64/wheel/numba +running install_egg_info +running egg_info +creating numba.egg-info +writing numba.egg-info/PKG-INFO +writing dependency_links to numba.egg-info/dependency_links.txt +writing requirements to numba.egg-info/requires.txt +writing top-level names to numba.egg-info/top_level.txt +writing manifest file 'numba.egg-info/SOURCES.txt' +reading manifest file 'numba.egg-info/SOURCES.txt' +reading manifest template 'MANIFEST.in' +warning: no files found matching '*.inc' under directory 'numba' +warning: no files found matching '*.ipynb' under directory 'docs' +warning: no files found matching '*.py' under directory 'examples' +no previously-included directories found matching 'docs/_build' +no previously-included directories found matching 'docs/gh-pages' +adding license file 'LICENSE' +adding license file 'LICENSES.third-party' +writing manifest file 'numba.egg-info/SOURCES.txt' +Copying numba.egg-info to build/bdist.linux-x86_64/wheel/numba-0.56.4+corex.20240111064751-py3.7.egg-info +running install_scripts +creating build/bdist.linux-x86_64/wheel/numba-0.56.4+corex.20240111064751.data +creating build/bdist.linux-x86_64/wheel/numba-0.56.4+corex.20240111064751.data/scripts +copying build/scripts-3.7/pycc -> 
build/bdist.linux-x86_64/wheel/numba-0.56.4+corex.20240111064751.data/scripts +copying build/scripts-3.7/numba -> build/bdist.linux-x86_64/wheel/numba-0.56.4+corex.20240111064751.data/scripts +changing mode of build/bdist.linux-x86_64/wheel/numba-0.56.4+corex.20240111064751.data/scripts/pycc to 755 +changing mode of build/bdist.linux-x86_64/wheel/numba-0.56.4+corex.20240111064751.data/scripts/numba to 755 +creating build/bdist.linux-x86_64/wheel/numba-0.56.4+corex.20240111064751.dist-info/WHEEL +creating 'build_pip/numba-0.56.4+corex.20240111064751-cp37-cp37m-linux_x86_64.whl' and adding 'build/bdist.linux-x86_64/wheel' to it +adding 'numba/__init__.py' +adding 'numba/__main__.py' +adding 'numba/_arraystruct.h' +adding 'numba/_devicearray.cpython-37m-x86_64-linux-gnu.so' +adding 'numba/_devicearray.h' +adding 'numba/_dispatcher.cpython-37m-x86_64-linux-gnu.so' +adding 'numba/_dynfunc.c' +adding 'numba/_dynfunc.cpython-37m-x86_64-linux-gnu.so' +adding 'numba/_dynfuncmod.c' +adding 'numba/_hashtable.c' +adding 'numba/_hashtable.h' +adding 'numba/_helperlib.c' +adding 'numba/_helperlib.cpython-37m-x86_64-linux-gnu.so' +adding 'numba/_helpermod.c' +adding 'numba/_lapack.c' +adding 'numba/_npymath_exports.c' +adding 'numba/_numba_common.h' +adding 'numba/_pymodule.h' +adding 'numba/_random.c' +adding 'numba/_typeof.c' +adding 'numba/_typeof.h' +adding 'numba/_unicodetype_db.h' +adding 'numba/_version.py' +adding 'numba/capsulethunk.h' +adding 'numba/extending.py' +adding 'numba/mathnames.h' +adding 'numba/mviewbuf.c' +adding 'numba/mviewbuf.cpython-37m-x86_64-linux-gnu.so' +adding 'numba/runtests.py' +adding 'numba/cext/__init__.py' +adding 'numba/cext/cext.h' +adding 'numba/cext/dictobject.c' +adding 'numba/cext/dictobject.h' +adding 'numba/cext/listobject.c' +adding 'numba/cext/listobject.h' +adding 'numba/cext/utils.c' +adding 'numba/cloudpickle/__init__.py' +adding 'numba/cloudpickle/cloudpickle.py' +adding 'numba/cloudpickle/cloudpickle_fast.py' +adding 
'numba/cloudpickle/compat.py' +adding 'numba/core/__init__.py' +adding 'numba/core/analysis.py' +adding 'numba/core/base.py' +adding 'numba/core/boxing.py' +adding 'numba/core/bytecode.py' +adding 'numba/core/byteflow.py' +adding 'numba/core/caching.py' +adding 'numba/core/callconv.py' +adding 'numba/core/callwrapper.py' +adding 'numba/core/ccallback.py' +adding 'numba/core/cgutils.py' +adding 'numba/core/codegen.py' +adding 'numba/core/compiler.py' +adding 'numba/core/compiler_lock.py' +adding 'numba/core/compiler_machinery.py' +adding 'numba/core/config.py' +adding 'numba/core/consts.py' +adding 'numba/core/controlflow.py' +adding 'numba/core/cpu.py' +adding 'numba/core/cpu_options.py' +adding 'numba/core/dataflow.py' +adding 'numba/core/debuginfo.py' +adding 'numba/core/decorators.py' +adding 'numba/core/descriptors.py' +adding 'numba/core/dispatcher.py' +adding 'numba/core/entrypoints.py' +adding 'numba/core/environment.py' +adding 'numba/core/errors.py' +adding 'numba/core/event.py' +adding 'numba/core/extending.py' +adding 'numba/core/externals.py' +adding 'numba/core/fastmathpass.py' +adding 'numba/core/funcdesc.py' +adding 'numba/core/generators.py' +adding 'numba/core/imputils.py' +adding 'numba/core/inline_closurecall.py' +adding 'numba/core/interpreter.py' +adding 'numba/core/intrinsics.py' +adding 'numba/core/ir.py' +adding 'numba/core/ir_utils.py' +adding 'numba/core/itanium_mangler.py' +adding 'numba/core/llvm_bindings.py' +adding 'numba/core/lowering.py' +adding 'numba/core/object_mode_passes.py' +adding 'numba/core/optional.py' +adding 'numba/core/options.py' +adding 'numba/core/overload_glue.py' +adding 'numba/core/postproc.py' +adding 'numba/core/pylowering.py' +adding 'numba/core/pythonapi.py' +adding 'numba/core/registry.py' +adding 'numba/core/removerefctpass.py' +adding 'numba/core/retarget.py' +adding 'numba/core/serialize.py' +adding 'numba/core/sigutils.py' +adding 'numba/core/ssa.py' +adding 'numba/core/target_extension.py' +adding 
'numba/core/targetconfig.py' +adding 'numba/core/tracing.py' +adding 'numba/core/transforms.py' +adding 'numba/core/typed_passes.py' +adding 'numba/core/typeinfer.py' +adding 'numba/core/untyped_passes.py' +adding 'numba/core/utils.py' +adding 'numba/core/withcontexts.py' +adding 'numba/core/annotations/__init__.py' +adding 'numba/core/annotations/pretty_annotate.py' +adding 'numba/core/annotations/template.html' +adding 'numba/core/annotations/type_annotations.py' +adding 'numba/core/datamodel/__init__.py' +adding 'numba/core/datamodel/manager.py' +adding 'numba/core/datamodel/models.py' +adding 'numba/core/datamodel/packer.py' +adding 'numba/core/datamodel/registry.py' +adding 'numba/core/datamodel/testing.py' +adding 'numba/core/rewrites/__init__.py' +adding 'numba/core/rewrites/ir_print.py' +adding 'numba/core/rewrites/registry.py' +adding 'numba/core/rewrites/static_binop.py' +adding 'numba/core/rewrites/static_getitem.py' +adding 'numba/core/rewrites/static_raise.py' +adding 'numba/core/runtime/__init__.py' +adding 'numba/core/runtime/_nrt_python.c' +adding 'numba/core/runtime/_nrt_python.cpython-37m-x86_64-linux-gnu.so' +adding 'numba/core/runtime/_nrt_pythonmod.c' +adding 'numba/core/runtime/context.py' +adding 'numba/core/runtime/nrt.c' +adding 'numba/core/runtime/nrt.h' +adding 'numba/core/runtime/nrt.py' +adding 'numba/core/runtime/nrt_external.h' +adding 'numba/core/runtime/nrtdynmod.py' +adding 'numba/core/runtime/nrtopt.py' +adding 'numba/core/typeconv/__init__.py' +adding 'numba/core/typeconv/_typeconv.cpython-37m-x86_64-linux-gnu.so' +adding 'numba/core/typeconv/castgraph.py' +adding 'numba/core/typeconv/rules.py' +adding 'numba/core/typeconv/typeconv.py' +adding 'numba/core/types/__init__.py' +adding 'numba/core/types/abstract.py' +adding 'numba/core/types/common.py' +adding 'numba/core/types/containers.py' +adding 'numba/core/types/function_type.py' +adding 'numba/core/types/functions.py' +adding 'numba/core/types/iterators.py' +adding 
'numba/core/types/misc.py' +adding 'numba/core/types/npytypes.py' +adding 'numba/core/types/scalars.py' +adding 'numba/core/typing/__init__.py' +adding 'numba/core/typing/arraydecl.py' +adding 'numba/core/typing/asnumbatype.py' +adding 'numba/core/typing/bufproto.py' +adding 'numba/core/typing/builtins.py' +adding 'numba/core/typing/cffi_utils.py' +adding 'numba/core/typing/cmathdecl.py' +adding 'numba/core/typing/collections.py' +adding 'numba/core/typing/context.py' +adding 'numba/core/typing/ctypes_utils.py' +adding 'numba/core/typing/dictdecl.py' +adding 'numba/core/typing/enumdecl.py' +adding 'numba/core/typing/listdecl.py' +adding 'numba/core/typing/mathdecl.py' +adding 'numba/core/typing/npdatetime.py' +adding 'numba/core/typing/npydecl.py' +adding 'numba/core/typing/randomdecl.py' +adding 'numba/core/typing/setdecl.py' +adding 'numba/core/typing/templates.py' +adding 'numba/core/typing/typeof.py' +adding 'numba/core/unsafe/__init__.py' +adding 'numba/core/unsafe/bytes.py' +adding 'numba/core/unsafe/eh.py' +adding 'numba/core/unsafe/nrt.py' +adding 'numba/core/unsafe/refcount.py' +adding 'numba/cpython/__init__.py' +adding 'numba/cpython/builtins.py' +adding 'numba/cpython/charseq.py' +adding 'numba/cpython/cmathimpl.py' +adding 'numba/cpython/enumimpl.py' +adding 'numba/cpython/hashing.py' +adding 'numba/cpython/heapq.py' +adding 'numba/cpython/iterators.py' +adding 'numba/cpython/listobj.py' +adding 'numba/cpython/mathimpl.py' +adding 'numba/cpython/numbers.py' +adding 'numba/cpython/printimpl.py' +adding 'numba/cpython/randomimpl.py' +adding 'numba/cpython/rangeobj.py' +adding 'numba/cpython/setobj.py' +adding 'numba/cpython/slicing.py' +adding 'numba/cpython/tupleobj.py' +adding 'numba/cpython/unicode.py' +adding 'numba/cpython/unicode_support.py' +adding 'numba/cpython/unsafe/__init__.py' +adding 'numba/cpython/unsafe/numbers.py' +adding 'numba/cpython/unsafe/tuple.py' +adding 'numba/cuda/__init__.py' +adding 'numba/cuda/api.py' +adding 
'numba/cuda/api_util.py' +adding 'numba/cuda/args.py' +adding 'numba/cuda/codegen.py' +adding 'numba/cuda/compiler.py' +adding 'numba/cuda/cuda_paths.py' +adding 'numba/cuda/cudadecl.py' +adding 'numba/cuda/cudaimpl.py' +adding 'numba/cuda/cudamath.py' +adding 'numba/cuda/decorators.py' +adding 'numba/cuda/descriptor.py' +adding 'numba/cuda/device_init.py' +adding 'numba/cuda/dispatcher.py' +adding 'numba/cuda/errors.py' +adding 'numba/cuda/initialize.py' +adding 'numba/cuda/intrinsic_wrapper.py' +adding 'numba/cuda/libdevice.py' +adding 'numba/cuda/libdevicedecl.py' +adding 'numba/cuda/libdevicefuncs.py' +adding 'numba/cuda/libdeviceimpl.py' +adding 'numba/cuda/mathimpl.py' +adding 'numba/cuda/models.py' +adding 'numba/cuda/nvvmutils.py' +adding 'numba/cuda/printimpl.py' +adding 'numba/cuda/random.py' +adding 'numba/cuda/simulator_init.py' +adding 'numba/cuda/stubs.py' +adding 'numba/cuda/target.py' +adding 'numba/cuda/testing.py' +adding 'numba/cuda/types.py' +adding 'numba/cuda/vector_types.py' +adding 'numba/cuda/vectorizers.py' +adding 'numba/cuda/cudadrv/__init__.py' +adding 'numba/cuda/cudadrv/_extras.cpython-37m-x86_64-linux-gnu.so' +adding 'numba/cuda/cudadrv/devicearray.py' +adding 'numba/cuda/cudadrv/devices.py' +adding 'numba/cuda/cudadrv/driver.py' +adding 'numba/cuda/cudadrv/drvapi.py' +adding 'numba/cuda/cudadrv/enums.py' +adding 'numba/cuda/cudadrv/error.py' +adding 'numba/cuda/cudadrv/libs.py' +adding 'numba/cuda/cudadrv/ndarray.py' +adding 'numba/cuda/cudadrv/nvvm.py' +adding 'numba/cuda/cudadrv/rtapi.py' +adding 'numba/cuda/cudadrv/runtime.py' +adding 'numba/cuda/kernels/__init__.py' +adding 'numba/cuda/kernels/reduction.py' +adding 'numba/cuda/kernels/transpose.py' +adding 'numba/cuda/simulator/__init__.py' +adding 'numba/cuda/simulator/api.py' +adding 'numba/cuda/simulator/compiler.py' +adding 'numba/cuda/simulator/kernel.py' +adding 'numba/cuda/simulator/kernelapi.py' +adding 'numba/cuda/simulator/reduction.py' +adding 
'numba/cuda/simulator/vector_types.py' +adding 'numba/cuda/simulator/cudadrv/__init__.py' +adding 'numba/cuda/simulator/cudadrv/devicearray.py' +adding 'numba/cuda/simulator/cudadrv/devices.py' +adding 'numba/cuda/simulator/cudadrv/driver.py' +adding 'numba/cuda/simulator/cudadrv/drvapi.py' +adding 'numba/cuda/simulator/cudadrv/error.py' +adding 'numba/cuda/simulator/cudadrv/libs.py' +adding 'numba/cuda/simulator/cudadrv/nvvm.py' +adding 'numba/cuda/simulator/cudadrv/runtime.py' +adding 'numba/cuda/tests/__init__.py' +adding 'numba/cuda/tests/cudadrv/__init__.py' +adding 'numba/cuda/tests/cudadrv/test_array_attr.py' +adding 'numba/cuda/tests/cudadrv/test_context_stack.py' +adding 'numba/cuda/tests/cudadrv/test_cuda_array_slicing.py' +adding 'numba/cuda/tests/cudadrv/test_cuda_auto_context.py' +adding 'numba/cuda/tests/cudadrv/test_cuda_devicerecord.py' +adding 'numba/cuda/tests/cudadrv/test_cuda_driver.py' +adding 'numba/cuda/tests/cudadrv/test_cuda_libraries.py' +adding 'numba/cuda/tests/cudadrv/test_cuda_memory.py' +adding 'numba/cuda/tests/cudadrv/test_cuda_ndarray.py' +adding 'numba/cuda/tests/cudadrv/test_deallocations.py' +adding 'numba/cuda/tests/cudadrv/test_detect.py' +adding 'numba/cuda/tests/cudadrv/test_emm_plugins.py' +adding 'numba/cuda/tests/cudadrv/test_events.py' +adding 'numba/cuda/tests/cudadrv/test_host_alloc.py' +adding 'numba/cuda/tests/cudadrv/test_init.py' +adding 'numba/cuda/tests/cudadrv/test_inline_ptx.py' +adding 'numba/cuda/tests/cudadrv/test_ir_patch.py' +adding 'numba/cuda/tests/cudadrv/test_linker.py' +adding 'numba/cuda/tests/cudadrv/test_managed_alloc.py' +adding 'numba/cuda/tests/cudadrv/test_nvvm_driver.py' +adding 'numba/cuda/tests/cudadrv/test_pinned.py' +adding 'numba/cuda/tests/cudadrv/test_profiler.py' +adding 'numba/cuda/tests/cudadrv/test_ptds.py' +adding 'numba/cuda/tests/cudadrv/test_reset_device.py' +adding 'numba/cuda/tests/cudadrv/test_runtime.py' +adding 'numba/cuda/tests/cudadrv/test_select_device.py' +adding 
'numba/cuda/tests/cudadrv/test_streams.py' +adding 'numba/cuda/tests/cudadrv/data/__init__.py' +adding 'numba/cuda/tests/cudadrv/data/cuda_include.cu' +adding 'numba/cuda/tests/cudadrv/data/error.cu' +adding 'numba/cuda/tests/cudadrv/data/jitlink.cu' +adding 'numba/cuda/tests/cudadrv/data/jitlink.ptx' +adding 'numba/cuda/tests/cudadrv/data/warn.cu' +adding 'numba/cuda/tests/cudapy/__init__.py' +adding 'numba/cuda/tests/cudapy/cache_usecases.py' +adding 'numba/cuda/tests/cudapy/cache_with_cpu_usecases.py' +adding 'numba/cuda/tests/cudapy/extensions_usecases.py' +adding 'numba/cuda/tests/cudapy/recursion_usecases.py' +adding 'numba/cuda/tests/cudapy/test_alignment.py' +adding 'numba/cuda/tests/cudapy/test_array.py' +adding 'numba/cuda/tests/cudapy/test_array_args.py' +adding 'numba/cuda/tests/cudapy/test_array_methods.py' +adding 'numba/cuda/tests/cudapy/test_atomics.py' +adding 'numba/cuda/tests/cudapy/test_blackscholes.py' +adding 'numba/cuda/tests/cudapy/test_boolean.py' +adding 'numba/cuda/tests/cudapy/test_caching.py' +adding 'numba/cuda/tests/cudapy/test_casting.py' +adding 'numba/cuda/tests/cudapy/test_compiler.py' +adding 'numba/cuda/tests/cudapy/test_complex.py' +adding 'numba/cuda/tests/cudapy/test_complex_kernel.py' +adding 'numba/cuda/tests/cudapy/test_const_string.py' +adding 'numba/cuda/tests/cudapy/test_constmem.py' +adding 'numba/cuda/tests/cudapy/test_cooperative_groups.py' +adding 'numba/cuda/tests/cudapy/test_cuda_array_interface.py' +adding 'numba/cuda/tests/cudapy/test_cuda_jit_no_types.py' +adding 'numba/cuda/tests/cudapy/test_datetime.py' +adding 'numba/cuda/tests/cudapy/test_debug.py' +adding 'numba/cuda/tests/cudapy/test_debuginfo.py' +adding 'numba/cuda/tests/cudapy/test_device_func.py' +adding 'numba/cuda/tests/cudapy/test_dispatcher.py' +adding 'numba/cuda/tests/cudapy/test_enums.py' +adding 'numba/cuda/tests/cudapy/test_errors.py' +adding 'numba/cuda/tests/cudapy/test_exception.py' +adding 'numba/cuda/tests/cudapy/test_extending.py' 
+adding 'numba/cuda/tests/cudapy/test_fastmath.py' +adding 'numba/cuda/tests/cudapy/test_forall.py' +adding 'numba/cuda/tests/cudapy/test_freevar.py' +adding 'numba/cuda/tests/cudapy/test_frexp_ldexp.py' +adding 'numba/cuda/tests/cudapy/test_globals.py' +adding 'numba/cuda/tests/cudapy/test_gufunc.py' +adding 'numba/cuda/tests/cudapy/test_gufunc_scalar.py' +adding 'numba/cuda/tests/cudapy/test_gufunc_scheduling.py' +adding 'numba/cuda/tests/cudapy/test_idiv.py' +adding 'numba/cuda/tests/cudapy/test_inspect.py' +adding 'numba/cuda/tests/cudapy/test_intrinsics.py' +adding 'numba/cuda/tests/cudapy/test_ipc.py' +adding 'numba/cuda/tests/cudapy/test_iterators.py' +adding 'numba/cuda/tests/cudapy/test_lang.py' +adding 'numba/cuda/tests/cudapy/test_laplace.py' +adding 'numba/cuda/tests/cudapy/test_libdevice.py' +adding 'numba/cuda/tests/cudapy/test_lineinfo.py' +adding 'numba/cuda/tests/cudapy/test_localmem.py' +adding 'numba/cuda/tests/cudapy/test_mandel.py' +adding 'numba/cuda/tests/cudapy/test_math.py' +adding 'numba/cuda/tests/cudapy/test_matmul.py' +adding 'numba/cuda/tests/cudapy/test_minmax.py' +adding 'numba/cuda/tests/cudapy/test_montecarlo.py' +adding 'numba/cuda/tests/cudapy/test_multigpu.py' +adding 'numba/cuda/tests/cudapy/test_multiprocessing.py' +adding 'numba/cuda/tests/cudapy/test_multithreads.py' +adding 'numba/cuda/tests/cudapy/test_nondet.py' +adding 'numba/cuda/tests/cudapy/test_operator.py' +adding 'numba/cuda/tests/cudapy/test_optimization.py' +adding 'numba/cuda/tests/cudapy/test_overload.py' +adding 'numba/cuda/tests/cudapy/test_powi.py' +adding 'numba/cuda/tests/cudapy/test_print.py' +adding 'numba/cuda/tests/cudapy/test_py2_div_issue.py' +adding 'numba/cuda/tests/cudapy/test_random.py' +adding 'numba/cuda/tests/cudapy/test_record_dtype.py' +adding 'numba/cuda/tests/cudapy/test_recursion.py' +adding 'numba/cuda/tests/cudapy/test_reduction.py' +adding 'numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py' +adding 
'numba/cuda/tests/cudapy/test_serialize.py' +adding 'numba/cuda/tests/cudapy/test_slicing.py' +adding 'numba/cuda/tests/cudapy/test_sm.py' +adding 'numba/cuda/tests/cudapy/test_sm_creation.py' +adding 'numba/cuda/tests/cudapy/test_sync.py' +adding 'numba/cuda/tests/cudapy/test_transpose.py' +adding 'numba/cuda/tests/cudapy/test_userexc.py' +adding 'numba/cuda/tests/cudapy/test_vector_type.py' +adding 'numba/cuda/tests/cudapy/test_vectorize.py' +adding 'numba/cuda/tests/cudapy/test_vectorize_complex.py' +adding 'numba/cuda/tests/cudapy/test_vectorize_decor.py' +adding 'numba/cuda/tests/cudapy/test_vectorize_device.py' +adding 'numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py' +adding 'numba/cuda/tests/cudapy/test_warning.py' +adding 'numba/cuda/tests/cudapy/test_warp_ops.py' +adding 'numba/cuda/tests/cudasim/__init__.py' +adding 'numba/cuda/tests/cudasim/support.py' +adding 'numba/cuda/tests/cudasim/test_cudasim_issues.py' +adding 'numba/cuda/tests/doc_examples/__init__.py' +adding 'numba/cuda/tests/doc_examples/test_cg.py' +adding 'numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py' +adding 'numba/cuda/tests/doc_examples/test_ffi.py' +adding 'numba/cuda/tests/doc_examples/test_laplace.py' +adding 'numba/cuda/tests/doc_examples/test_matmul.py' +adding 'numba/cuda/tests/doc_examples/test_montecarlo.py' +adding 'numba/cuda/tests/doc_examples/test_random.py' +adding 'numba/cuda/tests/doc_examples/test_reduction.py' +adding 'numba/cuda/tests/doc_examples/test_sessionize.py' +adding 'numba/cuda/tests/doc_examples/test_vecadd.py' +adding 'numba/cuda/tests/doc_examples/ffi/__init__.py' +adding 'numba/cuda/tests/doc_examples/ffi/functions.cu' +adding 'numba/cuda/tests/nocuda/__init__.py' +adding 'numba/cuda/tests/nocuda/test_import.py' +adding 'numba/cuda/tests/nocuda/test_library_lookup.py' +adding 'numba/cuda/tests/nocuda/test_nvvm.py' +adding 'numba/experimental/__init__.py' +adding 'numba/experimental/function_type.py' +adding 'numba/experimental/structref.py' 
+adding 'numba/experimental/jitclass/__init__.py' +adding 'numba/experimental/jitclass/_box.cpython-37m-x86_64-linux-gnu.so' +adding 'numba/experimental/jitclass/base.py' +adding 'numba/experimental/jitclass/boxing.py' +adding 'numba/experimental/jitclass/decorators.py' +adding 'numba/experimental/jitclass/overloads.py' +adding 'numba/misc/__init__.py' +adding 'numba/misc/appdirs.py' +adding 'numba/misc/cffiimpl.py' +adding 'numba/misc/cmdlang.gdb' +adding 'numba/misc/dummyarray.py' +adding 'numba/misc/dump_style.py' +adding 'numba/misc/findlib.py' +adding 'numba/misc/firstlinefinder.py' +adding 'numba/misc/gdb_hook.py' +adding 'numba/misc/gdb_print_extension.py' +adding 'numba/misc/init_utils.py' +adding 'numba/misc/inspection.py' +adding 'numba/misc/literal.py' +adding 'numba/misc/llvm_pass_timings.py' +adding 'numba/misc/mergesort.py' +adding 'numba/misc/numba_entry.py' +adding 'numba/misc/numba_gdbinfo.py' +adding 'numba/misc/numba_sysinfo.py' +adding 'numba/misc/quicksort.py' +adding 'numba/misc/special.py' +adding 'numba/misc/timsort.py' +adding 'numba/misc/help/__init__.py' +adding 'numba/misc/help/inspector.py' +adding 'numba/np/__init__.py' +adding 'numba/np/arraymath.py' +adding 'numba/np/arrayobj.py' +adding 'numba/np/extensions.py' +adding 'numba/np/linalg.py' +adding 'numba/np/npdatetime.py' +adding 'numba/np/npdatetime_helpers.py' +adding 'numba/np/npyfuncs.py' +adding 'numba/np/npyimpl.py' +adding 'numba/np/numpy_support.py' +adding 'numba/np/polynomial.py' +adding 'numba/np/ufunc_db.py' +adding 'numba/np/random/__init__.py' +adding 'numba/np/random/generator_core.py' +adding 'numba/np/random/generator_methods.py' +adding 'numba/np/ufunc/__init__.py' +adding 'numba/np/ufunc/_internal.cpython-37m-x86_64-linux-gnu.so' +adding 'numba/np/ufunc/_num_threads.cpython-37m-x86_64-linux-gnu.so' +adding 'numba/np/ufunc/array_exprs.py' +adding 'numba/np/ufunc/decorators.py' +adding 'numba/np/ufunc/deviceufunc.py' +adding 'numba/np/ufunc/dufunc.py' +adding 
'numba/np/ufunc/gufunc.py' +adding 'numba/np/ufunc/omppool.cpython-37m-x86_64-linux-gnu.so' +adding 'numba/np/ufunc/parallel.py' +adding 'numba/np/ufunc/sigparse.py' +adding 'numba/np/ufunc/ufuncbuilder.py' +adding 'numba/np/ufunc/workqueue.cpython-37m-x86_64-linux-gnu.so' +adding 'numba/np/ufunc/wrappers.py' +adding 'numba/np/unsafe/__init__.py' +adding 'numba/np/unsafe/ndarray.py' +adding 'numba/parfors/__init__.py' +adding 'numba/parfors/array_analysis.py' +adding 'numba/parfors/parfor.py' +adding 'numba/parfors/parfor_lowering.py' +adding 'numba/parfors/parfor_lowering_utils.py' +adding 'numba/pycc/__init__.py' +adding 'numba/pycc/cc.py' +adding 'numba/pycc/compiler.py' +adding 'numba/pycc/decorators.py' +adding 'numba/pycc/llvm_types.py' +adding 'numba/pycc/modulemixin.c' +adding 'numba/pycc/platform.py' +adding 'numba/scripts/__init__.py' +adding 'numba/scripts/generate_lower_listing.py' +adding 'numba/stencils/__init__.py' +adding 'numba/stencils/stencil.py' +adding 'numba/stencils/stencilparfor.py' +adding 'numba/testing/__init__.py' +adding 'numba/testing/__main__.py' +adding 'numba/testing/_runtests.py' +adding 'numba/testing/loader.py' +adding 'numba/testing/main.py' +adding 'numba/testing/notebook.py' +adding 'numba/tests/__init__.py' +adding 'numba/tests/annotation_usecases.py' +adding 'numba/tests/cache_usecases.py' +adding 'numba/tests/cffi_usecases.py' +adding 'numba/tests/cfunc_cache_usecases.py' +adding 'numba/tests/cloudpickle_main_class.py' +adding 'numba/tests/compile_with_pycc.py' +adding 'numba/tests/complex_usecases.py' +adding 'numba/tests/ctypes_usecases.py' +adding 'numba/tests/dummy_module.py' +adding 'numba/tests/enum_usecases.py' +adding 'numba/tests/error_usecases.py' +adding 'numba/tests/gdb_support.py' +adding 'numba/tests/inlining_usecases.py' +adding 'numba/tests/matmul_usecase.py' +adding 'numba/tests/orphaned_semaphore_usecase.py' +adding 'numba/tests/overload_usecases.py' +adding 'numba/tests/parfors_cache_usecases.py' +adding 
'numba/tests/parfors_max_label_error.py' +adding 'numba/tests/pdlike_usecase.py' +adding 'numba/tests/recursion_usecases.py' +adding 'numba/tests/serialize_usecases.py' +adding 'numba/tests/support.py' +adding 'numba/tests/test_alignment.py' +adding 'numba/tests/test_analysis.py' +adding 'numba/tests/test_annotations.py' +adding 'numba/tests/test_api.py' +adding 'numba/tests/test_array_analysis.py' +adding 'numba/tests/test_array_attr.py' +adding 'numba/tests/test_array_constants.py' +adding 'numba/tests/test_array_exprs.py' +adding 'numba/tests/test_array_iterators.py' +adding 'numba/tests/test_array_manipulation.py' +adding 'numba/tests/test_array_methods.py' +adding 'numba/tests/test_array_reductions.py' +adding 'numba/tests/test_array_return.py' +adding 'numba/tests/test_asnumbatype.py' +adding 'numba/tests/test_auto_constants.py' +adding 'numba/tests/test_blackscholes.py' +adding 'numba/tests/test_boundscheck.py' +adding 'numba/tests/test_buffer_protocol.py' +adding 'numba/tests/test_builtins.py' +adding 'numba/tests/test_byteflow.py' +adding 'numba/tests/test_caching.py' +adding 'numba/tests/test_casting.py' +adding 'numba/tests/test_cffi.py' +adding 'numba/tests/test_cfunc.py' +adding 'numba/tests/test_cgutils.py' +adding 'numba/tests/test_chained_assign.py' +adding 'numba/tests/test_chrome_trace.py' +adding 'numba/tests/test_cli.py' +adding 'numba/tests/test_closure.py' +adding 'numba/tests/test_codegen.py' +adding 'numba/tests/test_compile_cache.py' +adding 'numba/tests/test_compiler_flags.py' +adding 'numba/tests/test_compiler_lock.py' +adding 'numba/tests/test_complex.py' +adding 'numba/tests/test_comprehension.py' +adding 'numba/tests/test_conditions_as_predicates.py' +adding 'numba/tests/test_config.py' +adding 'numba/tests/test_conversion.py' +adding 'numba/tests/test_copy_propagate.py' +adding 'numba/tests/test_ctypes.py' +adding 'numba/tests/test_dataflow.py' +adding 'numba/tests/test_datamodel.py' +adding 'numba/tests/test_debug.py' +adding 
'numba/tests/test_debuginfo.py' +adding 'numba/tests/test_deprecations.py' +adding 'numba/tests/test_dictimpl.py' +adding 'numba/tests/test_dictobject.py' +adding 'numba/tests/test_dicts.py' +adding 'numba/tests/test_dispatcher.py' +adding 'numba/tests/test_dummyarray.py' +adding 'numba/tests/test_dyn_array.py' +adding 'numba/tests/test_dyn_func.py' +adding 'numba/tests/test_entrypoints.py' +adding 'numba/tests/test_enums.py' +adding 'numba/tests/test_errorhandling.py' +adding 'numba/tests/test_errormodels.py' +adding 'numba/tests/test_event.py' +adding 'numba/tests/test_exceptions.py' +adding 'numba/tests/test_extended_arg.py' +adding 'numba/tests/test_extending.py' +adding 'numba/tests/test_extending_types.py' +adding 'numba/tests/test_fancy_indexing.py' +adding 'numba/tests/test_fastmath.py' +adding 'numba/tests/test_firstlinefinder.py' +adding 'numba/tests/test_flow_control.py' +adding 'numba/tests/test_func_interface.py' +adding 'numba/tests/test_func_lifetime.py' +adding 'numba/tests/test_funcdesc.py' +adding 'numba/tests/test_function_type.py' +adding 'numba/tests/test_gdb_bindings.py' +adding 'numba/tests/test_gdb_dwarf.py' +adding 'numba/tests/test_generators.py' +adding 'numba/tests/test_gil.py' +adding 'numba/tests/test_globals.py' +adding 'numba/tests/test_hashing.py' +adding 'numba/tests/test_heapq.py' +adding 'numba/tests/test_help.py' +adding 'numba/tests/test_import.py' +adding 'numba/tests/test_indexing.py' +adding 'numba/tests/test_init_utils.py' +adding 'numba/tests/test_inlining.py' +adding 'numba/tests/test_interpreter.py' +adding 'numba/tests/test_interproc.py' +adding 'numba/tests/test_intwidth.py' +adding 'numba/tests/test_ir.py' +adding 'numba/tests/test_ir_inlining.py' +adding 'numba/tests/test_ir_utils.py' +adding 'numba/tests/test_itanium_mangler.py' +adding 'numba/tests/test_iteration.py' +adding 'numba/tests/test_jit_module.py' +adding 'numba/tests/test_jitclasses.py' +adding 'numba/tests/test_jitmethod.py' +adding 
'numba/tests/test_linalg.py' +adding 'numba/tests/test_listimpl.py' +adding 'numba/tests/test_listobject.py' +adding 'numba/tests/test_lists.py' +adding 'numba/tests/test_literal_dispatch.py' +adding 'numba/tests/test_llvm_pass_timings.py' +adding 'numba/tests/test_llvm_version_check.py' +adding 'numba/tests/test_locals.py' +adding 'numba/tests/test_looplifting.py' +adding 'numba/tests/test_make_function_to_jit_function.py' +adding 'numba/tests/test_mandelbrot.py' +adding 'numba/tests/test_mangling.py' +adding 'numba/tests/test_map_filter_reduce.py' +adding 'numba/tests/test_mathlib.py' +adding 'numba/tests/test_maxmin.py' +adding 'numba/tests/test_mixed_tuple_unroller.py' +adding 'numba/tests/test_moved_modules.py' +adding 'numba/tests/test_multi3.py' +adding 'numba/tests/test_nan.py' +adding 'numba/tests/test_ndarray_subclasses.py' +adding 'numba/tests/test_nested_calls.py' +adding 'numba/tests/test_np_functions.py' +adding 'numba/tests/test_np_randomgen.py' +adding 'numba/tests/test_npdatetime.py' +adding 'numba/tests/test_nrt.py' +adding 'numba/tests/test_nrt_refct.py' +adding 'numba/tests/test_num_threads.py' +adding 'numba/tests/test_numberctor.py' +adding 'numba/tests/test_numbers.py' +adding 'numba/tests/test_numconv.py' +adding 'numba/tests/test_numpy_support.py' +adding 'numba/tests/test_numpyadapt.py' +adding 'numba/tests/test_obj_lifetime.py' +adding 'numba/tests/test_object_mode.py' +adding 'numba/tests/test_objects.py' +adding 'numba/tests/test_operators.py' +adding 'numba/tests/test_optional.py' +adding 'numba/tests/test_overlap.py' +adding 'numba/tests/test_parallel_backend.py' +adding 'numba/tests/test_parfors.py' +adding 'numba/tests/test_parfors_caching.py' +adding 'numba/tests/test_parfors_passes.py' +adding 'numba/tests/test_pipeline.py' +adding 'numba/tests/test_polynomial.py' +adding 'numba/tests/test_practical_lowering_issues.py' +adding 'numba/tests/test_print.py' +adding 'numba/tests/test_profiler.py' +adding 'numba/tests/test_pycc.py' 
+adding 'numba/tests/test_python_int.py' +adding 'numba/tests/test_random.py' +adding 'numba/tests/test_range.py' +adding 'numba/tests/test_recarray_usecases.py' +adding 'numba/tests/test_record_dtype.py' +adding 'numba/tests/test_recursion.py' +adding 'numba/tests/test_refop_pruning.py' +adding 'numba/tests/test_remove_dead.py' +adding 'numba/tests/test_retargeting.py' +adding 'numba/tests/test_return_values.py' +adding 'numba/tests/test_runtests.py' +adding 'numba/tests/test_serialize.py' +adding 'numba/tests/test_sets.py' +adding 'numba/tests/test_slices.py' +adding 'numba/tests/test_sort.py' +adding 'numba/tests/test_ssa.py' +adding 'numba/tests/test_stencils.py' +adding 'numba/tests/test_storeslice.py' +adding 'numba/tests/test_struct_ref.py' +adding 'numba/tests/test_support.py' +adding 'numba/tests/test_svml.py' +adding 'numba/tests/test_sys_stdin_assignment.py' +adding 'numba/tests/test_sysinfo.py' +adding 'numba/tests/test_target_extension.py' +adding 'numba/tests/test_target_overloadselector.py' +adding 'numba/tests/test_threadsafety.py' +adding 'numba/tests/test_tracing.py' +adding 'numba/tests/test_try_except.py' +adding 'numba/tests/test_tuples.py' +adding 'numba/tests/test_typeconv.py' +adding 'numba/tests/test_typedlist.py' +adding 'numba/tests/test_typedobjectutils.py' +adding 'numba/tests/test_typeguard.py' +adding 'numba/tests/test_typeinfer.py' +adding 'numba/tests/test_typenames.py' +adding 'numba/tests/test_typeof.py' +adding 'numba/tests/test_types.py' +adding 'numba/tests/test_typingerror.py' +adding 'numba/tests/test_ufuncs.py' +adding 'numba/tests/test_unicode.py' +adding 'numba/tests/test_unicode_array.py' +adding 'numba/tests/test_unicode_names.py' +adding 'numba/tests/test_unpack_sequence.py' +adding 'numba/tests/test_unpickle_without_module.py' +adding 'numba/tests/test_unsafe_intrinsics.py' +adding 'numba/tests/test_usecases.py' +adding 'numba/tests/test_vectorization.py' +adding 'numba/tests/test_vectorization_type_inference.py' 
+adding 'numba/tests/test_warnings.py' +adding 'numba/tests/test_withlifting.py' +adding 'numba/tests/test_wrapper.py' +adding 'numba/tests/threading_backend_usecases.py' +adding 'numba/tests/usecases.py' +adding 'numba/tests/doc_examples/__init__.py' +adding 'numba/tests/doc_examples/test_examples.py' +adding 'numba/tests/doc_examples/test_jitclass.py' +adding 'numba/tests/doc_examples/test_literal_container_usage.py' +adding 'numba/tests/doc_examples/test_literally_usage.py' +adding 'numba/tests/doc_examples/test_llvm_pass_timings.py' +adding 'numba/tests/doc_examples/test_numpy_generators.py' +adding 'numba/tests/doc_examples/test_parallel_chunksize.py' +adding 'numba/tests/doc_examples/test_rec_array.py' +adding 'numba/tests/doc_examples/test_structref_usage.py' +adding 'numba/tests/doc_examples/test_typed_dict_usage.py' +adding 'numba/tests/doc_examples/test_typed_list_usage.py' +adding 'numba/tests/gdb/__init__.py' +adding 'numba/tests/gdb/test_array_arg.py' +adding 'numba/tests/gdb/test_basic.py' +adding 'numba/tests/gdb/test_break_on_symbol.py' +adding 'numba/tests/gdb/test_break_on_symbol_version.py' +adding 'numba/tests/gdb/test_conditional_breakpoint.py' +adding 'numba/tests/gdb/test_pretty_print.py' +adding 'numba/tests/npyufunc/__init__.py' +adding 'numba/tests/npyufunc/cache_usecases.py' +adding 'numba/tests/npyufunc/test_caching.py' +adding 'numba/tests/npyufunc/test_dufunc.py' +adding 'numba/tests/npyufunc/test_errors.py' +adding 'numba/tests/npyufunc/test_gufunc.py' +adding 'numba/tests/npyufunc/test_parallel_env_variable.py' +adding 'numba/tests/npyufunc/test_parallel_low_work.py' +adding 'numba/tests/npyufunc/test_parallel_ufunc_issues.py' +adding 'numba/tests/npyufunc/test_ufunc.py' +adding 'numba/tests/npyufunc/test_ufuncbuilding.py' +adding 'numba/tests/npyufunc/test_vectorize_decor.py' +adding 'numba/tests/pycc_distutils_usecase/__init__.py' +adding 'numba/tests/pycc_distutils_usecase/setup_distutils.py' +adding 
'numba/tests/pycc_distutils_usecase/setup_distutils_nested.py' +adding 'numba/tests/pycc_distutils_usecase/setup_setuptools.py' +adding 'numba/tests/pycc_distutils_usecase/setup_setuptools_nested.py' +adding 'numba/tests/pycc_distutils_usecase/source_module.py' +adding 'numba/tests/pycc_distutils_usecase/nested/__init__.py' +adding 'numba/tests/pycc_distutils_usecase/nested/source_module.py' +adding 'numba/typed/__init__.py' +adding 'numba/typed/dictimpl.py' +adding 'numba/typed/dictobject.py' +adding 'numba/typed/listobject.py' +adding 'numba/typed/py.typed' +adding 'numba/typed/typeddict.py' +adding 'numba/typed/typedlist.py' +adding 'numba/typed/typedobjectutils.py' +adding 'numba/types/__init__.py' +adding 'numba-0.56.4+corex.20240111064751.data/scripts/numba' +adding 'numba-0.56.4+corex.20240111064751.data/scripts/pycc' +adding 'numba-0.56.4+corex.20240111064751.dist-info/LICENSE' +adding 'numba-0.56.4+corex.20240111064751.dist-info/LICENSES.third-party' +adding 'numba-0.56.4+corex.20240111064751.dist-info/METADATA' +adding 'numba-0.56.4+corex.20240111064751.dist-info/WHEEL' +adding 'numba-0.56.4+corex.20240111064751.dist-info/top_level.txt' +adding 'numba-0.56.4+corex.20240111064751.dist-info/RECORD' +removing build/bdist.linux-x86_64/wheel +/usr/local/lib/python3.7/site-packages/setuptools/dist.py:493: UserWarning: Normalizing 'v0.56.4+corex.20240111064751' to '0.56.4+corex.20240111064751' + warnings.warn(tmpl.format(**locals())) +/usr/local/lib/python3.7/site-packages/setuptools/command/install.py:37: SetuptoolsDeprecationWarning: setup.py install is deprecated. Use build and pip and other standards-based tools. 
+ setuptools.SetuptoolsDeprecationWarning, diff --git a/cv/3d_detection/centerpoint/pytorch/numba/contrib/valgrind-numba.supp b/cv/3d_detection/centerpoint/pytorch/numba/contrib/valgrind-numba.supp new file mode 100644 index 0000000000000000000000000000000000000000..26271eb4ef0f6b82ab132f515236abd0241a039f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/contrib/valgrind-numba.supp @@ -0,0 +1,21 @@ +{ + + Memcheck:Cond + fun:_ZN4llvm3sys14getHostCPUNameEv + fun:LLVMPY_GetHostCPUName +} + +{ + + Memcheck:Value8 + fun:_ZN4llvm3sys14getHostCPUNameEv + fun:LLVMPY_GetHostCPUName +} + +{ + + Memcheck:Cond + fun:__intel_sse2_strrchr + fun:_ZN67_INTERNAL_45_______src_thirdparty_tbb_omp_dynamic_link_cpp_c306cade5__kmp12init_dl_dataEv + fun:__sti__$E +} diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/Makefile b/cv/3d_detection/centerpoint/pytorch/numba/docs/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..b60d7c1d69bc8e50b262a0c55c7ad930f456cf6e --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/Makefile @@ -0,0 +1,177 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = -j1 +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. 
+PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
+ +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Numba.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Numba.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/Numba" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Numba" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." 
+ +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." 
+ +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/_static/js/modernizr.min.js b/cv/3d_detection/centerpoint/pytorch/numba/docs/_static/js/modernizr.min.js new file mode 100644 index 0000000000000000000000000000000000000000..939eaaf034c5f9a5c506711e4942a5255e85aded --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/_static/js/modernizr.min.js @@ -0,0 +1,7 @@ +/* modernizr.min.js is unused but causes a reflow on load. In firefox, this + * manifests as the Numba logo flashing up across the whole browser window for a + * split second every time the page is loaded or a documentation link is + * clicked. This empty file overrides the version included by the theme. + * + * Reference: https://github.com/readthedocs/sphinx_rtd_theme/issues/724 + */ diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/_static/numba-blue-icon-rgb.svg b/cv/3d_detection/centerpoint/pytorch/numba/docs/_static/numba-blue-icon-rgb.svg new file mode 100644 index 0000000000000000000000000000000000000000..0df9c042e3258c13c50b0434081b62f86284137c --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/_static/numba-blue-icon-rgb.svg @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/_static/numba-white-icon-rgb.svg b/cv/3d_detection/centerpoint/pytorch/numba/docs/_static/numba-white-icon-rgb.svg new file mode 100644 index 0000000000000000000000000000000000000000..904b61c13f4812452280dc39a715b8e02e679a7e --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/_static/numba-white-icon-rgb.svg @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + diff --git 
a/cv/3d_detection/centerpoint/pytorch/numba/docs/_static/rtd-overrides.css b/cv/3d_detection/centerpoint/pytorch/numba/docs/_static/rtd-overrides.css new file mode 100644 index 0000000000000000000000000000000000000000..50c71cdb925f9f286cefefa4b9130c5dd6e905ec --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/_static/rtd-overrides.css @@ -0,0 +1,3 @@ +.wy-nav-content { + max-width: 1200px +} diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/_templates/EMPTY b/cv/3d_detection/centerpoint/pytorch/numba/docs/_templates/EMPTY new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/README.md b/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/README.md new file mode 100644 index 0000000000000000000000000000000000000000..66fd85853c45a538cb2561105ad4b42e934fd4bb --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/README.md @@ -0,0 +1,57 @@ +# DAG Roadmap + +This directory includes a representation of the Numba roadmap in the form of a +DAG. We have done this to enable a highly granular display of enhancements to +Numba that also shows the relationships between these tasks. Many tasks have +prerequisites, and we've found that issue trackers, Kanban boards, and +time-bucketed roadmap documentation all fail to represent this information in +different ways. + +## Requirements + +``` +conda install jinja2 python-graphviz pyyaml +``` + +## Usage + +``` +./render.py -o dagmap.html dagmap.yaml +``` + +The generated HTML file will look for `jquery.graphviz.svg.js` in the same +directory. + +## Updating the DAG + +Copy one of the existing tasks and edit: + * `label`: text appears on the node. Embed `\n` for line breaks. + * `id`: Referenced to indicate a dependency + * `description`: Shown in the tooltip. Automatically word-wrapped. + * `depends_on`: Optional list of task IDs which this task depends on. 
+ +The `style` section of the file is not used yet. + +## Notes + +The HTML rendering of the graph is based on a slightly modified version of +[jquery.graphviz.svg](https://github.com/mountainstorm/jquery.graphviz.svg/). +Its license is: +``` +Copyright (c) 2015 Mountainstorm +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` \ No newline at end of file diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/dagmap.yaml b/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/dagmap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57df7b7550bda91f4f5c8ab5c03f5ed55acf117d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/dagmap.yaml @@ -0,0 +1,195 @@ +meta: + version: 1 +style: + tags: + performance: + border: red +tasks: + - label: Track allocations in functions + id: track_alloc + description: | + Maintain a list of allocations inside each function which can be used + for freeing things on return, and also for debugging memory usage. 
+ + - label: Catch exceptions + id: catch_exceptions + description: | + Allow exceptions raised in nopython mode to be caught in nopython mode. + depends_on: + - track_alloc + + + - label: New IR + id: new_ir + description: | + New intermediate representation for Numba that is backed by a dictionary + + - label: New Type Matching DSL + id: type_matching + description: | + Replace the current DSL for Numba types with something more expressive + that can match type patterns + + - label: Declarative type signatures\nfor @overload/@overload_method + id: declarative_overload + description: | + Replace the current DSL for Numba types with something more expressive\n + that can match type patterns + depends_on: + - type_matching + + - label: Rewrite "old-style" implementations + id: rewrite_old_impls + description: | + Rewrite implementations of functions that use the old extension API that + separates typing from implementation, and often uses the LLVM builder + unnecessarily. + depends_on: + - declarative_overload + - improve_test_suite_tooling + - faster_pr_testing + + - label: Unify and add more test suite tooling + id: improve_test_suite_tooling + description: | + Add tools to help with common patterns in testing and unify the ones we + have, there's no need for 12 spellings of "is this Python 3" Also decide + on "what to test", do all types need testing if inputs are being + "as_array"'d? 
+ + - label: Pipeline pass formalisation + id: pass_formalisation + description: | + Decide on a formal description of a compiler pass and create supporting + code for it + + - label: Array expression fusion pass + id: new_array_expr_fusion_pass + description: + From parfors extract out the array expression fusion pass + depends_on: + - parfors_clean_up + - pass_formalisation + + - label: LICM Pass + id: new_licm_pass + description: | + Create a LICM Pass + depends_on: + - parfors_clean_up + - pass_formalisation + + - label: Clean up Parfors + id: parfors_clean_up + description: | + General clean up and refactoring of parfors ahead of any additional work + + - label: Mode based pipeline + id: mode_based_pipeline + description: | + Switch the jit decorator to use a mode based pipeline with + `nopython=True` equivalent as default. + + - label: Remove object mode fallback + id: remove_objmode_fallback + description: | + Remove the deprecated object mode fallback + depends_on: + - mode_based_pipeline + + - label: Switch to ORC JIT + id: orc_jit + description: | + MCJIT has been deprecated for some time. Need to switch to the newer + ORC JIT class. + + - label: Performance analysis suite + id: perform_analysis_suite + description: | + Meta task for all performance analysis related functionality + depends_on: + - line_profiling + - assembly_analysis_tooling + - vectorisation_analysis + + - label: Vectorisation analysis + id: vectorisation_analysis + description: | + Obtain LLVMs vectorisation reports and present these in a user friendly + manner + + - label: Line profiling + id: line_profiling + description: | + Support collection of profiling statistics from compiled machine code + and map back to lines of Python. 
+ depends_on: + - orc_jit + - assembly_analysis_tooling + + - label: Assembly analysis tooling + id: assembly_analysis_tooling + description: | + Tie generated assembly back to python lines and annotate instruction + quality + depends_on: + - capstone + + - label: Build capstone against llvmdev + id: capstone + description: | + Build capstone against llvmdev and create conda packages/wheels + + - label: Increase JIT class method performance + id: jit_class_method_performance + description: | + Increase the performance of jitclass methods + depends_on: + - llvm_ref_count_pruning + - new_licm_pass + + - label: LLVM level ref count pruning + id: llvm_ref_count_pruning + description: | + Add a LLVM compiler pass to prune refcounts across entire functions + + - label: JITted coverage information + id: jitted_coverage_info + description: | + Work out how to leverage gcov support in LLVM to enable coverage + information + depends_on: + - compiler_rt + + - label: LLVM compiler_rt support + id: compiler_rt + description: | + Work out how to build compiler_rt into LLVM and how to use it in Numba + + - label: Switch to pytest + id: pytest + description: | + Make it possible to use pytest as test runner for Numba + + - label: Option to run modified tests only + id: run_new_tests + description: | + Use / make pytest plugin to detect all test files which are new / + changed relative to a given branch, and run only those tests + depends_on: + - pytest + + - label: Option to run 1/N slice of tests + id: run_test_slice + description: | + Use / make pytest plugin to run 1/N of enumerated tests. + depends_on: + - pytest + + - label: Faster PR testing + id: faster_pr_testing + description: | + Make automated PR testing with public CI services give faster feedback. 
+ depends_on: + - run_new_tests + - run_test_slice diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/jquery.graphviz.svg.js b/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/jquery.graphviz.svg.js new file mode 100644 index 0000000000000000000000000000000000000000..15bea27bd0d9204e633e83fc8efa2eb1d027ee4f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/jquery.graphviz.svg.js @@ -0,0 +1,537 @@ +/* + * Copyright (c) 2015 Mountainstorm + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + + +function ($) { + 'use strict' + + // Cross Browser starts/endsWith support + // ===================================== + String.prototype.startsWith = function(prefix) { + return this.indexOf(prefix) == 0; + }; + + String.prototype.endsWith = function(suffix) { + return this.indexOf(suffix, this.length - suffix.length) !== -1; + }; + + // GRAPHVIZSVG PUBLIC CLASS DEFINITION + // =================================== + + var GraphvizSvg = function (element, options) { + this.type = null + this.options = null + this.enabled = null + this.$element = null + + this.init('graphviz.svg', element, options) + } + + GraphvizSvg.VERSION = '1.0.1' + + GraphvizSvg.GVPT_2_PX = 32.5 // used to ease removal of extra space + + GraphvizSvg.DEFAULTS = { + url: null, + svg: null, + shrink: '0.125pt', + tooltips: { + init: function ($graph) { + var $a = $(this) + $a.tooltip({ + container: $graph, + placement: 'auto left', + animation: false, + viewport: null + }).on('hide.bs.tooltip', function() { + // keep them visible even if you accidentally mouse over + if ($a.attr('data-tooltip-keepvisible')) { + return false + } + }) + }, + show: function () { + var $a = $(this) + $a.attr('data-tooltip-keepvisible', true) + $a.tooltip('show') + }, + hide: function () { + var $a = $(this) + $a.removeAttr('data-tooltip-keepvisible') + $a.tooltip('hide') + }, + update: function () { + var $this = $(this) + if ($this.attr('data-tooltip-keepvisible')) { + $this.tooltip('show') + return + } + } + }, + zoom: true, + highlight: { + selected: function (col, bg) { + return col + }, + unselected: function (col, bg) { + return jQuery.Color(col).transition(bg, 0.9) + } + }, + ready: null + } + + GraphvizSvg.prototype.init = function (type, element, options) { + this.enabled = true + this.type = type + this.$element = $(element) + this.options = this.getOptions(options) + + if (options.url) { + var that = this + $.get(options.url, null, function(data) { + var svg = $("svg", data) + 
that.$element.html(document.adoptNode(svg[0])) + that.setup() + }, "xml") + } else { + if (options.svg) { + this.$element.html(options.svg) + } + this.setup() + } + } + + GraphvizSvg.prototype.getDefaults = function () { + return GraphvizSvg.DEFAULTS + } + + GraphvizSvg.prototype.getOptions = function (options) { + options = $.extend({}, this.getDefaults(), this.$element.data(), options) + + if (options.shrink) { + if (typeof options.shrink != 'object') { + options.shrink = { + x: options.shrink, + y: options.shrink + } + } + options.shrink.x = this.convertToPx(options.shrink.x) + options.shrink.y = this.convertToPx(options.shrink.y) + } + return options + } + + GraphvizSvg.prototype.setup = function () { + var options = this.options + + // save key elements in the graph for easy access + var $svg = $(this.$element.children('svg')) + var $graph = $svg.children('g:first') + this.$svg = $svg + this.$graph = $graph + this.$background = $graph.children('polygon:first') // might not exist + this.$nodes = $graph.children('.node') + this.$edges = $graph.children('.edge') + this._nodesByName = {} + this._edgesByName = {} + + // add top level class and copy background color to element + this.$element.addClass('graphviz-svg') + if (this.$background.length) { + this.$element.css('background', this.$background.attr('fill')) + } + + // setup all the nodes and edges + var that = this + this.$nodes.each(function () { that.setupNodesEdges($(this), true) }) + this.$edges.each(function () { that.setupNodesEdges($(this), false) }) + + // remove the graph title element + var $title = this.$graph.children('title') + this.$graph.attr('data-name', $title.text()) + $title.remove() + + if (options.zoom) { + this.setupZoom() + } + + // tell people we're done + if (options.ready) { + options.ready.call(this) + } + } + + GraphvizSvg.prototype.setupNodesEdges = function ($el, isNode) { + var that = this + var options = this.options + + // save the colors of the paths, ellipses and polygons + 
$el.find('polygon, ellipse, path').each(function () { + var $this = $(this) + // save original colors + $this.data('graphviz.svg.color', { + fill: $this.attr('fill'), + stroke: $this.attr('stroke') + + }) + + // shrink it if it's a node + if (isNode && options.shrink) { + that.scaleNode($this) + } + }) + + // save the node name and check if theres a comment above; save it + var $title = $el.children('title') + if ($title[0]) { + // remove any compass points: + var title = $title.text().replace(/:[snew][ew]?/g,'') + $el.attr('data-name', title) + $title.remove() + if (isNode) { + this._nodesByName[title] = $el[0] + } else { + this._edgesByName[title] = $el[0] + } + // without a title we can't tell if its a user comment or not + var previousSibling = $el[0].previousSibling + while (previousSibling && previousSibling.nodeType != 8) { + previousSibling = previousSibling.previousSibling + } + if (previousSibling != null && previousSibling.nodeType == 8) { + var htmlDecode = function (input) { + var e = document.createElement('div') + e.innerHTML = input + return e.childNodes[0].nodeValue + } + var value = htmlDecode(previousSibling.nodeValue.trim()) + if (value != title) { + // user added comment + $el.attr('data-comment', value) + } + } + } + + // remove namespace from a[xlink:title] + $el.find('a').filter(function () { + return $(this).attr('xlink:title') }).each(function () { + var $a = $(this) + $a.attr('title', $a.attr('xlink:title')) + $a.removeAttr('xlink:title') + if (options.tooltips) { + options.tooltips.init.call(this, that.$element) + } + }) + } + + GraphvizSvg.prototype.setupZoom = function() { + var that = this + var $element = this.$element + var $svg = this.$svg + this.zoom = {width: $svg.attr('width'), height: $svg.attr('height'), percentage: null } + this.scaleView(100.0) + $element.mousewheel(function (evt) { + if (evt.shiftKey) { + var percentage = that.zoom.percentage + percentage -= evt.deltaY * evt.deltaFactor + if (percentage < 100.0) { + 
percentage = 100.0 + } + // get pointer offset in view + // ratio offset within svg + var dx = evt.pageX - $svg.offset().left + var dy = evt.pageY - $svg.offset().top + var rx = dx / $svg.width() + var ry = dy / $svg.height() + + // offset within frame ($element) + var px = evt.pageX - $element.offset().left + var py = evt.pageY - $element.offset().top + + that.scaleView(percentage) + // scroll so pointer is still in same place + $element.scrollLeft((rx * $svg.width()) + 0.5 - px) + $element.scrollTop((ry * $svg.height()) + 0.5 - py) + return false // stop propagation + } + }) + } + + GraphvizSvg.prototype.scaleView = function(percentage) { + var that = this + var $svg = this.$svg + $svg.attr('width', percentage + '%') + $svg.attr('height', percentage + '%') + this.zoom.percentage = percentage + // now callback to update tooltip position + var $everything = this.$nodes.add(this.$edges) + $everything.children('a[title]').each(function () { + that.options.tooltips.update.call(this) + }) + } + + GraphvizSvg.prototype.scaleNode = function($node) { + var dx = this.options.shrink.x + var dy = this.options.shrink.y + var tagName = $node.prop('tagName') + if (tagName == 'ellipse') { + $node.attr('rx', parseFloat($node.attr('rx')) - dx) + $node.attr('ry', parseFloat($node.attr('ry')) - dy) + } else if (tagName == 'polygon') { + // this is more complex - we need to scale it manually + var bbox = $node[0].getBBox() + var cx = bbox.x + (bbox.width / 2) + var cy = bbox.y + (bbox.height / 2) + var pts = $node.attr('points').split(' ') + var points = '' // new value + for (var i in pts) { + var xy = pts[i].split(',') + var ox = parseFloat(xy[0]) + var oy = parseFloat(xy[1]) + points += (((cx - ox) / (bbox.width / 2) * dx) + ox) + + ',' + + (((cy - oy) / (bbox.height / 2) * dy) + oy) + + ' ' + } + $node.attr('points', points) + } + } + + GraphvizSvg.prototype.convertToPx = function (val) { + var retval = val + if (typeof val == 'string') { + var end = val.length + var factor = 1.0 
+ if (val.endsWith('px')) { + end -= 2 + } else if (val.endsWith('pt')) { + end -= 2 + factor = GraphvizSvg.GVPT_2_PX + } + retval = parseFloat(val.substring(0, end)) * factor + } + return retval + } + + GraphvizSvg.prototype.findEdge = function (nodeName, testEdge, $retval) { + var retval = [] + for (var name in this._edgesByName) { + var match = testEdge(nodeName, name) + if (match) { + if ($retval) { + $retval.push(this._edgesByName[name]) + } + retval.push(match) + } + } + return retval + } + + GraphvizSvg.prototype.findLinked = function (node, includeEdges, testEdge, $retval) { + var that = this + var $node = $(node) + var $edges = null + if (includeEdges) { + $edges = $retval + } + var names = this.findEdge($node.attr('data-name'), testEdge, $edges) + for (var i in names) { + var n = this._nodesByName[names[i]] + if (!$retval.is(n)) { + $retval.push(n) + that.findLinked(n, includeEdges, testEdge, $retval) + } + } + } + + GraphvizSvg.prototype.colorElement = function ($el, getColor) { + var bg = this.$element.css('background') + $el.find('polygon, ellipse, path').each(function() { + var $this = $(this) + var color = $this.data('graphviz.svg.color') + if (color.fill && $this.prop('tagName') != 'path') { + $this.attr('fill', getColor(color.fill, bg)) // don't set fill if it's a path + } + if (color.stroke) { + $this.attr('stroke', getColor(color.stroke, bg)) + } + }) + } + + GraphvizSvg.prototype.restoreElement = function ($el) { + $el.find('polygon, ellipse, path').each(function() { + var $this = $(this) + var color = $this.data('graphviz.svg.color') + if (color.fill) { + $this.attr('fill', color.fill) // don't set fill if it's a path + } + if (color.stroke) { + $this.attr('stroke', color.stroke) + } + }) + } + + + // methods users can actually call + GraphvizSvg.prototype.nodes = function () { + return this.$nodes + } + + GraphvizSvg.prototype.edges = function () { + return this.$edges + } + + GraphvizSvg.prototype.nodesByName = function () { + return 
this._nodesByName + } + + GraphvizSvg.prototype.edgesByName = function () { + return this._edgesByName + } + + GraphvizSvg.prototype.linkedTo = function (node, includeEdges) { + var $retval = $() + this.findLinked(node, includeEdges, function (nodeName, edgeName) { + var other = null; + var match = '->' + nodeName + if (edgeName.endsWith(match)) { + other = edgeName.substring(0, edgeName.length - match.length); + } + return other; + }, $retval) + return $retval + } + + GraphvizSvg.prototype.linkedFrom = function (node, includeEdges) { + var $retval = $() + this.findLinked(node, includeEdges, function (nodeName, edgeName) { + var other = null; + var match = nodeName + '->' + if (edgeName.startsWith(match)) { + other = edgeName.substring(match.length); + } + return other; + }, $retval) + return $retval + } + + GraphvizSvg.prototype.linked = function (node, includeEdges) { + var $retval = $() + this.findLinked(node, includeEdges, function (nodeName, edgeName) { + return '^' + name + '--(.*)$' + }, $retval) + this.findLinked(node, includeEdges, function (nodeName, edgeName) { + return '^(.*)--' + name + '$' + }, $retval) + return $retval + } + + GraphvizSvg.prototype.tooltip = function ($elements, show) { + var that = this + var options = this.options + $elements.each(function () { + $(this).find('a[title]').each(function () { + if (show) { + options.tooltips.show.call(this) + } else { + options.tooltips.hide.call(this) + } + }) + }) + } + + GraphvizSvg.prototype.bringToFront = function ($elements) { + $elements.detach().appendTo(this.$graph) + } + + GraphvizSvg.prototype.sendToBack = function ($elements) { + if (this.$background.length) { + $element.insertAfter(this.$background) + } else { + $elements.detach().prependTo(this.$graph) + } + } + + GraphvizSvg.prototype.highlight = function ($nodesEdges, tooltips) { + var that = this + var options = this.options + var $everything = this.$nodes.add(this.$edges) + if ($nodesEdges && $nodesEdges.length > 0) { + // create set 
of all other elements and dim them + $everything.not($nodesEdges).each(function () { + that.colorElement($(this), options.highlight.unselected) + $(this).css('font-weight', 'normal') + that.tooltip($(this)) + }) + $nodesEdges.each(function () { + that.colorElement($(this), options.highlight.selected) + $(this).css('font-weight', 'normal') + }) + + this.tooltip($nodesEdges, tooltips) + } else { + $everything.each(function () { + that.restoreElement($(this)) + $(this).css('font-weight', 'normal') + }) + this.tooltip($everything) + } + } + + GraphvizSvg.prototype.destroy = function () { + var that = this + this.hide(function () { + that.$element.off('.' + that.type).removeData(that.type) + }) + } + + + // GRAPHVIZSVG PLUGIN DEFINITION + // ============================= + + function Plugin(option) { + return this.each(function () { + var $this = $(this) + var data = $this.data('graphviz.svg') + var options = typeof option == 'object' && option + + if (!data && /destroy/.test(option)) return + if (!data) $this.data('graphviz.svg', (data = new GraphvizSvg(this, options))) + if (typeof option == 'string') data[option]() + }) + } + + var old = $.fn.graphviz + + $.fn.graphviz = Plugin + $.fn.graphviz.Constructor = GraphvizSvg + + + // GRAPHVIZ NO CONFLICT + // ==================== + + $.fn.graphviz.noConflict = function () { + $.fn.graphviz = old + return this + } + +}(jQuery) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/render.py b/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/render.py new file mode 100755 index 0000000000000000000000000000000000000000..28791e0b7580c9cb88892298a01e69237afc0a28 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/render.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python + +import os.path +import json +import collections +import yaml +import graphviz +from jinja2 import Environment, FileSystemLoader + + +Dagmap = collections.namedtuple('Dagmap', + ['version', 'meta', 'style', 'tasks']) + + +def 
parse_yaml(filename): + with open(filename, 'r') as f: + contents = yaml.safe_load(f) + + meta = contents['meta'] + version = meta['version'] + if version > 1: + raise Exception('Unsupported version %d' % version) + del meta['version'] + + style = contents['style'] + tasks = contents['tasks'] + if not isinstance(tasks, list): + raise Exception('"tasks" must be a list') + + return Dagmap(version=version, meta=meta, style=style, tasks=tasks) + + +def to_graphviz(dagmap): + G = graphviz.Digraph(format='svg', engine='neato', + graph_attr=dict(bgcolor="#f4f4f4", pad="0.5", overlap="false"), + node_attr=dict(width="0.6", style="filled", + fillcolor="#83c6de", color="#83c6de", penwidth="3", label="", + fontname="helvetica Neue Ultra Light", fontsize="28"), + edge_attr=dict(color="#616a72", arrowsize="2.0", penwidth="4", fontname="helvetica Neue Ultra Light")) + + G.node(name='_nothing', label='', style='invis') + + for task in dagmap.tasks: + G.node(name=task['id'], label=task['label'], + tooltip=task['description'].strip()) + depends_on = task.get('depends_on', ['_nothing']) + for dep in depends_on: + if dep == '_nothing': + attrs = { + 'style': 'invis', + } + else: + attrs = {} + G.edge(dep, task['id'], **attrs) + + return G + + +def main(argv): + import argparse + parser = argparse.ArgumentParser(description='Render Dagmap to Graphviz') + parser.add_argument('-o', '--output', required=True, help='output svg filename') + parser.add_argument('-t', '--template', default='template.html', help='HTML rendering template') + parser.add_argument('input', metavar='INPUT', type=str, + help='YAML input filename') + + args = parser.parse_args(argv[1:]) + + dagmap = parse_yaml(args.input) + graph = to_graphviz(dagmap) + svg = graph.pipe().decode('utf-8') + + template_env = Environment(loader=FileSystemLoader(os.path.dirname(__file__))) + template = template_env.get_template(args.template) + html = template.render(svg=json.dumps(svg)) + + with open(args.output, 'w') as f: + 
f.write(html) + + return 0 + + +if __name__ == '__main__': + import sys + sys.exit(main(sys.argv)) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/template.html b/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/template.html new file mode 100644 index 0000000000000000000000000000000000000000..0a634757c51b42763c630756fd77230276e5e593 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/dagmap/template.html @@ -0,0 +1,110 @@ + + + + + + + + + + +

Click node to highlight; Shift-scroll to zoom; Esc to unhighlight

+ +
+
Details
+

(Click on a node for details)

+
+ +
+ + + + + + + + + + + + + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/environment.yml b/cv/3d_detection/centerpoint/pytorch/numba/docs/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..8028a33660cb684e68132e3835ba2ce9779e4dee --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/environment.yml @@ -0,0 +1,16 @@ +# This environment is used by the RTD config for PR builds. RTD uses this as the +# base environment and then adds in the sphinx etc tools on top. +# See: https://docs.readthedocs.io/en/stable/guides/conda.html +name: rtd +channels: + - numba/label/dev +dependencies: + - python=3.7 + - llvmlite=0.39 + - numpy + - numpydoc + - setuptools + # https://stackoverflow.com/questions/67542699/readthedocs-sphinx-not-rendering-bullet-list-from-rst-fileA + - docutils==0.16 + # The following is needed to fix RTD. + - conda diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/gh-pages.py b/cv/3d_detection/centerpoint/pytorch/numba/docs/gh-pages.py new file mode 100644 index 0000000000000000000000000000000000000000..3c8093d06619538a00bb390dbc178e0ef8907d3b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/gh-pages.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Script to commit the doc build outputs into the github-pages repo. + +Use: + + gh-pages.py [tag] + +If no tag is given, the current output of 'git describe' is used. If given, +that is how the resulting directory will be named. 
+ +In practice, you should use either actual clean tags from a current build or +something like 'current' as a stable URL for the most current version of the """ +from __future__ import print_function, division, absolute_import + +#----------------------------------------------------------------------------- +# Imports +#----------------------------------------------------------------------------- +import os +import re +import shutil +import sys +from os import chdir as cd +from os.path import join as pjoin + +from subprocess import Popen, PIPE, CalledProcessError, check_call + +#----------------------------------------------------------------------------- +# Globals +#----------------------------------------------------------------------------- + +pages_dir = 'gh-pages' +html_dir = '_build/html' +pdf_dir = '_build/latex' +pages_repo = 'git@github.com:numba/numba-doc.git' + +#----------------------------------------------------------------------------- +# Functions +#----------------------------------------------------------------------------- +def sub_environment(): + """Return an environment dict for executing subcommands in.""" + env = os.environ.copy() + # Force untranslated messages for regex matching + env['LANG'] = 'C' + return env + + +def sh(cmd): + """Execute command in a subshell, return status code.""" + return check_call(cmd, shell=True, env=sub_environment()) + + +def sh2(cmd): + """Execute command in a subshell, return stdout. 
+ + Stderr is unbuffered from the subshell.x""" + p = Popen(cmd, stdout=PIPE, shell=True, env=sub_environment()) + out = p.communicate()[0] + retcode = p.returncode + if retcode: + raise CalledProcessError(retcode, cmd) + else: + return out.rstrip() + + +def sh3(cmd): + """Execute command in a subshell, return stdout, stderr + + If anything appears in stderr, print it out to sys.stderr""" + p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, + env=sub_environment()) + out, err = p.communicate() + retcode = p.returncode + if retcode: + raise CalledProcessError(retcode, cmd) + else: + return out.rstrip(), err.rstrip() + + +def init_repo(path): + """clone the gh-pages repo if we haven't already.""" + sh("git clone %s %s"%(pages_repo, path)) + here = os.getcwd() + cd(path) + sh('git checkout gh-pages') + cd(here) + +#----------------------------------------------------------------------------- +# Script starts +#----------------------------------------------------------------------------- +if __name__ == '__main__': + # The tag can be given as a positional argument + try: + tag = sys.argv[1] + except IndexError: + try: + tag = sh2('git describe --exact-match').decode() + except CalledProcessError: + tag = "dev" # Fallback + print("Using dev") + + startdir = os.getcwd() + if not os.path.exists(pages_dir): + # init the repo + init_repo(pages_dir) + else: + # ensure up-to-date before operating + cd(pages_dir) + sh('git checkout gh-pages') + sh('git pull') + cd(startdir) + + dest = pjoin(pages_dir, tag) + + # don't `make html` here, because gh-pages already depends on html in Makefile + # sh('make html') + if tag != 'dev': + # only build pdf for non-dev targets + #sh2('make pdf') + pass + + # This is pretty unforgiving: we unconditionally nuke the destination + # directory, and then copy the html tree in there + shutil.rmtree(dest, ignore_errors=True) + shutil.copytree(html_dir, dest) + if tag != 'dev': + #shutil.copy(pjoin(pdf_dir, 'ipython.pdf'), pjoin(dest, 
'ipython.pdf')) + pass + + try: + cd(pages_dir) + status = sh2('git status | head -1').decode() + branch = re.match('\#?\s*On branch (.*)$', status).group(1) + if branch != 'gh-pages': + e = 'On %r, git branch is %r, MUST be "gh-pages"' % (pages_dir, + branch) + raise RuntimeError(e) + + sh('git add -A %s' % tag) + sh('git commit -m"Updated doc release: %s"' % tag) + print() + print('Most recent 3 commits:') + sys.stdout.flush() + sh('git --no-pager log --oneline HEAD~3..') + finally: + cd(startdir) + + print() + print('Now verify the build in: %r' % dest) + print("If everything looks good, 'git push'") diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/make.bat b/cv/3d_detection/centerpoint/pytorch/numba/docs/make.bat new file mode 100644 index 0000000000000000000000000000000000000000..29b481d888f20514484bb183d66170b71499a6a5 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/make.bat @@ -0,0 +1,242 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source +set I18NSPHINXOPTS=%SPHINXOPTS% source +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. 
man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. 
+ goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Numba.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Numba.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. 
+ echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
+ goto end +) + +:end diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/requirements.txt b/cv/3d_detection/centerpoint/pytorch/numba/docs/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..66f88aae4182fa0c779a9b888e45d1a29f7c6c01 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/requirements.txt @@ -0,0 +1 @@ +numpydoc \ No newline at end of file diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/_ext/ghfiles.py b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/_ext/ghfiles.py new file mode 100644 index 0000000000000000000000000000000000000000..d0320cb3abac5f6781bab704a855041bc90089e2 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/_ext/ghfiles.py @@ -0,0 +1,75 @@ +import os.path as path +import subprocess +import shlex +from sphinx.util import logging +from docutils import nodes +logger = logging.getLogger(__name__) + + +# use an old git trick, to get the top-level, could have used ../ etc.. but +# this will be fine.. +top = subprocess.check_output(shlex.split( + "git rev-parse --show-toplevel")).strip().decode("utf-8") + + +def make_ref(text): + """ Make hyperlink to Github """ + full_path = path.join(top, text) + if path.isfile(full_path): + ref = "https://www.github.com/numba/numba/blob/main/" + text + elif path.isdir(full_path): + ref = "https://www.github.com/numba/numba/tree/main/" + text + else: + logger.warn("Failed to find file in repomap: " + text) + ref = "https://www.github.com/numba/numba" + return ref + + +def intersperse(lst, item): + """ Insert item between each item in lst. 
+ + Copied under CC-BY-SA from stackoverflow at: + + https://stackoverflow.com/questions/5920643/ + add-an-item-between-each-item-already-in-the-list + + """ + result = [item] * (len(lst) * 2 - 1) + result[0::2] = lst + return result + + +def ghfile_role(name, rawtext, text, lineno, inliner, options={}, content=[]): + """ Emit hyperlink nodes for a given file in repomap. """ + my_nodes = [] + if "{" in text: # myfile.{c,h} - make two nodes + # could have used regexes, but this will be fine.. + base = text[:text.find(".") + 1] + exts = text[text.find("{") + 1:text.find("}")].split(",") + for e in exts: + node = nodes.reference(rawtext, + base + e, + refuri=make_ref(base + e), + **options) + my_nodes.append(node) + elif "*" in text: # path/*_files.py - link to directory + # Could have used something from os.path, but this will be fine.. + ref = path.dirname(text) + path.sep + node = nodes.reference(rawtext, text, refuri=make_ref(ref), **options) + my_nodes.append(node) + else: # everything else is taken verbatim + node = nodes.reference(rawtext, text, refuri=make_ref(text), **options) + my_nodes.append(node) + + # insert separators if needed + if len(my_nodes) > 1: + my_nodes = intersperse(my_nodes, nodes.Text(" | ")) + return my_nodes, [] + + +def setup(app): + logger.info('Initializing ghfiles plugin') + app.add_role('ghfile', ghfile_role) + + metadata = {'parallel_read_safe': True, 'parallel_write_safe': True} + return metadata diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/conf.py b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..f81d55abd672fc7c5ee335fbca45b4cb30e98bcf --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/conf.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Numba documentation build configuration file, created by +# sphinx-quickstart on Tue Dec 30 11:55:40 2014. 
+# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +try: + # Numba is installed + import numba +except ImportError: + # Numba is run from its source checkout + sys.path.insert(0, os.path.abspath('../..')) + import numba + + +on_rtd = os.environ.get('READTHEDOCS') == 'True' + +if on_rtd: + # The following is needed to fix RTD issue with numpydoc + # https://github.com/readthedocs/sphinx_rtd_theme/issues/766 + from conda.cli.python_api import run_command as conda_cmd + + conda_cmd("install", "-c", "conda-forge", "sphinx_rtd_theme>=0.5.1", "-y") + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + #'sphinx.ext.mathjax', + 'sphinx.ext.autodoc', + #'sphinx.ext.graphviz', + 'numpydoc', +] + +# Adding the github files extension +sys.path.append(os.path.abspath(os.path.join(".", "_ext"))) +extensions.append('ghfiles') + +todo_include_todos = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['../_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. 
+master_doc = 'index' + +# General information about the project. +project = u'Numba' +copyright = u'2012-2020, Anaconda, Inc. and others' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +version = '.'.join(numba.__version__.split('.')[:2]) +# The full version, including alpha/beta/rc tags. +release = numba.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. 
+html_theme = 'sphinx_rtd_theme' + +# All sphinx_rtd_theme options. Default values commented out; uncomment to +# change. +html_theme_options = { + 'canonical_url': 'https://numba.readthedocs.io/en/stable/', + # 'logo_only': False, + # 'display_version': True, + # 'prev_next_buttons_location': 'bottom', + 'style_external_links': True, + # 'vcs_pageview_mode': '', + 'style_nav_header_background': '#00A3E0', + # Toc options + 'collapse_navigation': False, + # 'sticky_navigation': True, + # 'navigation_depth': 4, + # 'includehidden': True, + # 'titles_only': False +} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = None + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +html_logo = "../_static/numba-white-icon-rgb.svg" + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +html_favicon = '../_static/numba-blue-icon-rgb.svg' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['../_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. 
+#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Numbadoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + #'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). 
+latex_documents = [ + ('index', 'numba.tex', u'Numba Documentation', + u'Anaconda', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'numba', 'Numba Documentation', + ['Anaconda'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'Numba', 'Numba Documentation', + 'Anaconda', 'Numba', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False + + +# Configuration for intersphinx: refer to the Python standard library +# and the Numpy documentation. 
+intersphinx_mapping = { + 'python': ('https://docs.python.org/3', None), + 'numpy': ('https://numpy.org/doc/stable/', None), + 'llvmlite': ('https://llvmlite.readthedocs.io/en/latest/', None), +} + + +# numpydoc options + +# To silence "WARNING: toctree contains reference to nonexisting document" +numpydoc_show_class_members = False + +# -- Custom autogeneration ------------------------------------------------ + + +def _autogenerate(): + from numba.scripts.generate_lower_listing import gen_lower_listing + from numba.misc.help.inspector import write_listings + + basedir = os.path.dirname(__file__) + gen_lower_listing(os.path.join(basedir, + 'developer/autogen_lower_listing.rst')) + + # Run inspector on supported packages + for package in ['builtins', 'math', 'cmath', 'numpy']: + write_listings( + package_name=package, + filename=os.path.join( + basedir, 'developer', 'autogen_{}_listing'.format(package), + ), + output_format='rst', + ) + + +_autogenerate() + + +def setup(app): + app.add_css_file('rtd-overrides.css') diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/host.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/host.rst new file mode 100644 index 0000000000000000000000000000000000000000..4c2dd0cfb85d5d06bdae821cad790ec2c730f17d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/host.rst @@ -0,0 +1,232 @@ +CUDA Host API +============= + +Device Management +----------------- + +Device detection and enquiry +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following functions are available for querying the available hardware: + +.. autofunction:: numba.cuda.is_available + +.. autofunction:: numba.cuda.detect + +Context management +~~~~~~~~~~~~~~~~~~ + +CUDA Python functions execute within a CUDA context. Each CUDA device in a +system has an associated CUDA context, and Numba presently allows only one context +per thread. 
For further details on CUDA Contexts, refer to the `CUDA Driver API +Documentation on Context Management +<https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html>`_ and the +`CUDA C Programming Guide Context Documentation +<https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#context>`_. CUDA Contexts +are instances of the :class:`~numba.cuda.cudadrv.driver.Context` class: + +.. autoclass:: numba.cuda.cudadrv.driver.Context + :members: reset, get_memory_info, push, pop + +The following functions can be used to get or select the context: + +.. autofunction:: numba.cuda.current_context +.. autofunction:: numba.cuda.require_context + +The following functions affect the current context: + +.. autofunction:: numba.cuda.synchronize +.. autofunction:: numba.cuda.close + +Device management +~~~~~~~~~~~~~~~~~ + +Numba maintains a list of supported CUDA-capable devices: + +.. attribute:: numba.cuda.gpus + + An indexable list of supported CUDA devices. This list is indexed by integer + device ID. + +Alternatively, the current device can be obtained: + +.. function:: numba.cuda.gpus.current + + Return the currently-selected device. + +Getting a device through :attr:`numba.cuda.gpus` always provides an instance of +:class:`numba.cuda.cudadrv.devices._DeviceContextManager`, which acts as a +context manager for the selected device: + +.. autoclass:: numba.cuda.cudadrv.devices._DeviceContextManager + +One may also select a context and device or get the current device using the +following three functions: + +.. autofunction:: numba.cuda.select_device +.. autofunction:: numba.cuda.get_current_device +.. autofunction:: numba.cuda.list_devices + +The :class:`numba.cuda.cudadrv.driver.Device` class can be used to enquire about +the functionality of the selected device: + +.. class:: numba.cuda.cudadrv.driver.Device + + The device associated with a particular context. + + .. attribute:: compute_capability + + A tuple, *(major, minor)* indicating the supported compute capability. + + .. attribute:: id + + The integer ID of the device. + + .. attribute:: name + + The name of the device (e.g. 
"GeForce GTX 970"). + + .. attribute:: uuid + + The UUID of the device (e.g. "GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643"). + + .. method:: reset + + Delete the context for the device. This will destroy all memory + allocations, events, and streams created within the context. + + +Compilation +----------- + +Numba provides an entry point for compiling a Python function to PTX without +invoking any of the driver API. This can be useful for: + +- Generating PTX that is to be inlined into other PTX code (e.g. from outside + the Numba / Python ecosystem). +- Generating code when there is no device present. +- Generating code prior to a fork without initializing CUDA. + +.. note:: It is the user's responsibility to manage any ABI issues arising from + the use of compilation to PTX. + +.. autofunction:: numba.cuda.compile_ptx + + +The environment variable ``NUMBA_CUDA_DEFAULT_PTX_CC`` can be set to control +the default compute capability targeted by ``compile_ptx`` - see +:ref:`numba-envvars-gpu-support`. If PTX for the compute capability of the +current device is required, the ``compile_ptx_for_current_device`` function can +be used: + +.. autofunction:: numba.cuda.compile_ptx_for_current_device + + + +Measurement +----------- + +.. _cuda-profiling: + +Profiling +~~~~~~~~~ + +The NVidia Visual Profiler can be used directly on executing CUDA Python code - +it is not a requirement to insert calls to these functions into user code. +However, these functions can be used to allow profiling to be performed +selectively on specific portions of the code. For further information on +profiling, see the `NVidia Profiler User's Guide +<https://docs.nvidia.com/cuda/profiler-users-guide/index.html>`_. + +.. autofunction:: numba.cuda.profile_start +.. autofunction:: numba.cuda.profile_stop +.. autofunction:: numba.cuda.profiling + + +.. _events: + +Events +~~~~~~ + +Events can be used to monitor the progress of execution and to record the +timestamps of specific points being reached. 
Event creation returns immediately, +and the created event can be queried to determine if it has been reached. For +further information, see the `CUDA C Programming Guide Events section +<https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events>`_. + +The following functions are used for creating and measuring the time between +events: + +.. autofunction:: numba.cuda.event +.. autofunction:: numba.cuda.event_elapsed_time + +Events are instances of the :class:`numba.cuda.cudadrv.driver.Event` class: + +.. autoclass:: numba.cuda.cudadrv.driver.Event + :members: query, record, synchronize, wait + + +.. _streams: + +Stream Management +----------------- + +Streams allow concurrency of execution on a single device within a given +context. Queued work items in the same stream execute sequentially, but work +items in different streams may execute concurrently. Most operations involving a +CUDA device can be performed asynchronously using streams, including data +transfers and kernel execution. For further details on streams, see the `CUDA C +Programming Guide Streams section +<https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams>`_. + +Numba defaults to using the legacy default stream as the default stream. The +per-thread default stream can be made the default stream by setting the +environment variable ``NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`` to ``1`` (see the +:ref:`CUDA Environment Variables section <numba-envvars-gpu-support>`). +Regardless of this setting, the objects representing the legacy and per-thread +default streams can be constructed using the functions below. + +Streams are instances of :class:`numba.cuda.cudadrv.driver.Stream`: + +.. autoclass:: numba.cuda.cudadrv.driver.Stream + :members: synchronize, auto_synchronize, add_callback, async_done + +To create a new stream: + +.. autofunction:: numba.cuda.stream + +To get the default stream: + +.. autofunction:: numba.cuda.default_stream + +To get the default stream with an explicit choice of whether it is the legacy +or per-thread default stream: + +.. autofunction:: numba.cuda.legacy_default_stream + +.. 
autofunction:: numba.cuda.per_thread_default_stream + +To construct a Numba ``Stream`` object using a stream allocated elsewhere, the +``external_stream`` function is provided. Note that the lifetime of external +streams must be managed by the user - Numba will not deallocate an external +stream, and the stream must remain valid whilst the Numba ``Stream`` object is +in use. + +.. autofunction:: numba.cuda.external_stream + + +Runtime +------- + +Numba generally uses the Driver API, but it provides a simple wrapper to the +Runtime API so that the version of the runtime in use can be queried. This is +accessed through ``cuda.runtime``, which is an instance of the +:class:`numba.cuda.cudadrv.runtime.Runtime` class: + +.. autoclass:: numba.cuda.cudadrv.runtime.Runtime + :members: get_version, is_supported_version, supported_versions + +Whether the current runtime is officially supported and tested with the current +version of Numba can also be queried: + +.. autofunction:: numba.cuda.is_supported_version diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/index.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..771afea194a5554b1339bd679e7b22bd4f5a3438 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/index.rst @@ -0,0 +1,10 @@ +CUDA Python Reference +===================== + +.. 
toctree:: + + host.rst + kernel.rst + types.rst + memory.rst + libdevice.rst diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/kernel.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/kernel.rst new file mode 100644 index 0000000000000000000000000000000000000000..d23b2eed4c1d689cb195e0c81cab21f1fe3216a3 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/kernel.rst @@ -0,0 +1,586 @@ +CUDA Kernel API +=============== + +Kernel declaration +------------------ + +The ``@cuda.jit`` decorator is used to create a CUDA dispatcher object that can +be configured and launched: + +.. autofunction:: numba.cuda.jit + + +Dispatcher objects +------------------ + +The usual syntax for configuring a Dispatcher with a launch configuration uses +subscripting, with the arguments being as in the following: + +.. code-block:: python + + # func is some function decorated with @cuda.jit + func[griddim, blockdim, stream, sharedmem] + + +The ``griddim`` and ``blockdim`` arguments specify the size of the grid and +thread blocks, and may be either integers or tuples of length up to 3. The +``stream`` parameter is an optional stream on which the kernel will be launched, +and the ``sharedmem`` parameter specifies the size of dynamic shared memory in +bytes. + +Subscripting the Dispatcher returns a configuration object that can be called +with the kernel arguments: + +.. code-block:: python + + configured = func[griddim, blockdim, stream, sharedmem] + configured(x, y, z) + + +However, it is more idiomatic to configure and call the kernel within a single +statement: + +.. code-block:: python + + func[griddim, blockdim, stream, sharedmem](x, y, z) + +This is similar to launch configuration in CUDA C/C++: + +.. code-block:: cuda + + func<<<griddim, blockdim, sharedmem, stream>>>(x, y, z) + +.. note:: The order of ``stream`` and ``sharedmem`` is reversed in Numba + compared to CUDA C/C++. 
+ +Dispatcher objects also provide several utility methods for inspection and +creating a specialized instance: + +.. autoclass:: numba.cuda.dispatcher.CUDADispatcher + :members: inspect_asm, inspect_llvm, inspect_sass, inspect_types, + get_regs_per_thread, specialize, specialized, extensions, forall + + +Intrinsic Attributes and Functions +---------------------------------- + +The remainder of the attributes and functions in this section may only be called +from within a CUDA Kernel. + +Thread Indexing +~~~~~~~~~~~~~~~ + +.. attribute:: numba.cuda.threadIdx + + The thread indices in the current thread block, accessed through the + attributes ``x``, ``y``, and ``z``. Each index is an integer spanning the + range from 0 inclusive to the corresponding value of the attribute in + :attr:`numba.cuda.blockDim` exclusive. + +.. attribute:: numba.cuda.blockIdx + + The block indices in the grid of thread blocks, accessed through the + attributes ``x``, ``y``, and ``z``. Each index is an integer spanning the + range from 0 inclusive to the corresponding value of the attribute in + :attr:`numba.cuda.gridDim` exclusive. + +.. attribute:: numba.cuda.blockDim + + The shape of a block of threads, as declared when instantiating the + kernel. This value is the same for all threads in a given kernel, even + if they belong to different blocks (i.e. each block is "full"). + +.. attribute:: numba.cuda.gridDim + + The shape of the grid of blocks, accessed through the attributes ``x``, + ``y``, and ``z``. + +.. attribute:: numba.cuda.laneid + + The thread index in the current warp, as an integer spanning the range + from 0 inclusive to the :attr:`numba.cuda.warpsize` exclusive. + +.. attribute:: numba.cuda.warpsize + + The size in threads of a warp on the GPU. Currently this is always 32. + +.. function:: numba.cuda.grid(ndim) + + Return the absolute position of the current thread in the entire + grid of blocks. 
*ndim* should correspond to the number of dimensions + declared when instantiating the kernel. If *ndim* is 1, a single integer + is returned. If *ndim* is 2 or 3, a tuple of the given number of + integers is returned. + + Computation of the first integer is as follows:: + + cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x + + and is similar for the other two indices, but using the ``y`` and ``z`` + attributes. + +.. function:: numba.cuda.gridsize(ndim) + + Return the absolute size (or shape) in threads of the entire grid of + blocks. *ndim* should correspond to the number of dimensions declared when + instantiating the kernel. + + Computation of the first integer is as follows:: + + cuda.blockDim.x * cuda.gridDim.x + + and is similar for the other two indices, but using the ``y`` and ``z`` + attributes. + +Memory Management +~~~~~~~~~~~~~~~~~ + +.. function:: numba.cuda.shared.array(shape, dtype) + + Creates an array in the shared memory space of the CUDA kernel with + the given ``shape`` and ``dtype``. + + Returns an array with its content uninitialized. + + .. note:: All threads in the same thread block see the same array. + +.. function:: numba.cuda.local.array(shape, dtype) + + Creates an array in the local memory space of the CUDA kernel with the + given ``shape`` and ``dtype``. + + Returns an array with its content uninitialized. + + .. note:: Each thread sees a unique array. + +.. function:: numba.cuda.const.array_like(ary) + + Copies the ``ary`` into constant memory space on the CUDA kernel at compile + time. + + Returns an array like the ``ary`` argument. + + .. note:: All threads and blocks see the same array. + +Synchronization and Atomic Operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. function:: numba.cuda.atomic.add(array, idx, value) + + Perform ``array[idx] += value``. Supports int32, int64, float32 and + float64 only. The ``idx`` argument can be an integer or a tuple of integer + indices for indexing into multi-dimensional arrays. 
The number of elements + in ``idx`` must match the number of dimensions of ``array``. + + Returns the value of ``array[idx]`` before storing the new value. + Behaves like an atomic load. + +.. function:: numba.cuda.atomic.sub(array, idx, value) + + Perform ``array[idx] -= value``. Supports int32, int64, float32 and + float64 only. The ``idx`` argument can be an integer or a tuple of integer + indices for indexing into multi-dimensional arrays. The number of elements + in ``idx`` must match the number of dimensions of ``array``. + + Returns the value of ``array[idx]`` before storing the new value. + Behaves like an atomic load. + +.. function:: numba.cuda.atomic.and_(array, idx, value) + + Perform ``array[idx] &= value``. Supports int32, uint32, int64, + and uint64 only. The ``idx`` argument can be an integer or a tuple of + integer indices for indexing into multi-dimensional arrays. The number + of elements in ``idx`` must match the number of dimensions of ``array``. + + Returns the value of ``array[idx]`` before storing the new value. + Behaves like an atomic load. + +.. function:: numba.cuda.atomic.or_(array, idx, value) + + Perform ``array[idx] |= value``. Supports int32, uint32, int64, + and uint64 only. The ``idx`` argument can be an integer or a tuple of + integer indices for indexing into multi-dimensional arrays. The number + of elements in ``idx`` must match the number of dimensions of ``array``. + + Returns the value of ``array[idx]`` before storing the new value. + Behaves like an atomic load. + +.. function:: numba.cuda.atomic.xor(array, idx, value) + + Perform ``array[idx] ^= value``. Supports int32, uint32, int64, + and uint64 only. The ``idx`` argument can be an integer or a tuple of + integer indices for indexing into multi-dimensional arrays. The number + of elements in ``idx`` must match the number of dimensions of ``array``. + + Returns the value of ``array[idx]`` before storing the new value. + Behaves like an atomic load.
+ +.. function:: numba.cuda.atomic.exch(array, idx, value) + + Perform ``array[idx] = value``. Supports int32, uint32, int64, + and uint64 only. The ``idx`` argument can be an integer or a tuple of + integer indices for indexing into multi-dimensional arrays. The number + of elements in ``idx`` must match the number of dimensions of ``array``. + + Returns the value of ``array[idx]`` before storing the new value. + Behaves like an atomic load. + +.. function:: numba.cuda.atomic.inc(array, idx, value) + + Perform ``array[idx] = (0 if array[idx] >= value else array[idx] + 1)``. + Supports uint32, and uint64 only. The ``idx`` argument can be an integer + or a tuple of integer indices for indexing into multi-dimensional arrays. + The number of elements in ``idx`` must match the number of dimensions of + ``array``. + + Returns the value of ``array[idx]`` before storing the new value. + Behaves like an atomic load. + +.. function:: numba.cuda.atomic.dec(array, idx, value) + + Perform ``array[idx] = + (value if (array[idx] == 0) or (array[idx] > value) else array[idx] - 1)``. + Supports uint32, and uint64 only. The ``idx`` argument can be an integer + or a tuple of integer indices for indexing into multi-dimensional arrays. + The number of elements in ``idx`` must match the number of dimensions of + ``array``. + + Returns the value of ``array[idx]`` before storing the new value. + Behaves like an atomic load. + +.. function:: numba.cuda.atomic.max(array, idx, value) + + Perform ``array[idx] = max(array[idx], value)``. Supports int32, int64, + float32 and float64 only. The ``idx`` argument can be an integer or a + tuple of integer indices for indexing into multi-dimensional arrays. + The number of elements in ``idx`` must match the number of dimensions of + ``array``. + + Returns the value of ``array[idx]`` before storing the new value. + Behaves like an atomic load. + + +..
function:: numba.cuda.syncthreads + + Synchronize all threads in the same thread block. This function implements + the same pattern as barriers in traditional multi-threaded programming: this + function waits until all threads in the block call it, at which point it + returns control to all its callers. + +.. function:: numba.cuda.syncthreads_count(predicate) + + An extension to :attr:`numba.cuda.syncthreads` where the return value is a count + of the threads where ``predicate`` is true. + +.. function:: numba.cuda.syncthreads_and(predicate) + + An extension to :attr:`numba.cuda.syncthreads` where 1 is returned if ``predicate`` is + true for all threads or 0 otherwise. + +.. function:: numba.cuda.syncthreads_or(predicate) + + An extension to :attr:`numba.cuda.syncthreads` where 1 is returned if ``predicate`` is + true for any thread or 0 otherwise. + + .. warning:: All syncthreads functions must be called by every thread in the + thread-block. Failing to do so may result in undefined behavior. + + +Cooperative Groups +~~~~~~~~~~~~~~~~~~ + +.. function:: numba.cuda.cg.this_grid() + + Get the current grid group. + + :return: The current grid group + :rtype: numba.cuda.cg.GridGroup + +.. class:: numba.cuda.cg.GridGroup + + A grid group. Users should not construct a GridGroup directly - instead, get + the current grid group using :func:`cg.this_grid() <numba.cuda.cg.this_grid>`. + + .. method:: sync() + + Synchronize the current grid group. + + +Memory Fences +~~~~~~~~~~~~~ + +The memory fences are used to guarantee that the effects of memory operations +are visible to other threads within the same thread-block, the same GPU device, +and the same system (across GPUs on global memory). Memory loads and stores +are guaranteed to not move across the memory fences by optimization passes. + +.. warning:: The memory fences are considered to be advanced API and most + use cases should use the thread barrier (e.g. ``syncthreads()``). + + + +..
function:: numba.cuda.threadfence + + A memory fence at device level (within the GPU). + +.. function:: numba.cuda.threadfence_block + + A memory fence at thread block level. + +.. function:: numba.cuda.threadfence_system + + + A memory fence at system level (across GPUs). + +Warp Intrinsics +~~~~~~~~~~~~~~~ + +The argument ``membermask`` is a 32 bit integer mask with each bit +corresponding to a thread in the warp, with 1 meaning the thread is in the +subset of threads within the function call. The ``membermask`` must be all 1 if +the GPU compute capability is below 7.x. + +.. function:: numba.cuda.syncwarp(membermask) + + Synchronize a masked subset of the threads in a warp. + +.. function:: numba.cuda.all_sync(membermask, predicate) + + If the ``predicate`` is true for all threads in the masked warp, then + a non-zero value is returned, otherwise 0 is returned. + +.. function:: numba.cuda.any_sync(membermask, predicate) + + If the ``predicate`` is true for any thread in the masked warp, then + a non-zero value is returned, otherwise 0 is returned. + +.. function:: numba.cuda.eq_sync(membermask, predicate) + + If the boolean ``predicate`` is the same for all threads in the masked warp, + then a non-zero value is returned, otherwise 0 is returned. + +.. function:: numba.cuda.ballot_sync(membermask, predicate) + + Returns a mask of all threads in the warp whose ``predicate`` is true, + and are within the given mask. + +.. function:: numba.cuda.shfl_sync(membermask, value, src_lane) + + Shuffles ``value`` across the masked warp and returns the ``value`` + from ``src_lane``. If this is outside the warp, then the + given ``value`` is returned. + +.. function:: numba.cuda.shfl_up_sync(membermask, value, delta) + + Shuffles ``value`` across the masked warp and returns the ``value`` + from ``laneid - delta``. If this is outside the warp, then the + given ``value`` is returned. + +.. 
function:: numba.cuda.shfl_down_sync(membermask, value, delta) + + Shuffles ``value`` across the masked warp and returns the ``value`` + from ``laneid + delta``. If this is outside the warp, then the + given ``value`` is returned. + +.. function:: numba.cuda.shfl_xor_sync(membermask, value, lane_mask) + + Shuffles ``value`` across the masked warp and returns the ``value`` + from ``laneid ^ lane_mask``. + +.. function:: numba.cuda.match_any_sync(membermask, value, lane_mask) + + Returns a mask of threads that have the same ``value`` as the given ``value`` + from within the masked warp. + +.. function:: numba.cuda.match_all_sync(membermask, value, lane_mask) + + Returns a tuple of (mask, pred), where mask is a mask of threads that have + the same ``value`` as the given ``value`` from within the masked warp, if they + all have the same value, otherwise it is 0. And pred is a boolean of whether + or not all threads in the masked warp have the same value. + +.. function:: numba.cuda.activemask() + + Returns a 32-bit integer mask of all currently active threads in the + calling warp. The Nth bit is set if the Nth lane in the warp is active when + activemask() is called. Inactive threads are represented by 0 bits in the + returned mask. Threads which have exited the kernel are always marked as + inactive. + +.. function:: numba.cuda.lanemask_lt() + + Returns a 32-bit integer mask of all lanes (including inactive ones) with + ID less than the current lane. + + +Integer Intrinsics +~~~~~~~~~~~~~~~~~~ + +A subset of the CUDA Math API's integer intrinsics are available. For further +documentation, including semantics, please refer to the `CUDA Toolkit +documentation +`_. + + +.. function:: numba.cuda.popc(x) + + Returns the number of bits set in ``x``. + +.. function:: numba.cuda.brev(x) + + Returns the reverse of the bit pattern of ``x``. For example, ``0b10110110`` + becomes ``0b01101101``. + +.. function:: numba.cuda.clz(x) + + Returns the number of leading zeros in ``x``. + +..
function:: numba.cuda.ffs(x) + + Returns the position of the first (least significant) bit set to 1 in ``x``, + where the least significant bit position is 1. ``ffs(0)`` returns 0. + + +Floating Point Intrinsics +~~~~~~~~~~~~~~~~~~~~~~~~~ + +A subset of the CUDA Math API's floating point intrinsics are available. For further +documentation, including semantics, please refer to the `single +`_ and +`double `_ +precision parts of the CUDA Toolkit documentation. + + +.. function:: numba.cuda.fma + + Perform the fused multiply-add operation. Named after the ``fma`` and ``fmaf`` in + the C api, but maps to the ``fma.rn.f32`` and ``fma.rn.f64`` (round-to-nearest-even) + PTX instructions. + +.. function:: numba.cuda.cbrt (x) + + Perform the cube root operation, x ** (1/3). Named after the functions + ``cbrt`` and ``cbrtf`` in the C api. Supports float32, and float64 arguments + only. + +16-bit Floating Point Intrinsics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following functions are used to operate on 16-bit floating point operands. +These functions return a 16-bit floating point result. + + +.. function:: numba.cuda.fp16.hfma (a, b, c) + + Perform the fused multiply-add operation ``(a * b) + c`` on 16-bit + floating point arguments in round to nearest mode. Maps to the ``fma.rn.f16`` + PTX instruction. + + Returns the 16-bit floating point result of the fused multiply-add. + +.. function:: numba.cuda.fp16.hadd (a, b) + + Perform the add operation ``a + b`` on 16-bit floating point arguments in + round to nearest mode. Maps to the ``add.f16`` PTX instruction. + + Returns the 16-bit floating point result of the addition. + +.. function:: numba.cuda.fp16.hsub (a, b) + + Perform the subtract operation ``a - b`` on 16-bit floating point arguments in + round to nearest mode. Maps to the ``sub.f16`` PTX instruction. + + Returns the 16-bit floating point result of the subtraction. + +.. 
function:: numba.cuda.fp16.hmul (a, b) + + Perform the multiply operation ``a * b`` on 16-bit floating point arguments in + round to nearest mode. Maps to the ``mul.f16`` PTX instruction. + + Returns the 16-bit floating point result of the multiplication. + +.. function:: numba.cuda.fp16.hneg (a) + + Perform the negation operation ``-a`` on the 16-bit floating point argument. + Maps to the ``neg.f16`` PTX instruction. + + Returns the 16-bit floating point result of the negation. + +.. function:: numba.cuda.fp16.habs (a) + + Perform the absolute value operation ``|a|`` on the 16-bit floating point argument. + + Returns the 16-bit floating point result of the absolute value operation. + +.. function:: numba.cuda.fp16.heq (a, b) + + Perform the comparison operation ``a == b`` on 16-bit floating point arguments. + + Returns a boolean. + +.. function:: numba.cuda.fp16.hne (a, b) + + Perform the comparison operation ``a != b`` on 16-bit floating point arguments. + + Returns a boolean. + +.. function:: numba.cuda.fp16.hgt (a, b) + + Perform the comparison operation ``a > b`` on 16-bit floating point arguments. + + Returns a boolean. + +.. function:: numba.cuda.fp16.hge (a, b) + + Perform the comparison operation ``a >= b`` on 16-bit floating point arguments. + + Returns a boolean. + +.. function:: numba.cuda.fp16.hlt (a, b) + + Perform the comparison operation ``a < b`` on 16-bit floating point arguments. + + Returns a boolean. + +.. function:: numba.cuda.fp16.hle (a, b) + + Perform the comparison operation ``a <= b`` on 16-bit floating point arguments. + + Returns a boolean. + +.. function:: numba.cuda.fp16.hmax (a, b) + + Perform the operation ``a if a > b else b.`` + + Returns a 16-bit floating point value. + +.. function:: numba.cuda.fp16.hmin (a, b) + + Perform the operation ``a if a < b else b.`` + + Returns a 16-bit floating point value. 
+ +Control Flow Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +A subset of CUDA's control flow instructions are directly available as +intrinsics. Avoiding branches is a key way to improve CUDA performance, and +using these intrinsics means you don't have to rely on the ``nvcc`` optimizer +identifying and removing branches. For further documentation, including +semantics, please refer to the `relevant CUDA Toolkit documentation +`_. + + +.. function:: numba.cuda.selp + + Select between two expressions, depending on the value of the first + argument. Similar to LLVM's ``select`` instruction. + + +Timer Intrinsics +~~~~~~~~~~~~~~~~ + +.. function:: numba.cuda.nanosleep(ns) + + Suspends the thread for a sleep duration approximately close to the delay + ``ns``, specified in nanoseconds. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/libdevice.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/libdevice.rst new file mode 100644 index 0000000000000000000000000000000000000000..be3cf2080156da9ad04d850dadb6d485bf08cae8 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/libdevice.rst @@ -0,0 +1,16 @@ +Libdevice functions +=================== + +All wrapped libdevice functions are listed in this section. All functions in +libdevice are wrapped, with the exception of ``__nv_nan`` and ``__nv_nanf``. +These functions return a representation of a quiet NaN, but the argument they +take (a pointer to an object specifying the representation) is undocumented, and +follows an unusual form compared to the rest of libdevice - it is not an output +like every other pointer argument. If a NaN is required, one can be obtained in +CUDA Python by other means, e.g. ``math.nan``. + +Wrapped functions +----------------- + +..
automodule:: numba.cuda.libdevice + :members: diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/memory.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/memory.rst new file mode 100644 index 0000000000000000000000000000000000000000..b70995e961e395ca5cf87c038d1d2095ea2c76b3 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/memory.rst @@ -0,0 +1,24 @@ +Memory Management +================= + +.. autofunction:: numba.cuda.to_device +.. autofunction:: numba.cuda.device_array +.. autofunction:: numba.cuda.device_array_like +.. autofunction:: numba.cuda.pinned_array +.. autofunction:: numba.cuda.pinned_array_like +.. autofunction:: numba.cuda.mapped_array +.. autofunction:: numba.cuda.mapped_array_like +.. autofunction:: numba.cuda.managed_array +.. autofunction:: numba.cuda.pinned +.. autofunction:: numba.cuda.mapped + +Device Objects +-------------- + +.. autoclass:: numba.cuda.cudadrv.devicearray.DeviceNDArray + :members: copy_to_device, copy_to_host, is_c_contiguous, is_f_contiguous, + ravel, reshape, split +.. autoclass:: numba.cuda.cudadrv.devicearray.DeviceRecord + :members: copy_to_device, copy_to_host +.. autoclass:: numba.cuda.cudadrv.devicearray.MappedNDArray + :members: copy_to_device, copy_to_host, split diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/types.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/types.rst new file mode 100644 index 0000000000000000000000000000000000000000..31197241ea5ef11a526e16ee734f302d7e6f3734 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda-reference/types.rst @@ -0,0 +1,56 @@ +CUDA-Specific Types +==================== + +.. note:: + + This page is about types specific to CUDA targets. Many other types are also + available in the CUDA target - see :ref:`cuda-built-in-types`. 
+ +Vector Types +~~~~~~~~~~~~ + +`CUDA Vector Types `_ +are usable in kernels. There are two important distinctions from vector types in CUDA C/C++: + +First, the recommended names for vector types in Numba CUDA are formatted as ``<base_type>x<N>``, +where ``base_type`` is the base type of the vector, and ``N`` is the number of elements in the vector. +Examples include ``int64x3``, ``uint16x4``, ``float32x4``, etc. For new Numba CUDA kernels, +this is the recommended way to instantiate vector types. + +For convenience, users adapting existing kernels from CUDA C/C++ to Python may use +aliases consistent with the C/C++ namings. For example, ``float3`` aliases ``float32x3``, +``long3`` aliases ``int32x3`` or ``int64x3`` (depending on the platform), etc. + +Second, unlike CUDA C/C++ where factory functions are used, vector types are constructed directly +with their constructor. For example, to construct a ``float32x3``: + +.. code-block:: python3 + + from numba.cuda import float32x3 + + # In kernel + f3 = float32x3(0.0, -1.0, 1.0) + +Additionally, vector types can be constructed from a combination of vector and +primitive types, as long as the total number of components matches the result +vector type. For example, all of the following constructions are valid: + +.. code-block:: python3 + + zero = uint32(0) + u2 = uint32x2(1, 2) + # Construct a 3-component vector with primitive type and a 2-component vector + u3 = uint32x3(zero, u2) + # Construct a 4-component vector with 2 2-component vectors + u4 = uint32x4(u2, u2) + +The 1st, 2nd, 3rd and 4th component of the vector type can be accessed through fields +``x``, ``y``, ``z``, and ``w`` respectively. The components are immutable after +construction in the present version of Numba; it is expected that support for +mutating vector components will be added in a future release. + +..
code-block:: python3 + + v1 = float32x2(1.0, 1.0) + v2 = float32x2(1.0, -1.0) + dotprod = v1.x * v2.x + v1.y * v2.y diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/bindings.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/bindings.rst new file mode 100644 index 0000000000000000000000000000000000000000..d8425a91fd147c920964dd569b1f399aeeb58c08 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/bindings.rst @@ -0,0 +1,43 @@ +CUDA Bindings +============= + +Numba supports two bindings to the CUDA Driver APIs: its own internal bindings +based on ctypes, and the official `NVIDIA CUDA Python bindings +`_. Functionality is equivalent between +the two bindings. + +The internal bindings are used by default. If the NVIDIA bindings are installed, +then they can be used by setting the environment variable +``NUMBA_CUDA_USE_NVIDIA_BINDING`` to ``1`` prior to the import of Numba. Once +Numba has been imported, the selected binding cannot be changed. + + +Per-Thread Default Streams +-------------------------- + +Responsibility for handling Per-Thread Default Streams (PTDS) is delegated to +the NVIDIA bindings when they are in use. To use PTDS with the NVIDIA bindings, +set the environment variable ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` to +``1`` instead of Numba's environment variable +:envvar:`NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`. + +.. seealso:: + + The `Default Stream section + `_ + in the NVIDIA Bindings documentation. + + +Roadmap +------- + +In Numba 0.56, the NVIDIA Bindings will be used by default, if they are +installed. + +In future versions of Numba: + +- The internal bindings will be deprecated. +- The internal bindings will be removed. + +At present, no specific release is planned for the deprecation or removal of +the internal bindings.
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/caching.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/caching.rst new file mode 100644 index 0000000000000000000000000000000000000000..06c84ff775d6678a9848294b363b9c409ecf636a --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/caching.rst @@ -0,0 +1,35 @@ +On-disk Kernel Caching +====================== + +When the ``cache`` keyword argument of the :func:`@cuda.jit <numba.cuda.jit>` +decorator is ``True``, a file-based cache is enabled. This shortens compilation +times when the function was already compiled in a previous invocation. + +The cache is maintained in the ``__pycache__`` subdirectory of the directory +containing the source file; if the current user is not allowed to write to it, +the cache implementation falls back to a platform-specific user-wide cache +directory (such as ``$HOME/.cache/numba`` on Unix platforms). + + +Compute capability considerations +--------------------------------- + +Separate cache files are maintained for each compute capability. When a cached +kernel is loaded, the compute capability of the device the kernel is first +launched on in the current run is used to determine which version to load. +Therefore, on systems that have multiple GPUs with differing compute +capabilities, the cached versions of kernels are only used for one compute +capability, and recompilation will occur for other compute capabilities. + +For example: if a system has two GPUs, one of compute capability 7.5 and one of +8.0, then: + +* If a cached kernel is first launched on the CC 7.5 device, then the cached + version for CC 7.5 is used. If it is subsequently launched on the CC 8.0 + device, a recompilation will occur. +* If in a subsequent run the cached kernel is first launched on the CC 8.0 + device, then the cached version for CC 8.0 is used. A subsequent launch on + the CC 7.5 device will require a recompilation.
+ +This limitation is not expected to present issues in most practical scenarios, +as multi-GPU production systems tend to have identical GPUs within each node. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/cooperative_groups.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/cooperative_groups.rst new file mode 100644 index 0000000000000000000000000000000000000000..a51e8ffcbdd21de355463ba1be96a5a4cede59a9 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/cooperative_groups.rst @@ -0,0 +1,111 @@ +================== +Cooperative Groups +================== + +Supported features +------------------ + +Numba's Cooperative Groups support presently provides grid groups and grid +synchronization, along with cooperative kernel launches. + +Cooperative groups are supported on Linux, and Windows for devices in `TCC +mode +`_. +Cooperative Groups also require the CUDA Device Runtime library, ``cudadevrt``, +to be available - for conda default channel-installed CUDA toolkit packages, it +is only available in versions 10.2 onwards. System-installed toolkits (e.g. from +NVIDIA distribution packages or runfiles) all include ``cudadevrt``. + +Using Grid Groups +----------------- + +To get the current grid group, use the :meth:`cg.this_grid() +` function: + +.. code-block:: python + + g = cuda.cg.this_grid() + +Synchronizing the grid is done with the :meth:`sync() +` method of the grid group: + +.. code-block:: python + + g.sync() + + +Cooperative Launches +-------------------- + +Unlike the CUDA C/C++ API, a cooperative launch is invoked using the same syntax +as a normal kernel launch - Numba automatically determines whether a cooperative +launch is required based on whether a grid group is synchronized in the kernel. 
+ +The grid size limit for a cooperative launch is more restrictive than for a +normal launch - the grid must be no larger than the maximum number of active +blocks on the device on which it is launched. To get the maximum grid size for a +cooperative launch of a kernel with a given block size and dynamic shared +memory requirement, use the ``max_cooperative_grid_blocks()`` method of kernel +overloads: + +.. automethod:: numba.cuda.dispatcher._Kernel.max_cooperative_grid_blocks + +This can be used to ensure that the kernel is launched with no more than the +maximum number of blocks. Exceeding the maximum number of blocks for the +cooperative launch will result in a ``CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE`` +error. + + +Applications and Example +------------------------ + +Grid group synchronization can be used to implement a global barrier across all +threads in the grid - applications of this include a global reduction to a +single value, or looping over rows of a large matrix sequentially using the +entire grid to operate on column elements in parallel. + +In the following example, rows are written sequentially by the grid. Each thread +in the grid reads a value from the previous row written by its *opposite* +thread. A grid sync is needed to ensure that threads in the grid don't run ahead +of threads in other blocks, or fail to see updates from their opposite thread. + +First we'll define our kernel: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py + :language: python + :caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py`` + :start-after: magictoken.ex_grid_sync_kernel.begin + :end-before: magictoken.ex_grid_sync_kernel.end + :dedent: 8 + :linenos: + +Then create some empty input data and determine the grid and block sizes: + +..
literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py + :language: python + :caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py`` + :start-after: magictoken.ex_grid_sync_data.begin + :end-before: magictoken.ex_grid_sync_data.end + :dedent: 8 + :linenos: + +Finally we launch the kernel and print the result: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py + :language: python + :caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py`` + :start-after: magictoken.ex_grid_sync_launch.begin + :end-before: magictoken.ex_grid_sync_launch.end + :dedent: 8 + :linenos: + + +The maximum grid size for ``sequential_rows`` can be enquired using: + + +.. code-block:: python + + overload = sequential_rows.overloads[(int32[:,::1],)] + max_blocks = overload.max_cooperative_grid_blocks(blockdim) + print(max_blocks) + # 1152 (e.g. on Quadro RTX 8000 with Numba 0.52.1 and CUDA 11.0) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/cuda_array_interface.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/cuda_array_interface.rst new file mode 100644 index 0000000000000000000000000000000000000000..304f4ecab2493768dc06baaef2acff24dae6e5a3 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/cuda_array_interface.rst @@ -0,0 +1,531 @@ +.. _cuda-array-interface: + +================================ +CUDA Array Interface (Version 3) +================================ + +The *CUDA Array Interface* (or CAI) is created for interoperability between +different implementations of CUDA array-like objects in various projects. The +idea is borrowed from the `NumPy array interface`_. + + +.. note:: + Currently, we only define the Python-side interface. In the future, we may + add a C-side interface for efficient exchange of the information in + compiled code. + + +Python Interface Specification +============================== + +.. note:: Experimental feature.
Specification may change. + +The ``__cuda_array_interface__`` attribute returns a dictionary (``dict``) +that must contain the following entries: + +- **shape**: ``(integer, ...)`` + + A tuple of ``int`` (or ``long``) representing the size of each dimension. + +- **typestr**: ``str`` + + The type string. This has the same definition as ``typestr`` in the + `NumPy array interface`_. + +- **data**: ``(integer, boolean)`` + + The **data** is a 2-tuple. The first element is the data pointer + as a Python ``int`` (or ``long``). The data must be device-accessible. + For zero-size arrays, use ``0`` here. + The second element is the read-only flag as a Python ``bool``. + + Because the user of the interface may or may not be in the same context, + the most common case is to use ``cuPointerGetAttribute`` with + ``CU_POINTER_ATTRIBUTE_DEVICE_POINTER`` in the CUDA driver API (or the + equivalent CUDA Runtime API) to retrieve a device pointer that + is usable in the currently active context. + +- **version**: ``integer`` + + An integer for the version of the interface being exported. + The current version is *3*. + + +The following are optional entries: + +- **strides**: ``None`` or ``(integer, ...)`` + + If **strides** is not given, or it is ``None``, the array is in + C-contiguous layout. Otherwise, a tuple of ``int`` (or ``long``) is explicitly + given for representing the number of bytes to skip to access the next + element at each dimension. + +- **descr** + + This is for describing more complicated types. This follows the same + specification as in the `NumPy array interface`_. + +- **mask**: ``None`` or object exposing the ``__cuda_array_interface__`` + + If ``None`` then all values in **data** are valid. All elements of the mask + array should be interpreted only as true or not true indicating which + elements of this array are valid. This has the same definition as ``mask`` + in the `NumPy array interface`_. + + .. 
note:: Numba does not currently support working with masked CUDA arrays + and will raise a ``NotImplementedError`` exception if one is passed + to a GPU function. + +- **stream**: ``None`` or ``integer`` + + An optional stream upon which synchronization must take place at the point of + consumption, either by synchronizing on the stream or enqueuing operations on + the data on the given stream. Integer values in this entry are as follows: + + - ``0``: This is disallowed as it would be ambiguous between ``None`` and the + default stream, and also between the legacy and per-thread default streams. + Any use case where ``0`` might be given should either use ``None``, ``1``, + or ``2`` instead for clarity. + - ``1``: The legacy default stream. + - ``2``: The per-thread default stream. + - Any other integer: a ``cudaStream_t`` represented as a Python integer. + + When ``None``, no synchronization is required. See the + :ref:`cuda-array-interface-synchronization` section below for further details. + + In a future revision of the interface, this entry may be expanded (or another + entry added) so that an event to synchronize on can be specified instead of a + stream. + + +.. _cuda-array-interface-synchronization: + +Synchronization +--------------- + +Definitions +~~~~~~~~~~~ + +When discussing synchronization, the following definitions are used: + +- *Producer*: The library / object on which ``__cuda_array_interface__`` is + accessed. +- *Consumer*: The library / function that accesses the + ``__cuda_array_interface__`` of the Producer. +- *User Code*: Code that induces a Producer and Consumer to share data through + the CAI. +- *User*: The person writing or maintaining the User Code. The User may + implement User Code without knowledge of the CAI, since the CAI accesses can + be hidden from their view. + +In the following example: + +.. 
code-block:: python + + import cupy + from numba import cuda + + @cuda.jit + def add(x, y, out): + start = cuda.grid(1) + stride = cuda.gridsize(1) + for i in range(start, x.shape[0], stride): + out[i] = x[i] + y[i] + + a = cupy.arange(10) + b = a * 2 + out = cupy.zeros_like(a) + + add[1, 32](a, b, out) + +When the ``add`` kernel is launched: + +- ``a``, ``b``, ``out`` are Producers. +- The ``add`` kernel is the Consumer. +- The User Code is specifically ``add[1, 32](a, b, out)``. +- The author of the code is the User. + + +Design Motivations +~~~~~~~~~~~~~~~~~~ + +Elements of the CAI design related to synchronization seek to fulfill these +requirements: + +1. Producers and Consumers that exchange data through the CAI must be able to do + so without data races. +2. Requirement 1 should be met without requiring the user to be + aware of any particulars of the CAI - in other words, exchanging data between + Producers and Consumers that operate on data asynchronously should be correct + by default. + + - An exception to this requirement is made for Producers and Consumers that + explicitly document that the User is required to take additional steps to + ensure correctness with respect to synchronization. In this case, Users + are required to understand the details of the CUDA Array Interface, and + the Producer/Consumer library documentation must specify the steps that + Users are required to take. + + Use of this exception should be avoided where possible, as it is provided + for libraries that cannot implement the synchronization semantics without + the involvement of the User - for example, those interfacing with + third-party libraries oblivious to the CUDA Array Interface. + +3. Where the User is aware of the particulars of the CAI and implementation + details of the Producer and Consumer, they should be able to, at their + discretion, override some of the synchronization semantics of the interface + to reduce the synchronization overhead. 
Overriding synchronization semantics + implies that: + + - The CAI design, and the design and implementation of the Producer and + Consumer do not specify or guarantee correctness with respect to data + races. + - Instead, the User is responsible for ensuring correctness with respect to + data races. + + +Interface Requirements +~~~~~~~~~~~~~~~~~~~~~~ + +The ``stream`` entry enables Producers and Consumers to avoid hazards when +exchanging data. Expected behaviour of the Consumer is as follows: + +* When ``stream`` is not present or is ``None``: + + - No synchronization is required on the part of the Consumer. + - The Consumer may enqueue operations on the underlying data immediately on + any stream. + +* When ``stream`` is an integer, its value indicates the stream on which the + Producer may have in-progress operations on the data, and which the Consumer + is expected to either: + + - Synchronize on before accessing the data, or + - Enqueue operations in when accessing the data. + + The Consumer can choose which mechanism to use, with the following + considerations: + + - If the Consumer synchronizes on the provided stream prior to accessing the + data, then it must ensure that no computation can take place in the provided + stream until its operations in its own choice of stream have taken place. + This could be achieved by either: + + - Placing a wait on an event in the provided stream that occurs once all + of the Consumer's operations on the data are completed, or + - Avoiding returning control to the user code until after its operations + on its own stream have completed. + + - If the consumer chooses to only enqueue operations on the data in the + provided stream, then it may return control to the User code immediately + after enqueueing its work, as the work will all be serialized on the + exported array's stream. 
This is sufficient to ensure correctness even if + the User code were to induce the Producer to subsequently start enqueueing + more work on the same stream. + +* If the User has set the Consumer to ignore CAI synchronization semantics, the + Consumer may assume it can operate on the data immediately in any stream with + no further synchronization, even if the ``stream`` member has an integer + value. + + +When exporting an array through the CAI, Producers must ensure that: + +* If there is work on the data enqueued in one or more streams, then + synchronization on the provided ``stream`` is sufficient to ensure + synchronization with all pending work. + + - If the Producer has no enqueued work, or work only enqueued on the stream + identified by ``stream``, then this condition is met. + - If the Producer has enqueued work on the data on multiple streams, then it + must enqueue events on those streams that follow the enqueued work, and + then wait on those events in the provided ``stream``. For example: + + 1. Work is enqueued by the Producer on streams ``7``, ``9``, and ``15``. + 2. Events are then enqueued on each of streams ``7``, ``9``, and ``15``. + 3. Producer then tells stream ``3`` to wait on the events from Step 2, and + the ``stream`` entry is set to ``3``. + +* If there is no work enqueued on the data, then the ``stream`` entry may be + either ``None``, or not provided. + +Optionally, to facilitate the User relaxing conformance to synchronization +semantics: + +* Producers may provide a configuration option to always set ``stream`` to + ``None``. +* Consumers may provide a configuration option to ignore the value of ``stream`` + and act as if it were ``None`` or not provided. This elides synchronization + on the Producer-provided streams, and allows enqueuing work on streams other + than that provided by the Producer. + +These options should not be set by default in either a Producer or a Consumer. 
+The CAI specification does not prescribe the exact mechanism by which these +options are set, or related options that Producers or Consumers might provide +to allow the user further control over synchronization behavior. + + +Synchronization in Numba +~~~~~~~~~~~~~~~~~~~~~~~~ + +Numba is neither strictly a Producer nor a Consumer - it may be used to +implement either by a User. In order to facilitate the correct implementation of +synchronization semantics, Numba exhibits the following behaviors related to +synchronization of the interface: + +- When Numba acts as a Consumer (for example when an array-like object is passed + to a kernel launch): If ``stream`` is an integer, then Numba will immediately + synchronize on the provided ``stream``. A Numba :class:`Device Array + ` created from an array-like + object has its *default stream* set to the provided stream. + +- When Numba acts as a Producer (when the ``__cuda_array_interface__`` property + of a Numba CUDA Array is accessed): If the exported CUDA Array has a + *default stream*, then it is given as the ``stream`` entry. Otherwise, + ``stream`` is set to ``None``. + +.. note:: In Numba's terminology, an array's *default stream* is a property + specifying the stream that Numba will enqueue asynchronous + transfers in if no other stream is provided as an argument to the + function invoking the transfer. It is not the same as the `Default + Stream + `_ + in normal CUDA terminology. + +Numba's synchronization behavior results in the following intended +consequences: + +- Exchanging data either as a Producer or a Consumer will be correct without + the need for any further action from the User, provided that the other side + of the interaction also follows the CAI synchronization semantics. 
+- The User is expected to either: + + - Avoid launching kernels or other operations on streams that + are not the default stream for their parameters, or + - When launching operations on a stream that is not the default stream for + a given parameter, they should then insert an event into the stream that + they are operating in, and wait on that event in the default stream for + the parameter. For an example of this, :ref:`see below + <example-multi-streams>`. + +The User may override Numba's synchronization behavior by setting the +environment variable ``NUMBA_CUDA_ARRAY_INTERFACE_SYNC`` or the config variable +``CUDA_ARRAY_INTERFACE_SYNC`` to ``0`` (see :ref:`GPU Support Environment +Variables <numba-envvars-gpu-support>`). When set to ``0``, Numba will not synchronize +on the streams of imported arrays, and it is the responsibility of the User to +ensure correctness with respect to stream synchronization. Synchronization when +creating a Numba CUDA Array from an object exporting the CUDA Array Interface +may also be elided by passing ``sync=False`` when creating the Numba CUDA +Array with :func:`numba.cuda.as_cuda_array` or +:func:`numba.cuda.from_cuda_array_interface`. + +There is scope for Numba's synchronization implementation to be optimized in +the future, by eliding synchronizations when a kernel or driver API operation +(e.g. a memcopy or memset) is launched on the same stream as an imported +array. + + +.. _example-multi-streams: + +An example launching on an array's non-default stream +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This example shows how to ensure that a Consumer can safely consume an array +with a default stream when it is passed to a kernel launched in a different +stream. + +First we need to import Numba and a consumer library (a fictitious library named +``other_cai_library`` for this example): + +.. 
code-block:: python + + from numba import cuda, int32, void + import other_cai_library + +Now we'll define a kernel - this initializes the elements of the array, setting +each entry to its index: + +.. code-block:: python + + @cuda.jit(void(int32[::1])) + def initialize_array(x): + i = cuda.grid(1) + if i < len(x): + x[i] = i + +Next we will create two streams: + +.. code-block:: python + + array_stream = cuda.stream() + kernel_stream = cuda.stream() + +Then create an array with one of the streams as its default stream: + +.. code-block:: python + + N = 16384 + x = cuda.device_array(N, dtype="int32", stream=array_stream) + +Now we launch the kernel in the other stream: + +.. code-block:: python + + nthreads = 256 + nblocks = N // nthreads + + initialize_array[nblocks, nthreads, kernel_stream](x) + +If we were to pass ``x`` to a Consumer now, there is a risk that it may operate on +it in ``array_stream`` whilst the kernel is still running in ``kernel_stream``. +To prevent operations in ``array_stream`` starting before the kernel launch is +finished, we create an event and wait on it: + +.. code-block:: python + + # Create event + evt = cuda.event() + # Record the event after the kernel launch in kernel_stream + evt.record(kernel_stream) + # Wait for the event in array_stream + evt.wait(array_stream) + +It is now safe for ``other_cai_library`` to consume ``x``: + +.. code-block:: python + + other_cai_library.consume(x) + + +Lifetime management +------------------- + +Data +~~~~ + +Obtaining the value of the ``__cuda_array_interface__`` property of any object +has no effect on the lifetime of the object from which it was created. In +particular, note that the interface has no slot for the owner of the data. + +The User code must preserve the lifetime of the object owning the data for as +long as the Consumer might use it. + + +Streams +~~~~~~~ + +Like data, CUDA streams also have a finite lifetime. 
It is therefore required +that a Producer exporting data on the interface with an associated stream +ensures that the exported stream's lifetime is equal to or surpasses the +lifetime of the object from which the interface was exported. + + +Lifetime management in Numba +---------------------------- + +Producing Arrays +~~~~~~~~~~~~~~~~ + +Numba takes no steps to maintain the lifetime of an object from which the +interface is exported - it is the user's responsibility to ensure that the +underlying object is kept alive for the duration that the exported interface +might be used. + +The lifetime of any Numba-managed stream exported on the interface is guaranteed +to equal or surpass the lifetime of the underlying object, because the +underlying object holds a reference to the stream. + +.. note:: Numba-managed streams are those created with + ``cuda.default_stream()``, ``cuda.legacy_default_stream()``, or + ``cuda.per_thread_default_stream()``. Streams not managed by Numba + are created from an external stream with ``cuda.external_stream()``. + + +Consuming Arrays +~~~~~~~~~~~~~~~~ + +Numba provides two mechanisms for creating device arrays from objects exporting +the CUDA Array Interface. Which to use depends on whether the created device +array should maintain the life of the object from which it is created: + +- ``as_cuda_array``: This creates a device array that holds a reference to the + owning object. As long as a reference to the device array is held, its + underlying data will also be kept alive, even if all other references to the + original owning object have been dropped. +- ``from_cuda_array_interface``: This creates a device array with no reference + to the owning object by default. The owning object, or some other object to + be considered the owner can be passed in the ``owner`` parameter. + +The interfaces of these functions are: + +.. automethod:: numba.cuda.as_cuda_array + +.. 
automethod:: numba.cuda.from_cuda_array_interface + + +Pointer Attributes +------------------ + +Additional information about the data pointer can be retrieved using +``cuPointerGetAttribute`` or ``cudaPointerGetAttributes``. Such information +includes: + +- the CUDA context that owns the pointer; +- whether the pointer is host-accessible; +- whether the pointer refers to managed memory. + + +.. _NumPy array interface: https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html#__array_interface__ + + +Differences with CUDA Array Interface (Version 0) +------------------------------------------------- + +Version 0 of the CUDA Array Interface did not have the optional **mask** +attribute to support masked arrays. + + +Differences with CUDA Array Interface (Version 1) +------------------------------------------------- + +Versions 0 and 1 of the CUDA Array Interface neither clarified the +**strides** attribute for C-contiguous arrays nor specified the treatment for +zero-size arrays. + + +Differences with CUDA Array Interface (Version 2) +------------------------------------------------- + +Prior versions of the CUDA Array Interface made no statement about +synchronization. + + +Interoperability +---------------- + +The following Python libraries have adopted the CUDA Array Interface: + +- Numba +- `CuPy `_ +- `PyTorch `_ +- `PyArrow `_ +- `mpi4py `_ +- `ArrayViews `_ +- `JAX `_ +- `PyCUDA `_ +- `DALI: the NVIDIA Data Loading Library `_ : + + - `TensorGPU objects + `_ + expose the CUDA Array Interface. + - `The External Source operator + `_ + consumes objects exporting the CUDA Array Interface. +- The RAPIDS stack: + + - `cuDF `_ + - `cuML `_ + - `cuSignal `_ + - `RMM `_ + +If your project is not on this list, please feel free to report it on the `Numba issue tracker <https://github.com/numba/numba/issues>`_. 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/cuda_ffi.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/cuda_ffi.rst new file mode 100644 index 0000000000000000000000000000000000000000..1ee441254c02e59dbcd35d87c4bc6c02caff2862 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/cuda_ffi.rst @@ -0,0 +1,158 @@ + +.. _cuda_ffi: + +Calling foreign functions from Python kernels +============================================= + +Python kernels can call device functions written in other languages. CUDA C/C++, +PTX, and binary objects (cubins, fat binaries, etc.) are directly supported; +sources in other languages must be compiled to PTX first. The constituent parts +of a Python kernel call to a foreign device function are: + +- The device function implementation in a foreign language (e.g. CUDA C). +- A declaration of the device function in Python. +- A kernel that links with and calls the foreign function. + + +Device function ABI +------------------- + +Numba's ABI for calling device functions defines the following prototype in +C/C++: + +.. code:: C + + extern "C" + __device__ int + function( + T* return_value, + ... + ); + + +Components of the prototype are as follows: + +- ``extern "C"`` is used to prevent name-mangling so that it is easy to declare + the function in Python. It can be removed, but then the mangled name must be + used in the declaration of the function in Python. +- ``__device__`` is required to define the function as a device function. +- The return value is always of type ``int``, and is used to signal whether a + Python exception occurred. Since Python exceptions don't occur in foreign + functions, this should always be set to 0 by the callee. +- The first argument is a pointer to the return value of type ``T``, which is + allocated in the local address space [#f1]_ and passed in by the caller. 
If + the function returns a value, the pointee should be set by the callee to + store the return value. +- Subsequent arguments should match the types and order of arguments passed to + the function from the Python kernel. + +Functions written in other languages must compile to PTX that conforms to this +prototype specification. + +A function that accepts two floats and returns a float would have the following +prototype: + +.. code:: C + + extern "C" + __device__ int + mul_f32_f32( + float* return_value, + float x, + float y + ) + +.. rubric:: Notes + +.. [#f1] Care must be taken to ensure that any operations on the return value + are applicable to data in the local address space. Some operations, + such as atomics, cannot be performed on data in the local address + space. + +Declaration in Python +--------------------- + +To declare a foreign device function in Python, use :func:`declare_device() +<numba.cuda.declare_device>`: + +.. autofunction:: numba.cuda.declare_device + +The returned descriptor name need not match the name of the foreign function. +For example, when: + +.. code:: + + mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)') + +is declared, calling ``mul(a, b)`` inside a kernel will translate into a call to +``mul_f32_f32(a, b)`` in the compiled code. + + +Linking and Calling functions +----------------------------- + +The ``link`` keyword argument of the :func:`@cuda.jit <numba.cuda.jit>` +decorator accepts a list of file names specified by absolute path or a path +relative to the current working directory. Files whose name ends in ``.cu`` +will be compiled with the `NVIDIA Runtime Compiler (NVRTC) +<https://docs.nvidia.com/cuda/nvrtc/index.html>`_ and linked into the kernel as +PTX; other files will be passed directly to the CUDA Linker. + +For example, the following kernel calls the ``mul()`` function declared above +with the implementation ``mul_f32_f32()`` in a file called ``functions.cu``: + +.. 
code:: + + @cuda.jit(link=['functions.cu']) + def multiply_vectors(r, x, y): + i = cuda.grid(1) + + if i < len(r): + r[i] = mul(x[i], y[i]) + + +C/C++ Support +------------- + +Support for compiling and linking of CUDA C/C++ code is provided through the use +of NVRTC subject to the following considerations: + +- It is only available when using the NVIDIA Bindings. See + :envvar:`NUMBA_CUDA_USE_NVIDIA_BINDING`. +- A suitable version of the NVRTC library for the installed version of the + NVIDIA CUDA Bindings must be available. +- The CUDA include path is assumed by default to be ``/usr/local/cuda/include`` + on Linux and ``$env:CUDA_PATH\include`` on Windows. It can be modified using + the environment variable :envvar:`NUMBA_CUDA_INCLUDE_PATH`. +- The CUDA include directory will be made available to NVRTC on the include + path; additional includes are not supported. + + +Complete Example +---------------- + +This example demonstrates calling a foreign function written in CUDA C to +multiply pairs of numbers from two arrays. + +The foreign function is written as follows: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/ffi/functions.cu + :language: C + :caption: ``numba/cuda/tests/doc_examples/ffi/functions.cu`` + :linenos: + +The Python code and kernel are: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_ffi.py + :language: python + :caption: from ``test_ex_linking_cu`` in ``numba/cuda/tests/doc_examples/test_ffi.py`` + :start-after: magictoken.ex_linking_cu.begin + :end-before: magictoken.ex_linking_cu.end + :dedent: 8 + :linenos: + +.. note:: + + The example above is minimal in order to illustrate a foreign function call - + it would not be expected to be particularly performant due to the small grid + and light workload of the foreign function. 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/cudapysupported.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/cudapysupported.rst new file mode 100644 index 0000000000000000000000000000000000000000..d5dc5a7908b9a21dfa4946943c43ed2a0fa14bab --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/cudapysupported.rst @@ -0,0 +1,296 @@ +======================================== +Supported Python features in CUDA Python +======================================== + +This page lists the Python features supported in CUDA Python. This includes +all kernel and device functions compiled with ``@cuda.jit`` and other higher +level Numba decorators that target the CUDA GPU. + +Language +======== + +Execution Model +--------------- + +CUDA Python maps directly to the *single-instruction multiple-thread* +execution (SIMT) model of CUDA. Each instruction is implicitly +executed by multiple threads in parallel. With this execution model, array +expressions are less useful because we don't want multiple threads to perform +the same task. Instead, we want threads to perform a task in a cooperative +fashion. + +For details please consult the +`CUDA Programming Guide +<https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html>`_. + +Floating Point Error Model +-------------------------- + +By default, CUDA Python kernels execute with the NumPy error model. In this +model, division by zero raises no exception and instead produces a result of +``inf``, ``-inf`` or ``nan``. This differs from the normal Python error model, +in which division by zero raises a ``ZeroDivisionError``. + +When debug is enabled (by passing ``debug=True`` to the +:func:`@cuda.jit <numba.cuda.jit>` decorator), the Python error model is used. +This allows division-by-zero errors during kernel execution to be identified. + +Constructs +---------- + +The following Python constructs are not supported: + +* Exception handling (``try .. except``, ``try .. 
finally``) +* Context management (the ``with`` statement) +* Comprehensions (either list, dict, set or generator comprehensions) +* Generator (any ``yield`` statements) + +The ``raise`` and ``assert`` statements are supported, with the following +constraints: + +- They can only be used in kernels, not in device functions. +- They only have an effect when ``debug=True`` is passed to the + :func:`@cuda.jit ` decorator. This is similar to the behavior + of the ``assert`` keyword in CUDA C/C++, which is ignored unless compiling + with device debug turned on. + + +Printing of strings, integers, and floats is supported, but printing is an +asynchronous operation - in order to ensure that all output is printed after a +kernel launch, it is necessary to call :func:`numba.cuda.synchronize`. Eliding +the call to ``synchronize`` is acceptable, but output from a kernel may appear +during other later driver operations (e.g. subsequent kernel launches, memory +transfers, etc.), or fail to appear before the program execution completes. Up +to 32 arguments may be passed to the ``print`` function - if more are passed +then a format string will be emitted instead and a warning will be produced. +This is due to a general limitation in CUDA printing, as outlined in the +`section on limitations in printing +`_ +in the CUDA C++ Programming Guide. + + +Recursion +--------- + +Self-recursive device functions are supported, with the constraint that +recursive calls must have the same argument types as the initial call to +the function. For example, the following form of recursion is supported: + +.. code:: python + + @cuda.jit("int64(int64)", device=True) + def fib(n): + if n < 2: + return n + return fib(n - 1) + fib(n - 2) + +(the ``fib`` function always has an ``int64`` argument), whereas the following +is unsupported: + +.. 
code:: python + + # Called with x := int64, y := float64 + @cuda.jit + def type_change_self(x, y): + if x > 1 and y > 0: + return x + type_change_self(x - y, y) + else: + return y + +The outer call to ``type_change_self`` provides ``(int64, float64)`` arguments, +but the inner call uses ``(float64, float64)`` arguments (because ``x - y`` / +``int64 - float64`` results in a ``float64`` type). Therefore, this function is +unsupported. + +Mutual recursion between functions (e.g. where a function ``func1()`` calls +``func2()`` which again calls ``func1()``) is unsupported. + +.. note:: + + The call stack in CUDA is typically quite limited in size, so it is easier + to overflow it with recursive calls on CUDA devices than it is on CPUs. + + Stack overflow will result in an Unspecified Launch Failure (ULF) during + kernel execution. In order to identify whether a ULF is due to stack + overflow, programs can be run under `Compute Sanitizer + <https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html>`_, + which explicitly states when stack overflow has occurred. + +.. _cuda-built-in-types: + +Built-in types +=============== + +Support for the following built-in types is inherited from CPU nopython mode. + +* int +* float +* complex +* bool +* None +* tuple +* Enum, IntEnum + +See :ref:`nopython built-in types <pysupported-builtin-types>`. + +There is also some very limited support for character sequences (bytes and +unicode strings) used in NumPy arrays. Note that this support can only be used +with CUDA 11.2 onwards. 
+ +Built-in functions +================== + +The following built-in functions are supported: + +* :func:`abs` +* :class:`bool` +* :class:`complex` +* :func:`enumerate` +* :class:`float` +* :class:`int`: only the one-argument form +* :func:`len` +* :func:`min`: only the multiple-argument form +* :func:`max`: only the multiple-argument form +* :func:`pow` +* :class:`range` +* :func:`round` +* :func:`zip` + + +Standard library modules +======================== + + +``cmath`` +--------- + +The following functions from the :mod:`cmath` module are supported: + +* :func:`cmath.acos` +* :func:`cmath.acosh` +* :func:`cmath.asin` +* :func:`cmath.asinh` +* :func:`cmath.atan` +* :func:`cmath.atanh` +* :func:`cmath.cos` +* :func:`cmath.cosh` +* :func:`cmath.exp` +* :func:`cmath.isfinite` +* :func:`cmath.isinf` +* :func:`cmath.isnan` +* :func:`cmath.log` +* :func:`cmath.log10` +* :func:`cmath.phase` +* :func:`cmath.polar` +* :func:`cmath.rect` +* :func:`cmath.sin` +* :func:`cmath.sinh` +* :func:`cmath.sqrt` +* :func:`cmath.tan` +* :func:`cmath.tanh` + +``math`` +-------- + +The following functions from the :mod:`math` module are supported: + +* :func:`math.acos` +* :func:`math.asin` +* :func:`math.atan` +* :func:`math.acosh` +* :func:`math.asinh` +* :func:`math.atanh` +* :func:`math.cos` +* :func:`math.sin` +* :func:`math.tan` +* :func:`math.hypot` +* :func:`math.cosh` +* :func:`math.sinh` +* :func:`math.tanh` +* :func:`math.atan2` +* :func:`math.erf` +* :func:`math.erfc` +* :func:`math.exp` +* :func:`math.expm1` +* :func:`math.fabs` +* :func:`math.frexp` +* :func:`math.ldexp` +* :func:`math.gamma` +* :func:`math.lgamma` +* :func:`math.log` +* :func:`math.log2` +* :func:`math.log10` +* :func:`math.log1p` +* :func:`math.sqrt` +* :func:`math.remainder`: Python 3.7+ +* :func:`math.pow` +* :func:`math.ceil` +* :func:`math.floor` +* :func:`math.copysign` +* :func:`math.fmod` +* :func:`math.modf` +* :func:`math.isnan` +* :func:`math.isinf` +* :func:`math.isfinite` + + +``operator`` 
+------------ + +The following functions from the :mod:`operator` module are supported: + +* :func:`operator.add` +* :func:`operator.and_` +* :func:`operator.eq` +* :func:`operator.floordiv` +* :func:`operator.ge` +* :func:`operator.gt` +* :func:`operator.iadd` +* :func:`operator.iand` +* :func:`operator.ifloordiv` +* :func:`operator.ilshift` +* :func:`operator.imod` +* :func:`operator.imul` +* :func:`operator.invert` +* :func:`operator.ior` +* :func:`operator.ipow` +* :func:`operator.irshift` +* :func:`operator.isub` +* :func:`operator.itruediv` +* :func:`operator.ixor` +* :func:`operator.le` +* :func:`operator.lshift` +* :func:`operator.lt` +* :func:`operator.mod` +* :func:`operator.mul` +* :func:`operator.ne` +* :func:`operator.neg` +* :func:`operator.not_` +* :func:`operator.or_` +* :func:`operator.pos` +* :func:`operator.pow` +* :func:`operator.rshift` +* :func:`operator.sub` +* :func:`operator.truediv` +* :func:`operator.xor` + + +NumPy support +============= + +Due to the CUDA programming model, dynamic memory allocation inside a kernel is +inefficient and is often not needed. Numba disallows any memory-allocating features. +This disables a large number of NumPy APIs. For best performance, users should write +code such that each thread is dealing with a single element at a time. + +Supported NumPy features: + +* accessing `ndarray` attributes `.shape`, `.strides`, `.ndim`, `.size`, etc. +* scalar ufuncs that have equivalents in the `math` module; e.g. ``np.sin(x[0])``, where x is a 1D array. +* indexing and slicing. + +Unsupported NumPy features: + +* array creation APIs. +* array methods. +* functions that return a new array. 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/device-functions.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/device-functions.rst new file mode 100644 index 0000000000000000000000000000000000000000..4fba8c66ff3cef2856810ee571a2a375287f6d2b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/device-functions.rst @@ -0,0 +1,15 @@ + +Writing Device Functions +======================== + +CUDA device functions can only be invoked from within the device (by a kernel +or another device function). To define a device function:: + + from numba import cuda + + @cuda.jit(device=True) + def a_device_function(a, b): + return a + b + +Unlike a kernel function, a device function can return a value like normal +functions. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/device-management.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/device-management.rst new file mode 100644 index 0000000000000000000000000000000000000000..8f9beb4db1ba42cbf056342dce0d887e97910f72 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/device-management.rst @@ -0,0 +1,92 @@ + +Device management +================= + +For multi-GPU machines, users may want to select which GPU to use. +By default the CUDA driver selects the fastest GPU as the device 0, +which is the default device used by Numba. + +The features introduced on this page are generally not of interest +unless working with systems hosting/offering more than one CUDA-capable GPU. + +Device Selection +---------------- + +If at all required, device selection must be done before any CUDA feature is +used. + +:: + + from numba import cuda + cuda.select_device(0) + +The device can be closed by: + +:: + + cuda.close() + +Users can then create a new context with another device. + +:: + + cuda.select_device(1) # assuming we have 2 GPUs + + +.. 
function:: numba.cuda.select_device(device_id) + :noindex: + + Create a new CUDA context for the selected *device_id*. *device_id* + should be the number of the device (starting from 0; the device order + is determined by the CUDA libraries). The context is associated with + the current thread. Numba currently allows only one context per thread. + + If successful, this function returns a device instance. + + .. XXX document device instances? + + +.. function:: numba.cuda.close + :noindex: + + Explicitly close all contexts in the current thread. + + .. note:: + Compiled functions are associated with the CUDA context. + This makes it not very useful to close and create new devices, though it + is certainly useful for choosing which device to use when the machine + has multiple GPUs. + +The Device List +=============== + +The Device List is a list of all the GPUs in the system, and can be indexed to +obtain a context manager that ensures execution on the selected GPU. + +.. attribute:: numba.cuda.gpus + :noindex: +.. attribute:: numba.cuda.cudadrv.devices.gpus + +:py:data:`numba.cuda.gpus` is an instance of the ``_DeviceList`` class, from +which the current GPU context can also be retrieved: + +.. autoclass:: numba.cuda.cudadrv.devices._DeviceList + :members: current + :noindex: + + +Device UUIDs +============ + +The UUID of a device (equal to that returned by ``nvidia-smi -L``) is available +in the :attr:`uuid <numba.cuda.cudadrv.driver.Device.uuid>` attribute of a CUDA +device object. + +For example, to obtain the UUID of the current device: + +.. code-block:: python + + dev = cuda.current_context().device + # prints e.g. 
"GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643" + print(dev.uuid) + + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/examples.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/examples.rst new file mode 100644 index 0000000000000000000000000000000000000000..793d13ba2a528257162b37cc06bd9220789594b4 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/examples.rst @@ -0,0 +1,527 @@ + +======== +Examples +======== + +.. _cuda-vecadd: + +Vector Addition +=============== +This example uses Numba to create on-device arrays and a vector addition kernel; +it is a warmup for learning how to write GPU kernels using Numba. We'll begin +with some required imports: + + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py + :language: python + :caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py`` + :start-after: ex_vecadd.import.begin + :end-before: ex_vecadd.import.end + :dedent: 8 + :linenos: + +The following function is the kernel. Note that it is defined in terms of Python +variables with unspecified types. When the kernel is launched, Numba will +examine the types of the arguments that are passed at runtime and generate a +CUDA kernel specialized for them. + +Note that Numba kernels do not return values and must write any output into +arrays passed in as parameters (this is similar to the requirement that CUDA +C/C++ kernels have ``void`` return type). Here we pass in ``c`` for the results +to be written into. + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py + :language: python + :caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py`` + :start-after: ex_vecadd.kernel.begin + :end-before: ex_vecadd.kernel.end + :dedent: 8 + :linenos: + +:func:`cuda.to_device() <numba.cuda.to_device>` can be used to create device-side +copies of arrays. 
:func:`cuda.device_array_like() +` creates an uninitialized array of the same shape +and type as an existing array. Here we transfer two vectors and create an empty +vector to hold our results: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py + :language: python + :caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py`` + :start-after: ex_vecadd.allocate.begin + :end-before: ex_vecadd.allocate.end + :dedent: 8 + :linenos: + +A call to :meth:`forall() ` generates +an appropriate launch configuration with a 1D grid (see +:ref:`cuda-kernel-invocation`) for a given data size and is often the simplest +way of launching a kernel: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py + :language: python + :caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py`` + :start-after: ex_vecadd.forall.begin + :end-before: ex_vecadd.forall.end + :dedent: 8 + :linenos: + +This prints: + +.. code-block:: none + + [0.73548323 1.32061059 0.12582968 ... 1.25925809 1.49335059 1.59315414] + +One can also configure the grid manually using the subscripting syntax. The +following example launches a grid with sufficient threads to operate on every +vector element: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_vecadd.py + :language: python + :caption: from ``test_ex_vecadd`` in ``numba/cuda/tests/doc_examples/test_vecadd.py`` + :start-after: ex_vecadd.launch.begin + :end-before: ex_vecadd.launch.end + :dedent: 8 + :linenos: + +This also prints: + +.. code-block:: none + + [0.73548323 1.32061059 0.12582968 ... 1.25925809 1.49335059 1.59315414] + +.. _cuda-laplace: + +1D Heat Equation +===================== +This example solves Laplace's equation in one dimension for a certain set of initial +conditions and boundary conditions. 
A full discussion of Laplace's equation is out of +scope for this documentation, but it will suffice to say that it describes how heat +propagates through an object over time. It works by discretizing the problem in two ways: + +1. The domain is partitioned into a mesh of points that each have an individual temperature. +2. Time is partitioned into discrete intervals that are advanced forward sequentially. + +Then, the following assumption is applied: The temperature of a point after some interval +has passed is some weighted average of the temperature of the points that are directly +adjacent to it. Intuitively, if all the points in the domain are very hot +and a single point in the middle is very cold, as time passes, the hot points will cause +the cold one to heat up and the cold point will cause the surrounding hot pieces to cool +slightly. Simply put, the heat spreads throughout the object. + +We can implement this simulation using a Numba kernel. Let's start simple by assuming +we have a one dimensional object which we'll represent with an array of values. The position +of the element in the array is the position of a point within the object, and the value +of the element represents the temperature. + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_laplace.py + :language: python + :caption: from ``test_ex_laplace`` in ``numba/cuda/tests/doc_examples/test_laplace.py`` + :start-after: ex_laplace.import.begin + :end-before: ex_laplace.import.end + :dedent: 8 + :linenos: + + +Some initial setup here. Let's make one point in the center of the object very hot. + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_laplace.py + :language: python + :caption: from ``test_ex_laplace`` in ``numba/cuda/tests/doc_examples/test_laplace.py`` + :start-after: ex_laplace.allocate.begin + :end-before: ex_laplace.allocate.end + :dedent: 8 + :linenos: + +The initial state of the problem can be visualized as: + +.. 
image:: laplace_initial.svg + +In our kernel each thread will be responsible for managing the temperature update for a single element +in a loop over the desired number of timesteps. The kernel is below. Note the use of cooperative group +synchronization and the use of two buffers swapped at each iteration to avoid race conditions. See +:func:`numba.cuda.cg.this_grid() ` for details. + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_laplace.py + :language: python + :caption: from ``test_ex_laplace`` in ``numba/cuda/tests/doc_examples/test_laplace.py`` + :start-after: ex_laplace.kernel.begin + :end-before: ex_laplace.kernel.end + :dedent: 8 + :linenos: + + +Calling the kernel: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_laplace.py + :language: python + :caption: from ``test_ex_laplace`` in ``numba/cuda/tests/doc_examples/test_laplace.py`` + :start-after: ex_laplace.launch.begin + :end-before: ex_laplace.launch.end + :dedent: 8 + :linenos: + + +Plotting the final data shows an arc that is highest where +the object was hot initially and gradually sloping down to zero towards the +edges where the temperature is fixed at zero. In the limit of infinite time, +the arc will flatten out completely. + +.. image:: laplace_final.svg + +.. _cuda_reduction_shared: + +Shared Memory Reduction +======================= +Numba exposes many CUDA features, including :ref:`shared memory +`. To demonstrate shared memory, let's reimplement a +famous CUDA solution for summing a vector which works by "folding" the data up +using a successively smaller number of threads. + + +Note that this is a fairly naive implementation, and there are more efficient ways of implementing reductions +using Numba - see :ref:`cuda_montecarlo` for an example. + +.. 
literalinclude:: ../../../numba/cuda/tests/doc_examples/test_reduction.py + :language: python + :caption: from ``test_ex_reduction`` in ``numba/cuda/tests/doc_examples/test_reduction.py`` + :start-after: ex_reduction.import.begin + :end-before: ex_reduction.import.end + :dedent: 8 + :linenos: + +Let's create some one dimensional data that we'll use to demonstrate the +kernel itself: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_reduction.py + :language: python + :caption: from ``test_ex_reduction`` in ``numba/cuda/tests/doc_examples/test_reduction.py`` + :start-after: ex_reduction.allocate.begin + :end-before: ex_reduction.allocate.end + :dedent: 8 + :linenos: + + +Here is a version of the kernel implemented using Numba: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_reduction.py + :language: python + :caption: from ``test_ex_reduction`` in ``numba/cuda/tests/doc_examples/test_reduction.py`` + :start-after: ex_reduction.kernel.begin + :end-before: ex_reduction.kernel.end + :dedent: 8 + :linenos: + +We can run the kernel and verify that the same result is obtained through +summing data on the host as follows: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_reduction.py + :language: python + :caption: from ``test_ex_reduction`` in ``numba/cuda/tests/doc_examples/test_reduction.py`` + :start-after: ex_reduction.launch.begin + :end-before: ex_reduction.launch.end + :dedent: 8 + :linenos: + +This algorithm can be greatly improved upon by redesigning the inner loop +to use sequential memory accesses, and even further by using strategies that +keep more threads active and working, since in this example most threads quickly +become idle. + +.. _cuda_sessionization: + +Dividing Click Data into Sessions +================================= + + +A common problem in business analytics is that of grouping the activity of users of an online platform into +sessions, called "sessionization". 
The idea is that users generally traverse through a website and perform +various actions (clicking something, filling out a form, etc.) in discrete groups. Perhaps a customer spends +some time shopping for an item in the morning and then again at night - often the business is interested in +treating these periods as separate interactions with their service, and this creates the problem of +programmatically splitting up activity in some agreed-upon way. + +Here we'll illustrate how to write a Numba kernel to solve this problem. We'll start with data +containing two fields: let ``user_id`` represent a unique ID corresponding to an individual customer, and let +``action_time`` be a time that some unknown action was taken on the service. Right now, we'll assume there's +only one type of action, so all there is to know is when it happened. + +Our goal will be to create a new column called ``session_id``, which contains a label corresponding to a unique +session. We'll define the boundary between sessions as when there has been at least one hour between clicks. + + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_sessionize.py + :language: python + :caption: from ``test_ex_sessionize`` in ``numba/cuda/tests/doc_examples/test_sessionize.py`` + :start-after: ex_sessionize.import.begin + :end-before: ex_sessionize.import.end + :dedent: 8 + :linenos: + +Here is a solution using Numba: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_sessionize.py + :language: python + :caption: from ``test_ex_sessionize`` in ``numba/cuda/tests/doc_examples/test_sessionize.py`` + :start-after: ex_sessionize.kernel.begin + :end-before: ex_sessionize.kernel.end + :dedent: 8 + :linenos: + +Let's generate some data and try out the kernel: + +.. 
literalinclude:: ../../../numba/cuda/tests/doc_examples/test_sessionize.py + :language: python + :caption: from ``test_ex_sessionize`` in ``numba/cuda/tests/doc_examples/test_sessionize.py`` + :start-after: ex_sessionize.allocate.begin + :end-before: ex_sessionize.allocate.end + :dedent: 8 + :linenos: + +As can be seen above, the kernel successfully divided the first three datapoints from the second three for the first user ID, +and a similar pattern is seen throughout. + +.. _cuda_reuse_function: + +JIT Function CPU-GPU Compatibility +================================== + +This example demonstrates how ``numba.jit`` can be used to jit compile a function for the CPU, while at the same time making +it available for use inside CUDA kernels. This can be very useful for users that are migrating workflows from CPU to GPU as +they can directly reuse potential business logic with fewer code changes. + +Take the following example function: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py + :language: python + :caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py`` + :start-after: ex_cpu_gpu_compat.define.begin + :end-before: ex_cpu_gpu_compat.define.end + :dedent: 8 + :linenos: + +The function ``business_logic`` can be run standalone in compiled form on the CPU: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py + :language: python + :caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py`` + :start-after: ex_cpu_gpu_compat.cpurun.begin + :end-before: ex_cpu_gpu_compat.cpurun.end + :dedent: 8 + :linenos: + +It can also be directly reused threadwise inside a GPU kernel. For example one may +generate some vectors to represent ``x``, ``y``, and ``z``: + +.. 
literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py + :language: python + :caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py`` + :start-after: ex_cpu_gpu_compat.allocate.begin + :end-before: ex_cpu_gpu_compat.allocate.end + :dedent: 8 + :linenos: + +And a numba kernel referencing the decorated function: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py + :language: python + :caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py`` + :start-after: ex_cpu_gpu_compat.usegpu.begin + :end-before: ex_cpu_gpu_compat.usegpu.end + :dedent: 8 + :linenos: + +This kernel can be invoked in the normal way: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py + :language: python + :caption: from ``test_ex_cpu_gpu_compat`` in ``numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py`` + :start-after: ex_cpu_gpu_compat.launch.begin + :end-before: ex_cpu_gpu_compat.launch.end + :dedent: 8 + :linenos: + +.. _cuda_montecarlo: + +Monte Carlo Integration +======================= + +This example shows how to use Numba to approximate the value of a definite integral by rapidly generating +random numbers on the GPU. A detailed description of the mathematical mechanics of Monte Carlo integration +is out of the scope of the example, but it can briefly be described as an averaging process where the area +under the curve is approximated by taking the average of many rectangles formed by its function values. + +In addition, this example shows how to perform reductions in numba using the +:func:`cuda.reduce() ` API. + +.. 
literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py + :language: python + :caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py`` + :start-after: ex_montecarlo.import.begin + :end-before: ex_montecarlo.import.end + :dedent: 8 + :linenos: + +Let's create a variable to control the number of samples drawn: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py + :language: python + :caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py`` + :start-after: ex_montecarlo.define.begin + :end-before: ex_montecarlo.define.end + :dedent: 8 + :linenos: + + +The following kernel implements the main integration routine: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py + :language: python + :caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py`` + :start-after: ex_montecarlo.kernel.begin + :end-before: ex_montecarlo.kernel.end + :dedent: 8 + :linenos: + +This convenience function calls the kernel and performs some +preprocessing and post-processing steps. Note the use of Numba's reduction API to +take the sum of the array and compute the final result: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py + :language: python + :caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py`` + :start-after: ex_montecarlo.callfunc.begin + :end-before: ex_montecarlo.callfunc.end + :dedent: 8 + :linenos: + + +We can now use ``mc_integrate`` to compute the definite integral of this function between +two limits: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_montecarlo.py + :language: python + :caption: from ``test_ex_montecarlo`` in ``numba/cuda/tests/doc_examples/test_montecarlo.py`` + :start-after: ex_montecarlo.launch.begin + :end-before: ex_montecarlo.launch.end + :dedent: 8 + :linenos: + + +.. 
_cuda-matmul: + +Matrix multiplication +===================== +First, import the modules needed for this example: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py + :language: python + :caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py`` + :start-after: magictoken.ex_import.begin + :end-before: magictoken.ex_import.end + :dedent: 8 + :linenos: + +Here is a naïve implementation of matrix multiplication using a CUDA kernel: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py + :language: python + :caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py`` + :start-after: magictoken.ex_matmul.begin + :end-before: magictoken.ex_matmul.end + :dedent: 8 + :linenos: + +An example usage of this function is as follows: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py + :language: python + :caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py`` + :start-after: magictoken.ex_run_matmul.begin + :end-before: magictoken.ex_run_matmul.end + :dedent: 8 + :linenos: + +This implementation is straightforward and intuitive but performs poorly, +because the same matrix elements will be loaded multiple times from device +memory, which is slow (some devices may have transparent data caches, but +they may not be large enough to hold the entire inputs at once). + +It will be faster if we use a blocked algorithm to reduce accesses to the +device memory. CUDA provides a fast :ref:`shared memory ` +for threads in a block to cooperatively compute on a task. The following +implements a faster version of the square matrix multiplication using shared +memory: + +.. 
literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py + :language: python + :caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py`` + :start-after: magictoken.ex_fast_matmul.begin + :end-before: magictoken.ex_fast_matmul.end + :dedent: 8 + :linenos: + + +Because the shared memory is a limited resource, the code preloads a small +block at a time from the input arrays. Then, it calls +:func:`~numba.cuda.syncthreads` to wait until all threads have finished +preloading and before doing the computation on the shared memory. +It synchronizes again after the computation to ensure all threads +have finished with the data in shared memory before overwriting it +in the next loop iteration. + +An example usage of the ``fast_matmul`` function is as follows: + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py + :language: python + :caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py`` + :start-after: magictoken.ex_run_fast_matmul.begin + :end-before: magictoken.ex_run_fast_matmul.end + :dedent: 8 + :linenos: + + +This passes a :ref:`CUDA memory check test `, which +can help with debugging. Running the code above produces the following output: + +.. code-block:: none + + $ python fast_matmul.py + [[ 6. 6. 6. 6.] + [22. 22. 22. 22.] + [38. 38. 38. 38.] + [54. 54. 54. 54.]] + [[ 6. 6. 6. 6.] + [22. 22. 22. 22.] + [38. 38. 38. 38.] + [54. 54. 54. 54.]] + +.. note:: For high performance matrix multiplication in CUDA, see also the `CuPy implementation `_. + +The approach outlined here generalizes to non-square matrix multiplication as +follows by adjusting the ``blockspergrid`` variable: + +Again, here is an example usage: + +.. 
literalinclude:: ../../../numba/cuda/tests/doc_examples/test_matmul.py + :language: python + :caption: from ``test_ex_matmul`` in ``numba/cuda/tests/doc_examples/test_matmul.py`` + :start-after: magictoken.ex_run_nonsquare.begin + :end-before: magictoken.ex_run_nonsquare.end + :dedent: 8 + :linenos: + +and the corresponding output: + +.. code-block:: none + + $ python nonsquare_matmul.py + [[ 253. 253. 253. 253. 253. 253. 253.] + [ 782. 782. 782. 782. 782. 782. 782.] + [1311. 1311. 1311. 1311. 1311. 1311. 1311.] + [1840. 1840. 1840. 1840. 1840. 1840. 1840.] + [2369. 2369. 2369. 2369. 2369. 2369. 2369.]] + [[ 253. 253. 253. 253. 253. 253. 253.] + [ 782. 782. 782. 782. 782. 782. 782.] + [1311. 1311. 1311. 1311. 1311. 1311. 1311.] + [1840. 1840. 1840. 1840. 1840. 1840. 1840.] + [2369. 2369. 2369. 2369. 2369. 2369. 2369.]] diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/external-memory.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/external-memory.rst new file mode 100644 index 0000000000000000000000000000000000000000..28a8f59f0a96522c47657179d0fb1c7510fa188c --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/external-memory.rst @@ -0,0 +1,320 @@ +.. _cuda-emm-plugin: + +================================================= +External Memory Management (EMM) Plugin interface +================================================= + +The :ref:`CUDA Array Interface ` enables sharing of data +between different Python libraries that access CUDA devices. However, each +library manages its own memory distinctly from the others. For example: + +- By default, Numba allocates memory on CUDA devices by interacting with the + CUDA driver API to call functions such as ``cuMemAlloc`` and ``cuMemFree``, + which is suitable for many use cases. +- The RAPIDS libraries (cuDF, cuML, etc.) use the `RAPIDS Memory Manager (RMM) + `_ for allocating device memory. 
+- `CuPy `_ includes a `memory pool implementation + `_ for both + device and pinned memory. + +When multiple CUDA-aware libraries are used together, it may be preferable for +Numba to defer to another library for memory management. The EMM Plugin +interface facilitates this, by enabling Numba to use another CUDA-aware library +for all allocations and deallocations. + +An EMM Plugin is used to facilitate the use of an external library for memory +management. An EMM Plugin can be a part of an external library, or could be +implemented as a separate library. + + +Overview of External Memory Management +====================================== + +When an EMM Plugin is in use (see :ref:`setting-emm-plugin`), Numba will make +memory allocations and deallocations through the Plugin. It will never directly call +functions such as ``cuMemAlloc``, ``cuMemFree``, etc. + +EMM Plugins always take responsibility for the management of device memory. +However, not all CUDA-aware libraries also support managing host memory, so a +facility for Numba to continue the management of host memory whilst ceding +control of device memory to the EMM is provided (see +:ref:`host-only-cuda-memory-manager`). + + +Effects on Deallocation Strategies +---------------------------------- + +Numba's internal :ref:`deallocation-behavior` is designed to increase efficiency +by deferring deallocations until a significant quantity are pending. It also +provides a mechanism for preventing deallocations entirely during critical +sections, using the :func:`~numba.cuda.defer_cleanup` context manager. + +When an EMM Plugin is in use, the deallocation strategy is implemented by the +EMM, and Numba's internal deallocation mechanism is not used. The EMM +Plugin could implement: + +- A similar strategy to the Numba deallocation behaviour, or +- Something more appropriate to the plugin - for example, deallocated memory + might immediately be returned to a memory pool. 
+ +The ``defer_cleanup`` context manager may behave differently with an EMM Plugin +- an EMM Plugin should be accompanied by documentation of the behaviour of the +``defer_cleanup`` context manager when it is in use. For example, a pool +allocator could always immediately return memory to a pool even when the +context manager is in use, but could choose not to free empty pools until +``defer_cleanup`` is not in use. + + +Management of other objects +--------------------------- + +In addition to memory, Numba manages the allocation and deallocation of +:ref:`events `, :ref:`streams `, and modules (a module is a +compiled object, which is generated from ``@cuda.jit``\ -ted functions). The +management of events, streams, and modules is unchanged by the use of an EMM +Plugin. + + +Asynchronous allocation and deallocation +---------------------------------------- + +The present EMM Plugin interface does not provide support for asynchronous +allocation and deallocation. This may be added to a future version of the +interface. + + +Implementing an EMM Plugin +========================== + +An EMM Plugin is implemented by deriving from +:class:`~numba.cuda.BaseCUDAMemoryManager`. A summary of considerations for the +implementation follows: + +- Numba instantiates one instance of the EMM Plugin class per context. The + context that owns an EMM Plugin object is accessible through ``self.context``, + if required. +- The EMM Plugin is transparent to any code that uses Numba - all its methods + are invoked by Numba, and never need to be called by code that uses Numba. +- The allocation methods ``memalloc``, ``memhostalloc``, and ``mempin``, should + use the underlying library to allocate and/or pin device or host memory, and + construct an instance of a :ref:`memory pointer ` + representing the memory to return back to Numba. These methods are always + called when the current CUDA context is the context that owns the EMM Plugin + instance. 
+- The ``initialize`` method is called by Numba prior to the first use of the EMM + Plugin object for a context. This method should do anything required to + prepare the underlying library for allocations in the current context. This + method may be called multiple times, and must not invalidate previous state + when it is called. +- The ``reset`` method is called when all allocations in the context are to be + cleaned up. It may be called even prior to ``initialize``, and an EMM Plugin + implementation needs to guard against this. +- To support inter-GPU communication, the ``get_ipc_handle`` method should + provide an :class:`~numba.cuda.IpcHandle` for a given + :class:`~numba.cuda.MemoryPointer` instance. This method is part of the EMM + interface (rather than being handled within Numba) because the base address of + the allocation is only known by the underlying library. Closing an IPC handle + is handled internally within Numba. +- It is optional to provide memory info from the ``get_memory_info`` method, which + provides a count of the total and free memory on the device for the context. + It is preferable to implement the method, but this may not be practical for + all allocators. If memory info is not provided, this method should raise a + :class:`RuntimeError`. +- The ``defer_cleanup`` method should return a context manager that ensures that + expensive cleanup operations are avoided whilst it is active. The nuances of + this will vary between plugins, so the plugin documentation should include an + explanation of how deferring cleanup affects deallocations, and performance in + general. +- The ``interface_version`` property is used to ensure that the plugin version + matches the interface provided by the version of Numba. At present, this + should always be 1. + +Full documentation for the base class follows: + +.. 
autoclass:: numba.cuda.BaseCUDAMemoryManager + :members: memalloc, memhostalloc, mempin, initialize, get_ipc_handle, + get_memory_info, reset, defer_cleanup, interface_version + :member-order: bysource + + +.. _host-only-cuda-memory-manager: + +The Host-Only CUDA Memory Manager +--------------------------------- + +Some external memory managers will support management of on-device memory but +not host memory. For implementing EMM Plugins using one of these memory +managers, a partial implementation of a plugin that implements host-side +allocation and pinning is provided. To use it, derive from +:class:`~numba.cuda.HostOnlyCUDAMemoryManager` instead of +:class:`~numba.cuda.BaseCUDAMemoryManager`. Guidelines for using this class +are: + +- The host-only memory manager implements ``memhostalloc`` and ``mempin`` - the + EMM Plugin should still implement ``memalloc``. +- If ``reset`` is overridden, it must also call ``super().reset()`` to allow the + host allocations to be cleaned up. +- If ``defer_cleanup`` is overridden, it must hold an active context manager + from ``super().defer_cleanup()`` to ensure that host-side cleanup is also + deferred. + +Documentation for the methods of :class:`~numba.cuda.HostOnlyCUDAMemoryManager` +follows: + +.. autoclass:: numba.cuda.HostOnlyCUDAMemoryManager + :members: memhostalloc, mempin, reset, defer_cleanup + :member-order: bysource + + +The IPC Handle Mixin +-------------------- + +An implementation of the ``get_ipc_handle()`` function is provided in the +``GetIpcHandleMixin`` class. This uses the driver API to determine the base +address of an allocation for opening an IPC handle. If this implementation is +appropriate for an EMM plugin, it can be added by mixing in the +``GetIpcHandleMixin`` class: + +.. 
autoclass:: numba.cuda.GetIpcHandleMixin + :members: get_ipc_handle + + +Classes and structures of returned objects +========================================== + +This section provides an overview of the classes and structures that need to be +constructed by an EMM Plugin. + +.. _memory-pointers: + +Memory Pointers +--------------- + +EMM Plugins should construct memory pointer instances that represent their +allocations, for return to Numba. The appropriate memory pointer class to use in +each method is: + +- :class:`~numba.cuda.MemoryPointer`: returned from ``memalloc`` +- :class:`~numba.cuda.MappedMemory`: returned from ``memhostalloc`` or + ``mempin`` when the host memory is mapped into the device memory space. +- :class:`~numba.cuda.PinnedMemory`: returned from ``memhostalloc`` or ``mempin`` + when the host memory is not mapped into the device memory space. + +Memory pointers can take a finalizer, which is a function that is called when +the buffer is no longer needed. Usually the finalizer will make a call to the +memory management library (either internal to Numba, or external if allocated +by an EMM Plugin) to inform it that the memory is no longer required, and that +it could potentially be freed and/or unpinned. The memory manager may choose to +defer actually cleaning up the memory to any later time after the finalizer +runs - it is not required to free the buffer immediately. + +Documentation for the memory pointer classes follows. + +.. autoclass:: numba.cuda.MemoryPointer + +The ``AutoFreePointer`` class need not be used directly, but is documented here +as it is subclassed by :class:`numba.cuda.MappedMemory`: + +.. autoclass:: numba.cuda.cudadrv.driver.AutoFreePointer + +.. autoclass:: numba.cuda.MappedMemory + +.. 
autoclass:: numba.cuda.PinnedMemory + + +Memory Info +----------- + +If an implementation of +:meth:`~numba.cuda.BaseCUDAMemoryManager.get_memory_info` is to provide a +result, then it should return an instance of the ``MemoryInfo`` named tuple: + +.. autoclass:: numba.cuda.MemoryInfo + + +IPC +--- + +An instance of ``IpcHandle`` is required to be returned from an implementation +of :meth:`~numba.cuda.BaseCUDAMemoryManager.get_ipc_handle`: + +.. autoclass:: numba.cuda.IpcHandle + +Guidance for constructing an IPC handle in the context of implementing an EMM +Plugin: + +- The ``memory`` parameter passed to the ``get_ipc_handle`` method of an EMM + Plugin can be passed as the ``base`` parameter. +- A suitable type for the ``handle`` can be constructed as ``ctypes.c_byte * + 64``. The data for ``handle`` must be populated using a method for obtaining a + CUDA IPC handle appropriate to the underlying library. +- ``size`` should match the size of the original allocation, which can be + obtained with ``memory.size`` in ``get_ipc_handle``. +- An appropriate value for ``source_info`` can be created by calling + ``self.context.device.get_device_identity()``. +- If the underlying memory does not point to the base of an allocation returned + by the CUDA driver or runtime API (e.g. if a pool allocator is in use) then + the ``offset`` from the base must be provided. + + +.. _setting-emm-plugin: + +Setting the EMM Plugin +====================== + +By default, Numba uses its internal memory management - if an EMM Plugin is to +be used, it must be configured. There are two mechanisms for configuring the use +of an EMM Plugin: an environment variable, and a function. + + +Environment variable +-------------------- + +A module name can be provided in the environment variable, +``NUMBA_CUDA_MEMORY_MANAGER``. If this environment variable is set, Numba will +attempt to import the module, and use its ``_numba_memory_manager`` global +variable as the memory manager class. 
This is primarily useful for running the +Numba test suite with an EMM Plugin, e.g.: + +.. code:: + + $ NUMBA_CUDA_MEMORY_MANAGER=rmm python -m numba.runtests numba.cuda.tests + + +Function +-------- + +The :func:`~numba.cuda.set_memory_manager` function can be used to set the +memory manager at runtime. This should be called prior to the initialization of +any contexts, as EMM Plugin instances are instantiated along with contexts. + +.. autofunction:: numba.cuda.set_memory_manager + + +Resetting the memory manager +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +It is recommended that the memory manager is set once prior to using any CUDA +functionality, and left unchanged for the remainder of execution. It is possible +to set the memory manager multiple times, noting the following: + +* At the time of their creation, contexts are bound to an instance of a memory + manager for their lifetime. +* Changing the memory manager will have no effect on existing contexts - only + contexts created after the memory manager was updated will use instances of + the new memory manager. +* :func:`numba.cuda.close` can be used to destroy contexts after setting the + memory manager so that they get re-created with the new memory manager. + + - This will invalidate any arrays, streams, events, and modules owned by the + context. + - Attempting to use invalid arrays, streams, or events will likely fail with + an exception being raised due to a ``CUDA_ERROR_INVALID_CONTEXT`` or + ``CUDA_ERROR_CONTEXT_IS_DESTROYED`` return code from a Driver API function. + - Attempting to use an invalid module will result in similar, or in some + cases a segmentation fault / access violation. + +.. note:: The invalidation of modules means that all functions compiled with + ``@cuda.jit`` prior to context destruction will need to be + redefined, as the code underlying them will also have been unloaded + from the GPU. 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/faq.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/faq.rst new file mode 100644 index 0000000000000000000000000000000000000000..b4392e6ab0497773c55aabf2ae1c2b3b109268ed --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/faq.rst @@ -0,0 +1,20 @@ + +.. _cudafaq: + +================================================= +CUDA Frequently Asked Questions +================================================= + +nvprof reports "No kernels were profiled" +----------------------------------------- + +When using the ``nvprof`` tool to profile Numba jitted code for the CUDA +target, the output contains ``No kernels were profiled`` but there are clearly +running kernels present, what is going on? + +This is quite likely due to the profiling data not being flushed on program +exit, see the `NVIDIA CUDA documentation +`_ for +details. To fix this simply add a call to ``numba.cuda.profile_stop()`` prior +to the exit point in your program (or wherever you want to stop profiling). +For more on CUDA profiling support in Numba, see :ref:`cuda-profiling`. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/fastmath.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/fastmath.rst new file mode 100644 index 0000000000000000000000000000000000000000..fb9de10ea987396ee395e42d6c43355b3d535c53 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/fastmath.rst @@ -0,0 +1,36 @@ + +.. _cuda-fast-math: + +CUDA Fast Math +============== + +As noted in :ref:`fast-math`, for certain classes of applications that utilize +floating point, strict IEEE-754 conformance is not required. For this subset of +applications, performance speedups may be possible. + +The CUDA target implements :ref:`fast-math` behavior with two differences. 
+ +* First, the ``fastmath`` argument to the :func:`@jit decorator + ` is limited to the values ``True`` and ``False``. + When ``True``, the following optimizations are enabled: + + - Flushing of denormals to zero. + - Use of a fast approximation to the square root function. + - Use of a fast approximation to the division operation. + - Contraction of multiply and add operations into single fused multiply-add + operations. + + See the `documentation for nvvmCompileProgram `_ for more details of these optimizations. + +* Secondly, calls to a subset of math module functions on ``float32`` operands + will be implemented using fast approximate implementations from the libdevice + library. + + - :func:`math.cos`: Implemented using `__nv_fast_cosf `_. + - :func:`math.sin`: Implemented using `__nv_fast_sinf `_. + - :func:`math.tan`: Implemented using `__nv_fast_tanf `_. + - :func:`math.exp`: Implemented using `__nv_fast_expf `_. + - :func:`math.log2`: Implemented using `__nv_fast_log2f `_. + - :func:`math.log10`: Implemented using `__nv_fast_log10f `_. + - :func:`math.log`: Implemented using `__nv_fast_logf `_. + - :func:`math.pow`: Implemented using `__nv_fast_powf `_. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/index.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..686b4ad0b77ecf2a8ee30050280f9949a9b27df9 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/index.rst @@ -0,0 +1,29 @@ + +.. _cuda-index: + +Numba for CUDA GPUs +=================== + +.. 
toctree:: + + overview.rst + kernels.rst + memory.rst + device-functions.rst + cudapysupported.rst + fastmath.rst + intrinsics.rst + cooperative_groups.rst + random.rst + device-management.rst + examples.rst + simulator.rst + reduction.rst + ufunc.rst + ipc.rst + cuda_array_interface.rst + external-memory.rst + bindings.rst + cuda_ffi.rst + caching.rst + faq.rst diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/intrinsics.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/intrinsics.rst new file mode 100644 index 0000000000000000000000000000000000000000..521c1d918f73a2a4313c234f70a51220e5054521 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/intrinsics.rst @@ -0,0 +1,58 @@ + +Supported Atomic Operations +=========================== + +Numba provides access to some of the atomic operations supported in CUDA. Those +that are presently implemented are as follows: + +.. automodule:: numba.cuda + :members: atomic + :noindex: + +Example +''''''' + +The following code demonstrates the use of :class:`numba.cuda.atomic.max` to +find the maximum value in an array. Note that this is not the most efficient way +of finding a maximum in this case, but that it serves as an example:: + + from numba import cuda + import numpy as np + + @cuda.jit + def max_example(result, values): + """Find the maximum value in values and store in result[0]""" + tid = cuda.threadIdx.x + bid = cuda.blockIdx.x + bdim = cuda.blockDim.x + i = (bid * bdim) + tid + cuda.atomic.max(result, 0, values[i]) + + + arr = np.random.rand(16384) + result = np.zeros(1, dtype=np.float64) + + max_example[256,64](result, arr) + print(result[0]) # Found using cuda.atomic.max + print(max(arr)) # Print max(arr) for comparison (should be equal!) + + +Multiple dimension arrays are supported by using a tuple of ints for the index:: + + + @cuda.jit + def max_example_3d(result, values): + """ + Find the maximum value in values and store in result[0]. 
+ Both result and values are 3d arrays. + """ + i, j, k = cuda.grid(3) + # Atomically store to result[0,1,2] from values[i, j, k] + cuda.atomic.max(result, (0, 1, 2), values[i, j, k]) + + arr = np.random.rand(1000).reshape(10,10,10) + result = np.zeros((3, 3, 3), dtype=np.float64) + max_example_3d[(2, 2, 2), (5, 5, 5)](result, arr) + print(result[0, 1, 2], '==', np.max(arr)) + + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/ipc.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/ipc.rst new file mode 100644 index 0000000000000000000000000000000000000000..ce0f508e599732debed056d02e9c7ed66d374797 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/ipc.rst @@ -0,0 +1,36 @@ +=================== +Sharing CUDA Memory +=================== + +.. _cuda-ipc-memory: + +Sharing between process +======================= + +Sharing between processes is implemented using the Legacy CUDA IPC API +(functions whose names begin with ``cuIpc``), and is supported only on Linux. + + +Export device array to another process +-------------------------------------- + +A device array can be shared with another process in the same machine using +the CUDA IPC API. To do so, use the ``.get_ipc_handle()`` method on the device +array to get a ``IpcArrayHandle`` object, which can be transferred to another +process. + + +.. automethod:: numba.cuda.cudadrv.devicearray.DeviceNDArray.get_ipc_handle + :noindex: + +.. autoclass:: numba.cuda.cudadrv.devicearray.IpcArrayHandle + :members: open, close + + +Import IPC memory from another process +-------------------------------------- + +The following function is used to open IPC handle from another process +as a device array. + +.. 
automethod:: numba.cuda.open_ipc_array diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/kernels.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/kernels.rst new file mode 100644 index 0000000000000000000000000000000000000000..b4af2ccf8b99a7dabb48d2ffd44cf985ee7dce5c --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/kernels.rst @@ -0,0 +1,233 @@ + +==================== +Writing CUDA Kernels +==================== + +Introduction +============ + +CUDA has an execution model unlike the traditional sequential model used +for programming CPUs. In CUDA, the code you write will be executed by +multiple threads at once (often hundreds or thousands). Your solution will +be modeled by defining a thread hierarchy of *grid*, *blocks* and *threads*. + +Numba's CUDA support exposes facilities to declare and manage this +hierarchy of threads. The facilities are largely similar to those +exposed by NVidia's CUDA C language. + +Numba also exposes three kinds of GPU memory: global :ref:`device memory +` (the large, relatively slow +off-chip memory that's connected to the GPU itself), on-chip +:ref:`shared memory ` and :ref:`local memory `. +For all but the simplest algorithms, it is important that you carefully +consider how to use and access memory in order to minimize bandwidth +requirements and contention. + + +Kernel declaration +================== + +A *kernel function* is a GPU function that is meant to be called from CPU +code (*). It gives it two fundamental characteristics: + +* kernels cannot explicitly return a value; all result data must be written + to an array passed to the function (if computing a scalar, you will + probably pass a one-element array); + +* kernels explicitly declare their thread hierarchy when called: i.e. 
+ the number of thread blocks and the number of threads per block + (note that while a kernel is compiled once, it can be called multiple + times with different block sizes or grid sizes). + +At first sight, writing a CUDA kernel with Numba looks very much like +writing a :term:`JIT function` for the CPU:: + + @cuda.jit + def increment_by_one(an_array): + """ + Increment all array elements by one. + """ + # code elided here; read further for different implementations + +(*) Note: newer CUDA devices support device-side kernel launching; this feature +is called *dynamic parallelism*, but Numba does not currently support it. + + +.. _cuda-kernel-invocation: + +Kernel invocation +================= + +A kernel is typically launched in the following way:: + + threadsperblock = 32 + blockspergrid = (an_array.size + (threadsperblock - 1)) // threadsperblock + increment_by_one[blockspergrid, threadsperblock](an_array) + +We notice two steps here: + +* Instantiate the kernel proper, by specifying a number of blocks + (or "blocks per grid"), and a number of threads per block. The product + of the two will give the total number of threads launched. Kernel + instantiation is done by taking the compiled kernel function + (here ``increment_by_one``) and indexing it with a tuple of integers. + +* Running the kernel, by passing it the input array (and any separate + output arrays if necessary). Kernels run asynchronously: launches queue their + execution on the device and then return immediately. You can use + :func:`cuda.synchronize() ` to wait for all previous + kernel launches to finish executing. + +.. note:: Passing an array that resides in host memory will implicitly cause a + copy back to the host, which will be synchronous. In this case, the kernel + launch will not return until the data is copied back, and therefore appears + to execute synchronously. 
+ +Choosing the block size +----------------------- + +It might seem curious to have a two-level hierarchy when declaring the +number of threads needed by a kernel. The block size (i.e. number of +threads per block) is often crucial: + +* On the software side, the block size determines how many threads + share a given area of :ref:`shared memory `. + +* On the hardware side, the block size must be large enough for full + occupation of execution units; recommendations can be found in the + `CUDA C Programming Guide`_. + +Multi-dimensional blocks and grids +---------------------------------- + +To help deal with multi-dimensional arrays, CUDA allows you to specify +multi-dimensional blocks and grids. In the example above, you could +make ``blockspergrid`` and ``threadsperblock`` tuples of one, two +or three integers. Compared to 1D declarations of equivalent sizes, +this doesn't change anything to the efficiency or behaviour of generated +code, but can help you write your algorithms in a more natural way. + + +Thread positioning +================== + +When running a kernel, the kernel function's code is executed by every +thread once. It therefore has to know which thread it is in, in order +to know which array element(s) it is responsible for (complex algorithms +may define more complex responsibilities, but the underlying principle +is the same). + +One way is for the thread to determine its position in the grid and block +and manually compute the corresponding array position:: + + @cuda.jit + def increment_by_one(an_array): + # Thread id in a 1D block + tx = cuda.threadIdx.x + # Block id in a 1D grid + ty = cuda.blockIdx.x + # Block width, i.e. number of threads per block + bw = cuda.blockDim.x + # Compute flattened index inside the array + pos = tx + ty * bw + if pos < an_array.size: # Check array boundaries + an_array[pos] += 1 + +.. 
note:: Unless you are sure the block size and grid size is a divisor + of your array size, you **must** check boundaries as shown above. + +:attr:`.threadIdx`, :attr:`.blockIdx`, :attr:`.blockDim` and :attr:`.gridDim` +are special objects provided by the CUDA backend for the sole purpose of +knowing the geometry of the thread hierarchy and the position of the +current thread within that geometry. + +These objects can be 1D, 2D or 3D, depending on how the kernel was +:ref:`invoked `. To access the value at each +dimension, use the ``x``, ``y`` and ``z`` attributes of these objects, +respectively. + +.. attribute:: numba.cuda.threadIdx + :noindex: + + The thread indices in the current thread block. For 1D blocks, the index + (given by the ``x`` attribute) is an integer spanning the range from 0 + inclusive to :attr:`numba.cuda.blockDim` exclusive. A similar rule + exists for each dimension when more than one dimension is used. + +.. attribute:: numba.cuda.blockDim + :noindex: + + The shape of the block of threads, as declared when instantiating the + kernel. This value is the same for all threads in a given kernel, even + if they belong to different blocks (i.e. each block is "full"). + +.. attribute:: numba.cuda.blockIdx + :noindex: + + The block indices in the grid of threads launched a kernel. For a 1D grid, + the index (given by the ``x`` attribute) is an integer spanning the range + from 0 inclusive to :attr:`numba.cuda.gridDim` exclusive. A similar rule + exists for each dimension when more than one dimension is used. + +.. attribute:: numba.cuda.gridDim + :noindex: + + The shape of the grid of blocks, i.e. the total number of blocks launched + by this kernel invocation, as declared when instantiating the kernel. + +Absolute positions +------------------ + +Simple algorithms will tend to always use thread indices in the +same way as shown in the example above. Numba provides additional facilities +to automate such calculations: + +.. 
function:: numba.cuda.grid(ndim) + :noindex: + + Return the absolute position of the current thread in the entire + grid of blocks. *ndim* should correspond to the number of dimensions + declared when instantiating the kernel. If *ndim* is 1, a single integer + is returned. If *ndim* is 2 or 3, a tuple of the given number of + integers is returned. + +.. function:: numba.cuda.gridsize(ndim) + :noindex: + + Return the absolute size (or shape) in threads of the entire grid of + blocks. *ndim* has the same meaning as in :func:`.grid` above. + +With these functions, the incrementation example can become:: + + @cuda.jit + def increment_by_one(an_array): + pos = cuda.grid(1) + if pos < an_array.size: + an_array[pos] += 1 + +The same example for a 2D array and grid of threads would be:: + + @cuda.jit + def increment_a_2D_array(an_array): + x, y = cuda.grid(2) + if x < an_array.shape[0] and y < an_array.shape[1]: + an_array[x, y] += 1 + +Note the grid computation when instantiating the kernel must still be +done manually, for example:: + + threadsperblock = (16, 16) + blockspergrid_x = math.ceil(an_array.shape[0] / threadsperblock[0]) + blockspergrid_y = math.ceil(an_array.shape[1] / threadsperblock[1]) + blockspergrid = (blockspergrid_x, blockspergrid_y) + increment_a_2D_array[blockspergrid, threadsperblock](an_array) + + +Further Reading +---------------- + +Please refer to the `CUDA C Programming Guide`_ for a detailed discussion +of CUDA programming. + + +.. 
_CUDA C Programming Guide: http://docs.nvidia.com/cuda/cuda-c-programming-guide diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/laplace_final.svg b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/laplace_final.svg new file mode 100644 index 0000000000000000000000000000000000000000..4f3b197fb0c1208dfa568ae9ca8984cb8da7763a --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/laplace_final.svg @@ -0,0 +1,1953 @@ + + + + + + + + 2022-04-18T06:58:19.244680 + image/svg+xml + + + Matplotlib v3.5.1, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/laplace_initial.svg b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/laplace_initial.svg new file mode 100644 index 0000000000000000000000000000000000000000..dbede3687dd2873bc00f4dfd983f2dbc0092454e --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/laplace_initial.svg @@ -0,0 +1,1838 @@ + + + + + + + + 2022-04-18T06:58:18.768147 + image/svg+xml + + + Matplotlib v3.5.1, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/memory.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/memory.rst new file mode 100644 index 0000000000000000000000000000000000000000..fa6fe6a970113cc4791deb07330cb1ee36588532 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/memory.rst @@ -0,0 +1,257 @@ +================= +Memory management +================= + +.. _cuda-device-memory: + +Data transfer +============= + +Even though Numba can automatically transfer NumPy arrays to the device, +it can only do so conservatively by always transferring device memory back to +the host when a kernel finishes. To avoid the unnecessary transfer for +read-only arrays, you can use the following APIs to manually control the +transfer: + +.. autofunction:: numba.cuda.device_array + :noindex: +.. autofunction:: numba.cuda.device_array_like + :noindex: +.. autofunction:: numba.cuda.to_device + :noindex: + +In addition to the device arrays, Numba can consume any object that implements +:ref:`cuda array interface `. These objects also can be +manually converted into a Numba device array by creating a view of the GPU +buffer using the following APIs: + +.. autofunction:: numba.cuda.as_cuda_array + :noindex: +.. autofunction:: numba.cuda.is_cuda_array + :noindex: + + +Device arrays +------------- + +Device array references have the following methods. These methods are to be +called in host code, not within CUDA-jitted functions. + +.. 
autoclass:: numba.cuda.cudadrv.devicearray.DeviceNDArray + :members: copy_to_host, is_c_contiguous, is_f_contiguous, ravel, reshape + :noindex: + + +.. note:: DeviceNDArray defines the :ref:`cuda array interface `. + + +Pinned memory +============= + +.. autofunction:: numba.cuda.pinned + :noindex: +.. autofunction:: numba.cuda.pinned_array + :noindex: +.. autofunction:: numba.cuda.pinned_array_like + :noindex: + + +Mapped memory +============= + +.. autofunction:: numba.cuda.mapped + :noindex: +.. autofunction:: numba.cuda.mapped_array + :noindex: +.. autofunction:: numba.cuda.mapped_array_like + :noindex: + + + +Managed memory +============== + +.. autofunction:: numba.cuda.managed_array + :noindex: + + +Streams +======= + +Streams can be passed to functions that accept them (e.g. copies between the +host and device) and into kernel launch configurations so that the operations +are executed asynchronously. + +.. autofunction:: numba.cuda.stream + :noindex: + +.. autofunction:: numba.cuda.default_stream + :noindex: + +.. autofunction:: numba.cuda.legacy_default_stream + :noindex: + +.. autofunction:: numba.cuda.per_thread_default_stream + :noindex: + +.. autofunction:: numba.cuda.external_stream + :noindex: + +CUDA streams have the following methods: + +.. autoclass:: numba.cuda.cudadrv.driver.Stream + :members: synchronize, auto_synchronize + :noindex: + +.. _cuda-shared-memory: + +Shared memory and thread synchronization +======================================== + +A limited amount of shared memory can be allocated on the device to speed +up access to data, when necessary. That memory will be shared (i.e. both +readable and writable) amongst all threads belonging to a given block +and has faster access times than regular device memory. It also allows +threads to cooperate on a given solution. You can think of it as a +manually-managed data cache. + +The memory is allocated once for the duration of the kernel, unlike +traditional dynamic memory management. + +.. 
function:: numba.cuda.shared.array(shape, type) + :noindex: + + Allocate a shared array of the given *shape* and *type* on the device. + This function must be called on the device (i.e. from a kernel or + device function). *shape* is either an integer or a tuple of integers + representing the array's dimensions and must be a simple constant + expression. A "simple constant expression" includes, but is not limited to: + + #. A literal (e.g. ``10``) + #. A local variable whose right-hand side is a literal or a simple constant + expression (e.g. ``shape``, where ``shape`` is defined earlier in the function + as ``shape = 10``) + #. A global variable that is defined in the jitted function's globals by the time + of compilation (e.g. ``shape``, where ``shape`` is defined using any expression + at global scope). + + The definition must result in a Python ``int`` (i.e. not a NumPy scalar or other + scalar / integer-like type). *type* is a :ref:`Numba type ` of the + elements needing to be stored in the array. The returned array-like object can be + read and written to like any normal device array (e.g. through indexing). + + A common pattern is to have each thread populate one element in the + shared array and then wait for all threads to finish using :func:`.syncthreads`. + + +.. function:: numba.cuda.syncthreads() + :noindex: + + Synchronize all threads in the same thread block. This function + implements the same pattern as `barriers `_ + in traditional multi-threaded programming: this function waits + until all threads in the block call it, at which point it returns + control to all its callers. + +.. seealso:: + :ref:`Matrix multiplication example `. + +.. _cuda-local-memory: + +Local memory +============ + +Local memory is an area of memory private to each thread. Using local +memory helps allocate some scratchpad area when scalar local variables +are not enough. 
The memory is allocated once for the duration of the kernel, +unlike traditional dynamic memory management. + +.. function:: numba.cuda.local.array(shape, type) + :noindex: + + Allocate a local array of the given *shape* and *type* on the device. + *shape* is either an integer or a tuple of integers representing the array's + dimensions and must be a simple constant expression. A "simple constant expression" + includes, but is not limited to: + + #. A literal (e.g. ``10``) + #. A local variable whose right-hand side is a literal or a simple constant + expression (e.g. ``shape``, where ``shape`` is defined earlier in the function + as ``shape = 10``) + #. A global variable that is defined in the jitted function's globals by the time + of compilation (e.g. ``shape``, where ``shape`` is defined using any expression + at global scope). + + The definition must result in a Python ``int`` (i.e. not a NumPy scalar or other + scalar / integer-like type). *type* is a :ref:`Numba type ` + of the elements needing to be stored in the array. The array is private to + the current thread. An array-like object is returned which can be read and + written to like any standard array (e.g. through indexing). + + .. seealso:: The Local Memory section of `Device Memory Accesses + `_ + in the CUDA programming guide. + +Constant memory +=============== + +Constant memory is an area of memory that is read only, cached and off-chip, it +is accessible by all threads and is host allocated. A method of +creating an array in constant memory is through the use of: + +.. function:: numba.cuda.const.array_like(arr) + :noindex: + + Allocate and make accessible an array in constant memory based on array-like + *arr*. + + +.. _deallocation-behavior: + +Deallocation Behavior +===================== + +This section describes the deallocation behaviour of Numba's internal memory +management. 
If an External Memory Management Plugin is in use (see +:ref:`cuda-emm-plugin`), then deallocation behaviour may differ; you may refer to the +documentation for the EMM Plugin to understand its deallocation behaviour. + +Deallocation of all CUDA resources is tracked on a per-context basis. +When the last reference to a device memory is dropped, the underlying memory +is scheduled to be deallocated. The deallocation does not occur immediately. +It is added to a queue of pending deallocations. This design has two benefits: + +1. Resource deallocation API may cause the device to synchronize; thus, breaking + any asynchronous execution. Deferring the deallocation could avoid latency + in performance critical code section. +2. Some deallocation errors may cause all the remaining deallocations to fail. + Continued deallocation errors can cause critical errors at the CUDA driver + level. In some cases, this could mean a segmentation fault in the CUDA + driver. In the worst case, this could cause the system GUI to freeze and + could only recover with a system reset. When an error occurs during a + deallocation, the remaining pending deallocations are cancelled. Any + deallocation error will be reported. When the process is terminated, the + CUDA driver is able to release all allocated resources by the terminated + process. + +The deallocation queue is flushed automatically as soon as the following events +occur: + +- An allocation failed due to out-of-memory error. Allocation is retried after + flushing all deallocations. +- The deallocation queue has reached its maximum size, which defaults to 10. + User can override by setting the environment variable + `NUMBA_CUDA_MAX_PENDING_DEALLOCS_COUNT`. For example, + `NUMBA_CUDA_MAX_PENDING_DEALLOCS_COUNT=20`, increases the limit to 20. +- The maximum accumulated byte size of resources that are pending deallocation + is reached. This defaults to 20% of the device memory capacity. 
+ User can override by setting the environment variable + `NUMBA_CUDA_MAX_PENDING_DEALLOCS_RATIO`. For example, + `NUMBA_CUDA_MAX_PENDING_DEALLOCS_RATIO=0.5` sets the limit to 50% of the + capacity. + +Sometimes, it is desired to defer resource deallocation until a code section +ends. Most often, users want to avoid any implicit synchronization due to +deallocation. This can be done by using the following context manager: + +.. autofunction:: numba.cuda.defer_cleanup diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/overview.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/overview.rst new file mode 100644 index 0000000000000000000000000000000000000000..4d5f7d1c70170a514fddcc2d8900fc163a73625a --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/overview.rst @@ -0,0 +1,133 @@ +======== +Overview +======== + +Numba supports CUDA GPU programming by directly compiling a restricted subset +of Python code into CUDA kernels and device functions following the CUDA +execution model. Kernels written in Numba appear to have direct access +to NumPy arrays. NumPy arrays are transferred between the CPU and the +GPU automatically. + + +Terminology +=========== + +Several important terms in the topic of CUDA programming are listed here: + +- *host*: the CPU +- *device*: the GPU +- *host memory*: the system main memory +- *device memory*: onboard memory on a GPU card +- *kernels*: a GPU function launched by the host and executed on the device +- *device function*: a GPU function executed on the device which can only be + called from the device (i.e. from a kernel or another device function) + + +Programming model +================= + +Most CUDA programming facilities exposed by Numba map directly to the CUDA +C language offered by NVidia. Therefore, it is recommended you read the +official `CUDA C programming guide `_. 
+ + +Requirements +============ + +Supported GPUs +-------------- + +Numba supports CUDA-enabled GPUs with Compute Capability 3.5 or greater. +Support for devices with Compute Capability less than 5.3 is deprecated, and +will be removed in a future Numba release. + +Devices with Compute Capability 5.3 or greater include (but are not limited to): + +- Embedded platforms: NVIDIA Jetson Nano, TX1, TX2, Xavier NX, AGX Xavier, AGX + Orin. +- Desktop / Server GPUs: All GPUs with Pascal microarchitecture or later. E.g. + GTX 10 / 16 series, RTX 20 / 30 series, Quadro P / V / RTX series, RTX A + series, H100. +- Laptop GPUs: All GPUs with Pascal microarchitecture or later. E.g. MX series, + Quadro P / T series (mobile), RTX 20 / 30 series (mobile), RTX A series (mobile). + +Software +-------- + +Numba aims to support CUDA Toolkit versions released within the last 3 years. +An NVIDIA driver sufficient for the toolkit version is also required. +Presently: + +* 10.2 is the minimum required toolkit version. +* 11.2 or later is recommended, as it uses an NVVM version based on LLVM 7 (as + opposed to 3.4 in earlier releases). + +CUDA is supported on 64-bit Linux and Windows. + +If you are using Conda, you can install the CUDA toolkit with:: + + $ conda install cudatoolkit + +If you are not using Conda or if you want to use a different version of CUDA +toolkit, the following describes how Numba searches for a CUDA toolkit +installation. + +.. _cuda-bindings: + +CUDA Bindings +~~~~~~~~~~~~~ + +Numba supports interacting with the CUDA Driver API via the `NVIDIA CUDA Python +bindings `_ and its own ctypes-based +bindings. Functionality is equivalent between the two bindings. The +ctypes-based bindings are presently the default, but the NVIDIA bindings will +be used by default (if they are available in the environment) in a future Numba +release. 
+ +You can install the NVIDIA bindings with:: + + $ conda install nvidia::cuda-python + +if you are using Conda, or:: + + $ pip install cuda-python + +if you are using pip. + +The use of the NVIDIA bindings is enabled by setting the environment variable +:envvar:`NUMBA_CUDA_USE_NVIDIA_BINDING` to ``"1"``. + +.. _cudatoolkit-lookup: + +Setting CUDA Installation Path +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Numba searches for a CUDA toolkit installation in the following order: + +1. Conda installed `cudatoolkit` package. +2. Environment variable ``CUDA_HOME``, which points to the directory of the + installed CUDA toolkit (e.g. ``/home/user/cuda-10``) +3. System-wide installation at exactly ``/usr/local/cuda`` on Linux platforms. + Versioned installation paths (e.g. ``/usr/local/cuda-10.0``) are intentionally + ignored. Users can use ``CUDA_HOME`` to select specific versions. + +In addition to the CUDA toolkit libraries, which can be installed by conda into +an environment or installed system-wide by the `CUDA SDK installer +<https://developer.nvidia.com/cuda-downloads>`_, the CUDA target in Numba +also requires an up-to-date NVIDIA graphics driver. Updated graphics drivers +are also installed by the CUDA SDK installer, so there is no need to do both. +Note that on macOS, the CUDA SDK must be installed to get the required driver, +and the driver is only supported on macOS prior to 10.14 (Mojave). If the +``libcuda`` library is in a non-standard location, users can set the environment +variable ``NUMBA_CUDA_DRIVER`` to the file path (not the directory path) of the +shared library file. + + +Missing CUDA Features +===================== + +Numba does not yet implement all features of CUDA.
Some missing features +are listed below: + +* dynamic parallelism +* texture memory diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/random.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/random.rst new file mode 100644 index 0000000000000000000000000000000000000000..6f3ebd85a9b0e66123fd308a2336d3c4b7e6ad6a --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/random.rst @@ -0,0 +1,90 @@ + +.. _cuda-random: + +Random Number Generation +======================== + +Numba provides a random number generation algorithm that can be executed on +the GPU. Due to technical issues with how NVIDIA implemented cuRAND, however, +Numba's GPU random number generator is not based on cuRAND. Instead, Numba's +GPU RNG is an implementation of the `xoroshiro128+ algorithm +`_. The xoroshiro128+ algorithm has a period of +``2**128 - 1``, which is shorter than the period of the XORWOW algorithm +used by default in cuRAND, but xoroshiro128+ still passes the BigCrush tests +of random number generator quality. + +When using any RNG on the GPU, it is important to make sure that each thread +has its own RNG state, and they have been initialized to produce non-overlapping +sequences. The numba.cuda.random module provides a host function to do this, +as well as CUDA device functions to obtain uniformly or normally distributed +random numbers. + +.. note:: Numba (like cuRAND) uses the + `Box-Muller transform ` + to generate normally distributed random numbers from a uniform generator. + However, Box-Muller generates pairs of random numbers, and the current + implementation only returns one of them. As a result, generating normally + distributed values is half the speed of uniformly distributed values. + +.. 
automodule:: numba.cuda.random + :members: create_xoroshiro128p_states, init_xoroshiro128p_states, xoroshiro128p_uniform_float32, xoroshiro128p_uniform_float64, xoroshiro128p_normal_float32, xoroshiro128p_normal_float64 + :noindex: + +A simple example +'''''''''''''''' + +Here is a sample program that uses the random number generator:: + + from __future__ import print_function, absolute_import + + from numba import cuda + from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32 + import numpy as np + + @cuda.jit + def compute_pi(rng_states, iterations, out): + """Find the maximum value in values and store in result[0]""" + thread_id = cuda.grid(1) + + # Compute pi by drawing random (x, y) points and finding what + # fraction lie inside a unit circle + inside = 0 + for i in range(iterations): + x = xoroshiro128p_uniform_float32(rng_states, thread_id) + y = xoroshiro128p_uniform_float32(rng_states, thread_id) + if x**2 + y**2 <= 1.0: + inside += 1 + + out[thread_id] = 4.0 * inside / iterations + + threads_per_block = 64 + blocks = 24 + rng_states = create_xoroshiro128p_states(threads_per_block * blocks, seed=1) + out = np.zeros(threads_per_block * blocks, dtype=np.float32) + + compute_pi[blocks, threads_per_block](rng_states, 10000, out) + print('pi:', out.mean()) + +An example of managing RNG state size and using a 3D grid +''''''''''''''''''''''''''''''''''''''''''''''''''''''''' + +The number of RNG states scales with the number of threads using the RNG, so it +is often better to use strided loops in conjunction with the RNG in order to +keep the state size manageable. + +In the following example, which initializes a large 3D array with random +numbers, using one thread per output element would result in 453,617,100 RNG +states. This would take a long time to initialize and poorly utilize the GPU. +Instead, it uses a fixed size 3D grid with a total of 2,097,152 (``(16 ** 3) * +(8 ** 3)``) threads striding over the output array. 
The 3D thread indices +``startx``, ``starty``, and ``startz`` are linearized into a 1D index, +``tid``, to index into the 2,097,152 RNG states. + + +.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_random.py + :language: python + :caption: from ``test_ex_3d_grid`` of ``numba/cuda/tests/doc_examples/test_random.py`` + :start-after: magictoken.ex_3d_grid.begin + :end-before: magictoken.ex_3d_grid.end + :dedent: 8 + :linenos: diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/reduction.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/reduction.rst new file mode 100644 index 0000000000000000000000000000000000000000..674728408bd798f2c3af26150ebbd706c2840aab --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/reduction.rst @@ -0,0 +1,33 @@ +GPU Reduction +============== + +Writing a reduction algorithm for a CUDA GPU can be tricky. Numba provides a +``@reduce`` decorator for converting a simple binary operation into a reduction +kernel. An example follows:: + + import numpy + from numba import cuda + + @cuda.reduce + def sum_reduce(a, b): + return a + b + + A = (numpy.arange(1234, dtype=numpy.float64)) + 1 + expect = A.sum() # NumPy sum reduction + got = sum_reduce(A) # cuda sum reduction + assert expect == got + +Lambda functions can also be used here:: + + sum_reduce = cuda.reduce(lambda a, b: a + b) + +The Reduce class +---------------- + +The ``reduce`` decorator creates an instance of the ``Reduce`` class. +Currently, ``reduce`` is an alias to ``Reduce``, but this behavior is not +guaranteed. + +..
autoclass:: numba.cuda.Reduce + :members: __init__, __call__ + :member-order: bysource diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/simulator.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/simulator.rst new file mode 100644 index 0000000000000000000000000000000000000000..099ffc347d9646287a80d5121974c07ed3c6642e --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/simulator.rst @@ -0,0 +1,104 @@ + +.. _simulator: + +================================================= +Debugging CUDA Python with the CUDA Simulator +================================================= + +Numba includes a CUDA Simulator that implements most of the semantics in CUDA +Python using the Python interpreter and some additional Python code. This can +be used to debug CUDA Python code, either by adding print statements to your +code, or by using the debugger to step through the execution of an individual +thread. + +The simulator deliberately allows running non-CUDA code like starting a debugger +and printing arbitrary expressions for debugging purposes. Therefore, it is +best to start from code that compiles for the CUDA target, and then move over to +the simulator to investigate issues. + +Execution of kernels is performed by the simulator one block at a time. One +thread is spawned for each thread in the block, and scheduling of the execution +of these threads is left up to the operating system. + +Using the simulator +=================== + +The simulator is enabled by setting the environment variable +:envvar:`NUMBA_ENABLE_CUDASIM` to 1 prior to importing Numba. CUDA Python code +may then be executed as normal. The easiest way to use the debugger inside a +kernel is to stop only a single thread; otherwise the interaction with the +debugger is difficult to handle.
For example, the kernel below will stop in +the thread ``<<<(3,0,0), (1, 0, 0)>>>``:: + + @cuda.jit + def vec_add(A, B, out): + x = cuda.threadIdx.x + bx = cuda.blockIdx.x + bdx = cuda.blockDim.x + if x == 1 and bx == 3: + from pdb import set_trace; set_trace() + i = bx * bdx + x + out[i] = A[i] + B[i] + +when invoked with a one-dimensional grid and one-dimensional blocks. + +Supported features +================== + +The simulator aims to provide as complete a simulation of execution on a real +GPU as possible - in particular, the following are supported: + +* Atomic operations +* Constant memory +* Local memory +* Shared memory: declarations of shared memory arrays must be on separate source + lines, since the simulator uses source line information to keep track of + allocations of shared memory across threads. +* Mapped arrays. +* Host and device memory operations: copying and setting memory. +* :func:`.syncthreads` is supported - however, in the case where divergent + threads enter different :func:`.syncthreads` calls, the launch will not fail, + but unexpected behaviour will occur. A future version of the simulator may + detect this condition. +* The stream API is supported, but all operations occur sequentially and + synchronously, unlike on a real device. Synchronising on a stream is therefore + a no-op. +* The event API is also supported, but provides no meaningful timing + information. +* Data transfer to and from the GPU - in particular, creating array objects with + :func:`.device_array` and :func:`.device_array_like`. The APIs for pinned memory + :func:`.pinned` and :func:`.pinned_array` are also supported, but no pinning + takes place. +* The driver API implementation of the list of GPU contexts (``cuda.gpus`` and + ``cuda.cudadrv.devices.gpus``) is supported, and reports a single GPU context. + This context can be closed and reset as the real one would. +* The :func:`.detect` function is supported, and reports one device called + `SIMULATOR`. 
+* Cooperative grids: A cooperative kernel can be launched, but with only one + block - the simulator always returns ``1`` from a kernel overload's + :meth:`~numba.cuda.dispatcher._Kernel.max_cooperative_grid_blocks` method. + +Some limitations of the simulator include: + +* It does not perform type checking/type inference. If any argument types to a + jitted function are incorrect, or if the specification of the type of any + local variables are incorrect, this will not be detected by the simulator. +* Only one GPU is simulated. +* Multithreaded accesses to a single GPU are not supported, and will result in + unexpected behaviour. +* Most of the driver API is unimplemented. +* It is not possible to link PTX code with CUDA Python functions. +* Warps and warp-level operations are not yet implemented. +* Because the simulator executes kernels using the Python interpreter, + structured array access by attribute that works with the hardware target may + fail in the simulator - see :ref:`structured-array-access`. +* Operations directly against device arrays are only partially supported, that + is, testing equality, less than, greater than, and basic mathematical + operations are supported, but many other operations, such as the in-place + operators and bit operators are not. +* The :func:`ffs() ` function only works correctly for values + that can be represented using 32-bit integers. + +Obviously, the speed of the simulator is also much lower than that of a real +device. It may be necessary to reduce the size of input data and the size of the +CUDA grid in order to make debugging with the simulator tractable. 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/ufunc.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/ufunc.rst new file mode 100644 index 0000000000000000000000000000000000000000..c690557fce36e76563b7d7e761ff2469eaf88e58 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/cuda/ufunc.rst @@ -0,0 +1,154 @@ +CUDA Ufuncs and Generalized Ufuncs +================================== + +This page describes the CUDA ufunc-like object. + +To support the programming pattern of CUDA programs, CUDA Vectorize and +GUVectorize cannot produce a conventional ufunc. Instead, a ufunc-like +object is returned. This object is a close analog but not fully +compatible with a regular NumPy ufunc. The CUDA ufunc adds support for +passing intra-device arrays (already on the GPU device) to reduce +traffic over the PCI-express bus. It also accepts a `stream` keyword +for launching in asynchronous mode. + +Example: Basic Example +------------------------ + +:: + + import math + from numba import vectorize, cuda + import numpy as np + + @vectorize(['float32(float32, float32, float32)', + 'float64(float64, float64, float64)'], + target='cuda') + def cu_discriminant(a, b, c): + return math.sqrt(b ** 2 - 4 * a * c) + + N = 10000 + dtype = np.float32 + + # prepare the input + A = np.array(np.random.sample(N), dtype=dtype) + B = np.array(np.random.sample(N) + 10, dtype=dtype) + C = np.array(np.random.sample(N), dtype=dtype) + + D = cu_discriminant(A, B, C) + + print(D) # print result + +Example: Calling Device Functions +---------------------------------- + +All CUDA ufunc kernels have the ability to call other CUDA device functions:: + + from numba import vectorize, cuda + + # define a device function + @cuda.jit('float32(float32, float32, float32)', device=True, inline=True) + def cu_device_fn(x, y, z): + return x ** y / z + + # define a ufunc that calls our device function + @vectorize(['float32(float32, float32, float32)'], 
target='cuda') + def cu_ufunc(x, y, z): + return cu_device_fn(x, y, z) + + +Generalized CUDA ufuncs +----------------------- + +Generalized ufuncs may be executed on the GPU using CUDA, analogous to +the CUDA ufunc functionality. This may be accomplished as follows:: + + from numba import guvectorize + + @guvectorize(['void(float32[:,:], float32[:,:], float32[:,:])'], + '(m,n),(n,p)->(m,p)', target='cuda') + def matmulcore(A, B, C): + ... + +There are times when the gufunc kernel uses too many of a GPU's +resources, which can cause the kernel launch to fail. The user can +explicitly control the maximum size of the thread block by setting +the `max_blocksize` attribute on the compiled gufunc object. + +:: + + from numba import guvectorize + + @guvectorize(..., target='cuda') + def very_complex_kernel(A, B, C): + ... + + very_complex_kernel.max_blocksize = 32 # limits to 32 threads per block + +.. comment + + Example: A Chunk at a Time + --------------------------- + + Partitioning your data into chunks allows computation and memory transfer + to be overlapped. This can increase the throughput of your ufunc and + enables your ufunc to operate on data that is larger than the memory + capacity of your GPU. 
For example: + + :: + + import math + from numba import vectorize, cuda + import numpy as np + + # the ufunc kernel + def discriminant(a, b, c): + return math.sqrt(b ** 2 - 4 * a * c) + + cu_discriminant = vectorize(['float32(float32, float32, float32)', + 'float64(float64, float64, float64)'], + target='cuda')(discriminant) + + N = int(1e+8) + dtype = np.float32 + + # prepare the input + A = np.array(np.random.sample(N), dtype=dtype) + B = np.array(np.random.sample(N) + 10, dtype=dtype) + C = np.array(np.random.sample(N), dtype=dtype) + D = np.empty(A.shape, dtype=A.dtype) + + # create a CUDA stream + stream = cuda.stream() + + chunksize = 1e+6 + chunkcount = N // chunksize + + # partition NumPy arrays into chunks + # no copying is performed + sA = np.split(A, chunkcount) + sB = np.split(B, chunkcount) + sC = np.split(C, chunkcount) + sD = np.split(D, chunkcount) + + device_ptrs = [] + + with stream.auto_synchronize(): + # every operation in this context with be launched asynchronously + # by using the CUDA stream + + # for each chunk + for a, b, c, d in zip(sA, sB, sC, sD): + # transfer to device + dA = cuda.to_device(a, stream) + dB = cuda.to_device(b, stream) + dC = cuda.to_device(c, stream) + dD = cuda.to_device(d, stream, copy=False) # no copying + # launch kernel + cu_discriminant(dA, dB, dC, out=dD, stream=stream) + # retrieve result + dD.copy_to_host(d, stream) + # store device pointers to prevent them from freeing before + # the kernel is scheduled + device_ptrs.extend([dA, dB, dC, dD]) + + # data is ready at this point inside D diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/architecture.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/architecture.rst new file mode 100644 index 0000000000000000000000000000000000000000..3e9dffe1bfdecbec5213f0bb45f63143f198208d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/architecture.rst @@ -0,0 +1,909 @@ + +.. 
_architecture: + +================== +Numba architecture +================== + +Introduction +============ + +Numba is a compiler for Python bytecode with optional type-specialization. + +Suppose you enter a function like this into the standard Python interpreter +(henceforward referred to as "CPython"):: + + def add(a, b): + return a + b + +The interpreter will immediately parse the function and convert it into a +bytecode representation that describes how the CPython interpreter should +execute the function at a low level. For the example above, it looks +something like this:: + + >>> import dis + >>> dis.dis(add) + 2 0 LOAD_FAST 0 (a) + 3 LOAD_FAST 1 (b) + 6 BINARY_ADD + 7 RETURN_VALUE + + +CPython uses a stack-based interpreter (much like an HP calculator), so the +code first pushes two local variables onto the stack. The ``BINARY_ADD`` +opcode pops the top two arguments off the stack and makes a Python C API +function call that is equivalent to calling ``a.__add__(b)``. The result is +then pushed onto the top of the interpreter stack. Finally, the +``RETURN_VALUE`` opcode returns value on the top of the stack as the result of +the function call. + +Numba can take this bytecode and compile it to machine code that performs the +same operations as the CPython interpreter, treating ``a`` and ``b`` as +generic Python objects. The full semantics of Python are preserved, and the +compiled function can be used with any kind of objects that have the add +operator defined. When a Numba function is compiled this way, we say that it +has been compiled in :term:`object mode`, because the code still manipulates +Python objects. + +Numba code compiled in object mode is not much faster than executing the +original Python function in the CPython interpreter. However, if we +specialize the function to only run with certain data types, Numba can +generate much shorter and more efficient code that manipulates the data +natively without any calls into the Python C API. 
When code has been compiled +for specific data types so that the function body no longer relies on the +Python runtime, we say the function has been compiled in :term:`nopython mode`. +Numeric code compiled in nopython mode can be hundreds of times faster +than the original Python. + + +Compiler architecture +===================== + +Like many compilers, Numba can be conceptually divided into a +*frontend* and a *backend*. + +The Numba *frontend* comprises the stages which analyze the Python bytecode, +translate it to :term:`Numba IR` and perform various transformations and +analysis steps on the IR. One of the key steps is :term:`type inference`. +The frontend must succeed in typing all variables unambiguously in order +for the backend to generate code in :term:`nopython mode`, because the +backend uses type information to match appropriate code generators with +the values they operate on. + +The Numba *backend* walks the Numba IR resulting from the frontend analyses +and exploits the type information deduced by the type inference phase to +produce the right LLVM code for each encountered operation. After LLVM +code is produced, the LLVM library is asked to optimize it and generate +native processor code for the final, native function. + +There are other pieces besides the compiler frontend and backend, such +as the caching machinery for JIT functions. Those pieces are not considered +in this document. + + +Contexts +======== + +Numba is quite flexible, allowing it to generate code for different hardware +architectures like CPUs and GPUs. In order to support these different +applications, Numba uses a *typing context* and a *target context*. + +A *typing context* is used in the compiler frontend to perform type inference +on operations and values in the function. Similar typing contexts could be +used for many architectures because for nearly all cases, typing inference +is hardware-independent. 
However, Numba currently has a different typing +context for each target. + +A *target context* is used to generate the specific instruction sequence +required to operate on the Numba types identified during type inference. +Target contexts are architecture-specific and are flexible in defining +the execution model and available Python APIs. For example, Numba has a "cpu" +and a "cuda" context for those two kinds of architecture, and a "parallel" +context which produces multithreaded CPU code. + + +Compiler stages +=============== + +The :func:`~numba.jit` decorator in Numba ultimately calls +``numba.compiler.compile_extra()`` which compiles the Python function in a +multi-stage process, described below. + +Stage 1: Analyze bytecode +------------------------- + +At the start of compilation, the function bytecode is passed to an instance of +the Numba interpreter (``numba.interpreter``). The interpreter object +analyzes the bytecode to find the control flow graph (``numba.controlflow``). +The control flow graph (CFG) describes the ways that execution can move from one +block to the next inside the function as a result of loops and branches. + +The data flow analysis (``numba.dataflow``) takes the control flow graph and +traces how values get pushed and popped off the Python interpreter stack for +different code paths. This is important to understand the lifetimes of +variables on the stack, which are needed in Stage 2. + +If you set the environment variable ``NUMBA_DUMP_CFG`` to 1, Numba will dump +the results of the control flow graph analysis to the screen. Our ``add()`` +example is pretty boring, since there is only one statement block:: + + CFG adjacency lists: + {0: []} + CFG dominators: + {0: set([0])} + CFG post-dominators: + {0: set([0])} + CFG back edges: [] + CFG loops: + {} + CFG node-to-loops: + {0: []} + +A function with more complex flow control will have a more interesting +control flow graph. 
This function:: + + def doloops(n): + acc = 0 + for i in range(n): + acc += 1 + if n == 10: + break + return acc + +compiles to this bytecode:: + + 9 0 LOAD_CONST 1 (0) + 3 STORE_FAST 1 (acc) + + 10 6 SETUP_LOOP 46 (to 55) + 9 LOAD_GLOBAL 0 (range) + 12 LOAD_FAST 0 (n) + 15 CALL_FUNCTION 1 + 18 GET_ITER + >> 19 FOR_ITER 32 (to 54) + 22 STORE_FAST 2 (i) + + 11 25 LOAD_FAST 1 (acc) + 28 LOAD_CONST 2 (1) + 31 INPLACE_ADD + 32 STORE_FAST 1 (acc) + + 12 35 LOAD_FAST 0 (n) + 38 LOAD_CONST 3 (10) + 41 COMPARE_OP 2 (==) + 44 POP_JUMP_IF_FALSE 19 + + 13 47 BREAK_LOOP + 48 JUMP_ABSOLUTE 19 + 51 JUMP_ABSOLUTE 19 + >> 54 POP_BLOCK + + 14 >> 55 LOAD_FAST 1 (acc) + 58 RETURN_VALUE + +The corresponding CFG for this bytecode is:: + + CFG adjacency lists: + {0: [6], 6: [19], 19: [54, 22], 22: [19, 47], 47: [55], 54: [55], 55: []} + CFG dominators: + {0: set([0]), + 6: set([0, 6]), + 19: set([0, 6, 19]), + 22: set([0, 6, 19, 22]), + 47: set([0, 6, 19, 22, 47]), + 54: set([0, 6, 19, 54]), + 55: set([0, 6, 19, 55])} + CFG post-dominators: + {0: set([0, 6, 19, 55]), + 6: set([6, 19, 55]), + 19: set([19, 55]), + 22: set([22, 55]), + 47: set([47, 55]), + 54: set([54, 55]), + 55: set([55])} + CFG back edges: [(22, 19)] + CFG loops: + {19: Loop(entries=set([6]), exits=set([54, 47]), header=19, body=set([19, 22]))} + CFG node-to-loops: + {0: [], 6: [], 19: [19], 22: [19], 47: [], 54: [], 55: []} + +The numbers in the CFG refer to the bytecode offsets shown just to the left +of the opcode names above. + +.. _arch_generate_numba_ir: + +Stage 2: Generate the Numba IR +------------------------------ + +Once the control flow and data analyses are complete, the Numba interpreter +can step through the bytecode and translate it into an Numba-internal +intermediate representation. This translation process changes the function +from a stack machine representation (used by the Python interpreter) to a +register machine representation (used by LLVM). 
+ +Although the IR is stored in memory as a tree of objects, it can be serialized +to a string for debugging. If you set the environment variable +``NUMBA_DUMP_IR`` equal to 1, the Numba IR will be dumped to the screen. For +the ``add()`` function described above, the Numba IR looks like:: + + label 0: + a = arg(0, name=a) ['a'] + b = arg(1, name=b) ['b'] + $0.3 = a + b ['$0.3', 'a', 'b'] + del b [] + del a [] + $0.4 = cast(value=$0.3) ['$0.3', '$0.4'] + del $0.3 [] + return $0.4 ['$0.4'] + +The ``del`` instructions are produced by :ref:`live variable analysis`. +Those instructions ensure references are not leaked. +In :term:`nopython mode`, some objects are tracked by the Numba runtime and +some are not. For tracked objects, a dereference operation is emitted; +otherwise, the instruction is a no-op. +In :term:`object mode` each variable contains an owned reference to a PyObject. + + +.. _`rewrite-untyped-ir`: + +Stage 3: Rewrite untyped IR +--------------------------- + +Before running type inference, it may be desirable to run certain +transformations on the Numba IR. One such example is to detect ``raise`` +statements which have an implicitly constant argument, so as to +support them in :term:`nopython mode`.
Let's say you compile the +following function with Numba:: + + def f(x): + if x == 0: + raise ValueError("x cannot be zero") + +If you set the :envvar:`NUMBA_DUMP_IR` environment variable to ``1``, +you'll see the IR being rewritten before the type inference phase:: + + REWRITING: + del $0.3 [] + $12.1 = global(ValueError: ) ['$12.1'] + $const12.2 = const(str, x cannot be zero) ['$const12.2'] + $12.3 = call $12.1($const12.2) ['$12.1', '$12.3', '$const12.2'] + del $const12.2 [] + del $12.1 [] + raise $12.3 ['$12.3'] + ____________________________________________________________ + del $0.3 [] + $12.1 = global(ValueError: ) ['$12.1'] + $const12.2 = const(str, x cannot be zero) ['$const12.2'] + $12.3 = call $12.1($const12.2) ['$12.1', '$12.3', '$const12.2'] + del $const12.2 [] + del $12.1 [] + raise ('x cannot be zero') [] + + +.. _arch_type_inference: + +Stage 4: Infer types +-------------------- + +Now that the Numba IR has been generated, type analysis can be performed. The +types of the function arguments can be taken either from the explicit function +signature given in the ``@jit`` decorator (such as ``@jit('float64(float64, +float64)')``), or they can be taken from the types of the actual function +arguments if compilation is happening when the function is first called. + +The type inference engine is found in ``numba.typeinfer``. Its job is to +assign a type to every intermediate variable in the Numba IR. The result of +this pass can be seen by setting the :envvar:`NUMBA_DUMP_ANNOTATION` +environment variable to 1: + +.. 
code-block:: python + + -----------------------------------ANNOTATION----------------------------------- + # File: archex.py + # --- LINE 4 --- + + @jit(nopython=True) + + # --- LINE 5 --- + + def add(a, b): + + # --- LINE 6 --- + # label 0 + # a = arg(0, name=a) :: int64 + # b = arg(1, name=b) :: int64 + # $0.3 = a + b :: int64 + # del b + # del a + # $0.4 = cast(value=$0.3) :: int64 + # del $0.3 + # return $0.4 + + return a + b + + +If type inference fails to find a consistent type assignment for all the +intermediate variables, it will label every variable as type ``pyobject`` and +fall back to object mode. Type inference can fail when unsupported Python +types, language features, or functions are used in the function body. + + +.. _`rewrite-typed-ir`: + +Stage 5a: Rewrite typed IR +-------------------------- + +This pass's purpose is to perform any high-level optimizations that still +require, or could at least benefit from, Numba IR type information. + +One example of a problem domain that isn't as easily optimized once +lowered is the domain of multidimensional array operations. When +Numba lowers an array operation, Numba treats the operation like a +full ufunc kernel. During lowering a single array operation, Numba +generates an inline broadcasting loop that creates a new result array. +Then Numba generates an application loop that applies the operator +over the array inputs. Recognizing and rewriting these loops once +they are lowered into LLVM is hard, if not impossible. + +An example pair of optimizations in the domain of array operators is +loop fusion and shortcut deforestation. When the optimizer +recognizes that the output of one array operator is being fed into +another array operator, and only to that array operator, it can fuse +the two loops into a single loop. 
The optimizer can further eliminate +the temporary array allocated for the initial operation by directly +feeding the result of the first operation into the second, skipping +the store and load to the intermediate array. This elimination is +known as shortcut deforestation. Numba currently uses the rewrite +pass to implement these array optimizations. For more information, +please consult the ":ref:`case-study-array-expressions`" subsection, +later in this document. + +One can see the result of rewriting by setting the +:envvar:`NUMBA_DUMP_IR` environment variable to a non-zero value (such +as 1). The following example shows the output of the rewrite pass as +it recognizes an array expression consisting of a multiply and add, +and outputs a fused kernel as a special operator, :func:`arrayexpr`:: + + ______________________________________________________________________ + REWRITING: + a0 = arg(0, name=a0) ['a0'] + a1 = arg(1, name=a1) ['a1'] + a2 = arg(2, name=a2) ['a2'] + $0.3 = a0 * a1 ['$0.3', 'a0', 'a1'] + del a1 [] + del a0 [] + $0.5 = $0.3 + a2 ['$0.3', '$0.5', 'a2'] + del a2 [] + del $0.3 [] + $0.6 = cast(value=$0.5) ['$0.5', '$0.6'] + del $0.5 [] + return $0.6 ['$0.6'] + ____________________________________________________________ + a0 = arg(0, name=a0) ['a0'] + a1 = arg(1, name=a1) ['a1'] + a2 = arg(2, name=a2) ['a2'] + $0.5 = arrayexpr(ty=array(float64, 1d, C), expr=('+', [('*', [Var(a0, test.py (14)), Var(a1, test.py (14))]), Var(a2, test.py (14))])) ['$0.5', 'a0', 'a1', 'a2'] + del a0 [] + del a1 [] + del a2 [] + $0.6 = cast(value=$0.5) ['$0.5', '$0.6'] + del $0.5 [] + return $0.6 ['$0.6'] + ______________________________________________________________________ + +Following this rewrite, Numba lowers the array expression into a new +ufunc-like function that is inlined into a single loop that only +allocates a single result array. + + +.. 
_`parallel-accelerator`: + +Stage 5b: Perform Automatic Parallelization +------------------------------------------- + +This pass is only performed if the ``parallel`` option in the :func:`~numba.jit` +decorator is set to ``True``. This pass finds parallelism implicit in the +semantics of operations in the Numba IR and replaces those operations +with explicitly parallel representations of those operations using a +special `parfor` operator. Then, optimizations are performed to maximize +the number of parfors that are adjacent to each other such that they can +then be fused together into one parfor that takes only one pass over the +data and will thus typically have better cache performance. Finally, +during lowering, these parfor operators are converted to a form similar +to guvectorize to implement the actual parallelism. + +The automatic parallelization pass has a number of sub-passes, many of +which are controllable using a dictionary of options passed via the +``parallel`` keyword argument to :func:`~numba.jit`:: + + { 'comprehension': True/False, # parallel comprehension + 'prange': True/False, # parallel for-loop + 'numpy': True/False, # parallel numpy calls + 'reduction': True/False, # parallel reduce calls + 'setitem': True/False, # parallel setitem + 'stencil': True/False, # parallel stencils + 'fusion': True/False, # enable fusion or not + } + +The default is set to `True` for all of them. The sub-passes are +described in more detail in the following paragraphs. + +#. CFG Simplification + Sometimes Numba IR will contain chains of blocks containing no loops which + are merged in this sub-pass into single blocks. This sub-pass simplifies + subsequent analysis of the IR. + +#. Numpy canonicalization + Some Numpy operations can be written as operations on Numpy objects (e.g. + ``arr.sum()``), or as calls to Numpy taking those objects (e.g. + ``numpy.sum(arr)``). This sub-pass converts all such operations to the + latter form for cleaner subsequent analysis. 
+ +#. Array analysis + A critical requirement for later parfor fusion is that parfors have + identical iteration spaces and these iteration spaces typically correspond + to the sizes of the dimensions of Numpy arrays. In this sub-pass, the IR is + analyzed to determine equivalence classes for the dimensions of Numpy + arrays. Consider the example, ``a = b + 1``, where ``a`` and ``b`` are both + Numpy arrays. Here, we know that each dimension of ``a`` must have the same + equivalence class as the corresponding dimension of ``b``. Typically, + routines rich in Numpy operations will enable equivalence classes to be + fully known for all arrays created within a function. + + Array analysis will also reason about size equivalence for slice selection, + and boolean array masking (one dimensional only). For example, it is able to + infer that ``a[1 : n-1]`` is of the same size as ``b[0 : n-2]``. + + Array analysis may also insert safety assumptions to ensure pre-conditions + related to array sizes are met before an operation can be parallelized. + For example, ``np.dot(X, w)`` between a 2-D matrix ``X`` and a 1-D vector ``w`` + requires that the second dimension of ``X`` is of the same size as ``w``. + Usually this kind of runtime check is automatically inserted, but if array + analysis can infer such equivalence, the check is skipped. + + Users can even help array analysis by turning implicit knowledge about + array sizes into explicit assertions. For example, in the code below: + + .. code-block:: python + + @numba.njit(parallel=True) + def logistic_regression(Y, X, w, iterations): + assert(X.shape == (Y.shape[0], w.shape[0])) + for i in range(iterations): + w -= np.dot(((1.0 / (1.0 + np.exp(-Y * np.dot(X, w))) - 1.0) * Y), X) + return w + + Making the explicit assertion helps eliminate all bounds checks in the + rest of the function. + +#.
``prange()`` to parfor + The use of prange (:ref:`numba-prange`) in a for loop is an explicit + indication from the programmer that all iterations of the for loop can + execute in parallel. In this sub-pass, we analyze the CFG to locate loops + and to convert those loops controlled by a prange object to the explicit + `parfor` operator. Each explicit parfor operator consists of: + + a. A list of loop nest information that describes the iteration space of the + parfor. Each entry in the loop nest list contains an indexing variable, + the start of the range, the end of the range, and the step value for each + iteration. + #. An initialization (init) block which contains instructions to be executed + one time before the parfor begins executing. + #. A loop body comprising a set of basic blocks that correspond to the body + of the loop and compute one point in the iteration space. + #. The index variables used for each dimension of the iteration space. + + For parfor `pranges`, the loop nest is a single entry where the start, + stop, and step fields come from the specified `prange`. The init block is + empty for `prange` parfors and the loop body is the set of blocks in the + loop minus the loop header. + + With parallelization on, array comprehensions (:ref:`pysupported-comprehension`) + will also be translated to prange so as to run in parallel. This behavior + can be disabled by setting ``parallel={'comprehension': False}``. + + Likewise, the overall `prange` to `parfor` translation can be disabled by + setting ``parallel={'prange': False}``, in which case `prange` is treated the + same as `range`. + +#. Numpy to parfor + In this sub-pass, Numpy functions such as ``ones``, ``zeros``, ``dot``, most + of the random number generating functions, arrayexprs (from Section + :ref:`rewrite-typed-ir`), and Numpy reductions are converted to parfors.
+ Generally, this conversion creates the loop nest list, whose length is equal + to the number of dimensions of the left-hand side of the assignment + instruction in the IR. The number and size of the dimensions of the + left-hand-side array is taken from the array analysis information generated + in sub-pass 3 above. An instruction to create the result Numpy array is + generated and stored in the new parfor's init block. A basic block is + created for the loop body and an instruction is generated and added to the + end of that block to store the result of the computation into the array at + the current point in the iteration space. The result stored into the array + depends on the operation that is being converted. For example, for ``ones``, + the value stored is a constant 1. For calls to generate a random array, the + value comes from a call to the same random number function but with the size + parameter dropped and therefore returning a scalar. For arrayexpr operators, + the arrayexpr tree is converted to Numba IR and the value at the root of that + expression tree is used to write into the output array. The translation from + Numpy functions and arrayexpr operators to `parfor` can be disabled by + setting ``parallel={'numpy': False}``. + + For reductions, the loop nest list is similarly created using the array + analysis information for the array being reduced. In the init block, the + initial value is assigned to the reduction variable. The loop body consists + of a single block in which the next value in the iteration space is fetched + and the reduction operation is applied to that value and the current + reduction value and the result stored back into the reduction value. + The translation of reduction functions to `parfor` can be disabled by + setting ``parallel={'reduction': False}``. + + Setting the :envvar:`NUMBA_DEBUG_ARRAY_OPT_STATS` environment variable to + 1 will show some statistics about parfor conversions in general. + +#. 
Setitem to parfor + Setting a range of array elements using a slice or boolean array selection + can also run in parallel. A statement such as ``A[P] = B[Q]`` + (or a simpler case ``A[P] = c``, where ``c`` is a scalar) is translated to + `parfor` if one of the following conditions is met: + + a. ``P`` and ``Q`` are slices or multi-dimensional selectors involving + scalars and slices, and ``A[P]`` and ``B[Q]`` are considered size + equivalent by array analysis. Only 2-value slice/range is supported, + 3-value with a step will not be translated to `parfor`. + #. ``P`` and ``Q`` are the same boolean array. + + This translation can be disabled by setting ``parallel={'setitem': False}``. + +#. Simplification + Performs a copy propagation and dead code elimination pass. + +#. Fusion + This sub-pass first processes each basic block and does a reordering of the + instructions within the block with the goal of pushing parfors lower in the + block and lifting non-parfors towards the start of the block. In practice, + this approach does a good job of getting parfors adjacent to each other in + the IR, which enables more parfors to then be fused. During parfor fusion, + each basic block is repeatedly scanned until no further fusion is possible. + During this scan, each set of adjacent instructions is considered. + Adjacent instructions are fused together if: + + a. they are both parfors + #. the parfors' loop nests are the same size and the array equivalence + classes for each dimension of the loop nests are the same, and + #. the first parfor does not create a reduction variable used by the + second parfor. + + The two parfors are fused together by adding the second parfor's init block + to the first's, merging the two parfors' loop bodies together and replacing + the instances of the second parfor's loop index variables in the second + parfor's body with the loop index variables for the first parfor. + Fusion can be disabled by setting ``parallel={'fusion': False}``.
+ + Setting the :envvar:`NUMBA_DEBUG_ARRAY_OPT_STATS` environment variable to + 1 will show some statistics about parfor fusions. + +#. Push call objects and compute parfor parameters + In the lowering phase described in Section :ref:`lowering`, each parfor + becomes a separate function executed in parallel in ``guvectorize`` + (:ref:`guvectorize`) style. Since parfors may use variables defined + previously in a function, when those parfors become separate functions, + those variables must be passed to the parfor function as parameters. In + this sub-pass, a use-def scan is made over each parfor body and liveness + information is used to determine which variables are used but not defined by + the parfor. That list of variables is stored here in the parfor for use + during lowering. Function variables are a special case in this process + since function variables cannot be passed to functions compiled in nopython + mode. Instead, for function variables, this sub-pass pushes the assignment + instruction to the function variable into the parfor body so that those do + not need to be passed as parameters. + + To see the intermediate IR between the above sub-passes and other debugging + information, set the :envvar:`NUMBA_DEBUG_ARRAY_OPT` environment variable to + 1. 
For the example in Section :ref:`rewrite-typed-ir`, the following IR with + a parfor is generated during this stage:: + + ______________________________________________________________________ + label 0: + a0 = arg(0, name=a0) ['a0'] + a0_sh_attr0.0 = getattr(attr=shape, value=a0) ['a0', 'a0_sh_attr0.0'] + $consta00.1 = const(int, 0) ['$consta00.1'] + a0size0.2 = static_getitem(value=a0_sh_attr0.0, index_var=$consta00.1, index=0) ['$consta00.1', 'a0_sh_attr0.0', 'a0size0.2'] + a1 = arg(1, name=a1) ['a1'] + a1_sh_attr0.3 = getattr(attr=shape, value=a1) ['a1', 'a1_sh_attr0.3'] + $consta10.4 = const(int, 0) ['$consta10.4'] + a1size0.5 = static_getitem(value=a1_sh_attr0.3, index_var=$consta10.4, index=0) ['$consta10.4', 'a1_sh_attr0.3', 'a1size0.5'] + a2 = arg(2, name=a2) ['a2'] + a2_sh_attr0.6 = getattr(attr=shape, value=a2) ['a2', 'a2_sh_attr0.6'] + $consta20.7 = const(int, 0) ['$consta20.7'] + a2size0.8 = static_getitem(value=a2_sh_attr0.6, index_var=$consta20.7, index=0) ['$consta20.7', 'a2_sh_attr0.6', 'a2size0.8'] + ---begin parfor 0--- + index_var = parfor_index.9 + LoopNest(index_variable=parfor_index.9, range=0,a0size0.2,1 correlation=5) + init block: + $np_g_var.10 = global(np: ) ['$np_g_var.10'] + $empty_attr_attr.11 = getattr(attr=empty, value=$np_g_var.10) ['$empty_attr_attr.11', '$np_g_var.10'] + $np_typ_var.12 = getattr(attr=float64, value=$np_g_var.10) ['$np_g_var.10', '$np_typ_var.12'] + $0.5 = call $empty_attr_attr.11(a0size0.2, $np_typ_var.12, kws=(), func=$empty_attr_attr.11, vararg=None, args=[Var(a0size0.2, test2.py (7)), Var($np_typ_var.12, test2.py (7))]) ['$0.5', '$empty_attr_attr.11', '$np_typ_var.12', 'a0size0.2'] + label 1: + $arg_out_var.15 = getitem(value=a0, index=parfor_index.9) ['$arg_out_var.15', 'a0', 'parfor_index.9'] + $arg_out_var.16 = getitem(value=a1, index=parfor_index.9) ['$arg_out_var.16', 'a1', 'parfor_index.9'] + $arg_out_var.14 = $arg_out_var.15 * $arg_out_var.16 ['$arg_out_var.14', '$arg_out_var.15', '$arg_out_var.16'] + 
$arg_out_var.17 = getitem(value=a2, index=parfor_index.9) ['$arg_out_var.17', 'a2', 'parfor_index.9'] + $expr_out_var.13 = $arg_out_var.14 + $arg_out_var.17 ['$arg_out_var.14', '$arg_out_var.17', '$expr_out_var.13'] + $0.5[parfor_index.9] = $expr_out_var.13 ['$0.5', '$expr_out_var.13', 'parfor_index.9'] + ----end parfor 0---- + $0.6 = cast(value=$0.5) ['$0.5', '$0.6'] + return $0.6 ['$0.6'] + ______________________________________________________________________ + + .. _`lowering`: + +Stage 6a: Generate nopython LLVM IR +----------------------------------- + +If type inference succeeds in finding a Numba type for every intermediate +variable, then Numba can (potentially) generate specialized native code. This +process is called :term:`lowering`. The Numba IR tree is translated into +LLVM IR by using helper classes from `llvmlite `_. +The machine-generated LLVM IR can seem unnecessarily verbose, but the LLVM +toolchain is able to optimize it quite easily into compact, efficient code. + +The basic lowering algorithm is generic, but the specifics of how particular +Numba IR nodes are translated to LLVM instructions is handled by the +target context selected for compilation. The default target context is +the "cpu" context, defined in ``numba.targets.cpu``. + +The LLVM IR can be displayed by setting the :envvar:`NUMBA_DUMP_LLVM` environment +variable to 1. For the "cpu" context, our ``add()`` example would look like: + +.. 
code-block:: llvm + + define i32 @"__main__.add$1.int64.int64"(i64* %"retptr", + {i8*, i32}** %"excinfo", + i8* %"env", + i64 %"arg.a", i64 %"arg.b") + { + entry: + %"a" = alloca i64 + %"b" = alloca i64 + %"$0.3" = alloca i64 + %"$0.4" = alloca i64 + br label %"B0" + B0: + store i64 %"arg.a", i64* %"a" + store i64 %"arg.b", i64* %"b" + %".8" = load i64* %"a" + %".9" = load i64* %"b" + %".10" = add i64 %".8", %".9" + store i64 %".10", i64* %"$0.3" + %".12" = load i64* %"$0.3" + store i64 %".12", i64* %"$0.4" + %".14" = load i64* %"$0.4" + store i64 %".14", i64* %"retptr" + ret i32 0 + } + +The post-optimization LLVM IR can be output by setting +:envvar:`NUMBA_DUMP_OPTIMIZED` to 1. The optimizer shortens the code +generated above quite significantly: + +.. code-block:: llvm + + define i32 @"__main__.add$1.int64.int64"(i64* nocapture %retptr, + { i8*, i32 }** nocapture readnone %excinfo, + i8* nocapture readnone %env, + i64 %arg.a, i64 %arg.b) + { + entry: + %.10 = add i64 %arg.b, %arg.a + store i64 %.10, i64* %retptr, align 8 + ret i32 0 + } + +If created during :ref:`parallel-accelerator`, parfor operations are +lowered in the following manner. First, instructions in the parfor's init +block are lowered into the existing function using the normal lowering code. +Second, the loop body of the parfor is turned into a separate GUFunc. +Third, code is emitted for the current function to call the parallel GUFunc. + +To create a GUFunc from the parfor body, the signature of the GUFunc is +created by taking the parfor parameters as identified in step 9 of +Stage :ref:`parallel-accelerator` and adding to that a special `schedule` +parameter, across which the GUFunc will be parallelized. The schedule +parameter is in effect a static schedule mapping portions of the parfor +iteration space to Numba threads and so the length of the schedule +array is the same as the number of configured Numba threads. 
To make +this process easier and somewhat less dependent on changes to Numba IR, +this stage creates a Python function as text that contains the parameters +to the GUFunc and iteration code that takes the current schedule entry +and loops through the specified portion of the iteration space. In the +body of that loop, a special sentinel is inserted for subsequent easy +location. This code that handles the processing of the iteration space +is then ``eval``'ed into existence and the Numba compiler's ``run_frontend`` +function is called to generate IR. That IR is scanned to locate the +sentinel and the sentinel is replaced with the loop body of the parfor. +Then, the process of creating the parallel GUFunc is completed by +compiling this merged IR with the Numba compiler's ``compile_ir`` function. + +To call the parallel GUFunc, the static schedule must be created. +Code is inserted to call a function named ``do_scheduling``. This function +is called with the size of each of the parfor's dimensions and the number +`N` of configured Numba threads (:envvar:`NUMBA_NUM_THREADS`). +The ``do_scheduling`` function will divide +the iteration space into N approximately equal sized regions (linear for +1D, rectangular for 2D, or hyperrectangles for 3+D) and the resulting +schedule is passed to the parallel GUFunc. The number of threads +dedicated to a given dimension of the full iteration space is roughly +proportional to the ratio of the size of the given dimension to the sum +of the sizes of all the dimensions of the iteration space. + +Parallel reductions are not natively provided by GUFuncs but the parfor +lowering strategy allows us to use GUFuncs in a way that reductions can +be performed in parallel. To accomplish this, for each reduction variable +computed by a parfor, the parallel GUFunc and the code that calls it are +modified to make the scalar reduction variable into an array of reduction +variables whose length is equal to the number of Numba threads.
In addition, +the GUFunc still contains a scalar version of the reduction variable that +is updated by the parfor body during each iteration. One time at the +end of the GUFunc this local reduction variable is copied into the +reduction array. In this way, false sharing of the reduction array is +prevented. Code is also inserted into the main +function after the parallel GUFunc has returned that does a reduction +across this smaller reduction array and this final reduction value is +then stored into the original scalar reduction variable. + +The GUFunc corresponding to the example from Section :ref:`parallel-accelerator` +can be seen below:: + + ______________________________________________________________________ + label 0: + sched.29 = arg(0, name=sched) ['sched.29'] + a0 = arg(1, name=a0) ['a0'] + a1 = arg(2, name=a1) ['a1'] + a2 = arg(3, name=a2) ['a2'] + _0_5 = arg(4, name=_0_5) ['_0_5'] + $3.1.24 = global(range: ) ['$3.1.24'] + $const3.3.21 = const(int, 0) ['$const3.3.21'] + $3.4.23 = getitem(value=sched.29, index=$const3.3.21) ['$3.4.23', '$const3.3.21', 'sched.29'] + $const3.6.28 = const(int, 1) ['$const3.6.28'] + $3.7.27 = getitem(value=sched.29, index=$const3.6.28) ['$3.7.27', '$const3.6.28', 'sched.29'] + $const3.8.32 = const(int, 1) ['$const3.8.32'] + $3.9.31 = $3.7.27 + $const3.8.32 ['$3.7.27', '$3.9.31', '$const3.8.32'] + $3.10.36 = call $3.1.24($3.4.23, $3.9.31, kws=[], func=$3.1.24, vararg=None, args=[Var($3.4.23, (2)), Var($3.9.31, (2))]) ['$3.1.24', '$3.10.36', '$3.4.23', '$3.9.31'] + $3.11.30 = getiter(value=$3.10.36) ['$3.10.36', '$3.11.30'] + jump 1 [] + label 1: + $28.2.35 = iternext(value=$3.11.30) ['$28.2.35', '$3.11.30'] + $28.3.25 = pair_first(value=$28.2.35) ['$28.2.35', '$28.3.25'] + $28.4.40 = pair_second(value=$28.2.35) ['$28.2.35', '$28.4.40'] + branch $28.4.40, 2, 3 ['$28.4.40'] + label 2: + $arg_out_var.15 = getitem(value=a0, index=$28.3.25) ['$28.3.25', '$arg_out_var.15', 'a0'] + $arg_out_var.16 = getitem(value=a1, index=$28.3.25) 
['$28.3.25', '$arg_out_var.16', 'a1'] + $arg_out_var.14 = $arg_out_var.15 * $arg_out_var.16 ['$arg_out_var.14', '$arg_out_var.15', '$arg_out_var.16'] + $arg_out_var.17 = getitem(value=a2, index=$28.3.25) ['$28.3.25', '$arg_out_var.17', 'a2'] + $expr_out_var.13 = $arg_out_var.14 + $arg_out_var.17 ['$arg_out_var.14', '$arg_out_var.17', '$expr_out_var.13'] + _0_5[$28.3.25] = $expr_out_var.13 ['$28.3.25', '$expr_out_var.13', '_0_5'] + jump 1 [] + label 3: + $const44.1.33 = const(NoneType, None) ['$const44.1.33'] + $44.2.39 = cast(value=$const44.1.33) ['$44.2.39', '$const44.1.33'] + return $44.2.39 ['$44.2.39'] + ______________________________________________________________________ + + +Stage 6b: Generate object mode LLVM IR +-------------------------------------- + +If type inference fails to find Numba types for all values inside a function, +the function will be compiled in object mode. The generated LLVM will be +significantly longer, as the compiled code will need to make calls to the +`Python C API `_ to perform basically all +operations. The optimized LLVM for our example ``add()`` function is: + +.. 
code-block:: llvm + + @PyExc_SystemError = external global i8 + @".const.Numba_internal_error:_object_mode_function_called_without_an_environment" = internal constant [73 x i8] c"Numba internal error: object mode function called without an environment\00" + @".const.name_'a'_is_not_defined" = internal constant [24 x i8] c"name 'a' is not defined\00" + @PyExc_NameError = external global i8 + @".const.name_'b'_is_not_defined" = internal constant [24 x i8] c"name 'b' is not defined\00" + + define i32 @"__main__.add$1.pyobject.pyobject"(i8** nocapture %retptr, { i8*, i32 }** nocapture readnone %excinfo, i8* readnone %env, i8* %arg.a, i8* %arg.b) { + entry: + %.6 = icmp eq i8* %env, null + br i1 %.6, label %entry.if, label %entry.endif, !prof !0 + + entry.if: ; preds = %entry + tail call void @PyErr_SetString(i8* @PyExc_SystemError, i8* getelementptr inbounds ([73 x i8]* @".const.Numba_internal_error:_object_mode_function_called_without_an_environment", i64 0, i64 0)) + ret i32 -1 + + entry.endif: ; preds = %entry + tail call void @Py_IncRef(i8* %arg.a) + tail call void @Py_IncRef(i8* %arg.b) + %.21 = icmp eq i8* %arg.a, null + br i1 %.21, label %B0.if, label %B0.endif, !prof !0 + + B0.if: ; preds = %entry.endif + tail call void @PyErr_SetString(i8* @PyExc_NameError, i8* getelementptr inbounds ([24 x i8]* @".const.name_'a'_is_not_defined", i64 0, i64 0)) + tail call void @Py_DecRef(i8* null) + tail call void @Py_DecRef(i8* %arg.b) + ret i32 -1 + + B0.endif: ; preds = %entry.endif + %.30 = icmp eq i8* %arg.b, null + br i1 %.30, label %B0.endif1, label %B0.endif1.1, !prof !0 + + B0.endif1: ; preds = %B0.endif + tail call void @PyErr_SetString(i8* @PyExc_NameError, i8* getelementptr inbounds ([24 x i8]* @".const.name_'b'_is_not_defined", i64 0, i64 0)) + tail call void @Py_DecRef(i8* %arg.a) + tail call void @Py_DecRef(i8* null) + ret i32 -1 + + B0.endif1.1: ; preds = %B0.endif + %.38 = tail call i8* @PyNumber_Add(i8* %arg.a, i8* %arg.b) + %.39 = icmp eq i8* %.38, null + 
br i1 %.39, label %B0.endif1.1.if, label %B0.endif1.1.endif, !prof !0 + + B0.endif1.1.if: ; preds = %B0.endif1.1 + tail call void @Py_DecRef(i8* %arg.a) + tail call void @Py_DecRef(i8* %arg.b) + ret i32 -1 + + B0.endif1.1.endif: ; preds = %B0.endif1.1 + tail call void @Py_DecRef(i8* %arg.b) + tail call void @Py_DecRef(i8* %arg.a) + tail call void @Py_IncRef(i8* %.38) + tail call void @Py_DecRef(i8* %.38) + store i8* %.38, i8** %retptr, align 8 + ret i32 0 + } + + declare void @PyErr_SetString(i8*, i8*) + + declare void @Py_IncRef(i8*) + + declare void @Py_DecRef(i8*) + + declare i8* @PyNumber_Add(i8*, i8*) + + +The careful reader might notice several unnecessary calls to ``Py_IncRef`` +and ``Py_DecRef`` in the generated code. Currently Numba isn't able to +optimize those away. + +Object mode compilation will also attempt to identify loops which can be +extracted and statically-typed for "nopython" compilation. This process is +called *loop-lifting*, and results in the creation of a hidden nopython mode +function just containing the loop which is then called from the original +function. Loop-lifting helps improve the performance of functions that +need to access uncompilable code (such as I/O or plotting code) but still +contain a time-intensive section of compilable code. + +Stage 7: Compile LLVM IR to machine code +---------------------------------------- + +In both :term:`object mode` and :term:`nopython mode`, the generated LLVM IR +is compiled by the LLVM JIT compiler and the machine code is loaded into +memory. A Python wrapper is also created (defined in +``numba.dispatcher.Dispatcher``) which can do the dynamic dispatch to the +correct version of the compiled function if multiple type specializations +were generated (for example, for both ``float32`` and ``float64`` versions +of the same function). + +The machine assembly code generated by LLVM can be dumped to the screen by +setting the :envvar:`NUMBA_DUMP_ASSEMBLY` environment variable to 1: + +.. 
code-block:: gas + + .globl __main__.add$1.int64.int64 + .align 16, 0x90 + .type __main__.add$1.int64.int64,@function + __main__.add$1.int64.int64: + addq %r8, %rcx + movq %rcx, (%rdi) + xorl %eax, %eax + retq + +The assembly output will also include the generated wrapper function that +translates the Python arguments to native data types. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/caching.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/caching.rst new file mode 100644 index 0000000000000000000000000000000000000000..29dad8f525c3068ddaecff0b88c91359e8772b8b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/caching.rst @@ -0,0 +1,111 @@ +.. _developer-caching: + +================ +Notes on Caching +================ + +Numba supports caching of compiled functions into the filesystem for future +use of the same functions. + + +The Implementation +================== + +Caching is done by saving the compiled *object code*, the ELF object of the +executable code. By using the *object code*, cached functions have minimal +overhead because no compilation is needed. The cached data is saved under the +cache directory (see :envvar:`NUMBA_CACHE_DIR`). The index of the cache is +stored in a ``.nbi`` file, with one index per function, and it lists all the +overloaded signatures compiled for the function. The *object code* is stored in +files with an ``.nbc`` extension, one file per overload. The data in both files +is serialized with :mod:`pickle`. + +.. note:: On Python <=3.7, Numba extends ``pickle`` using the pure-Python + pickler. To use the faster C Pickler, install ``pickle5`` + from ``pip``. ``pickle5`` backports Python 3.8 pickler features. + + +Requirements for Cacheability +----------------------------- + +Developers should note the requirements of a function to permit it to be cached +to ensure that the features they are working on are compatible with caching. 
+ +Requirements for a cacheable function: + +- The LLVM module must be *self-contained*, meaning that it cannot rely on + other compiled units without linking to them. +- The only allowed external symbols are from the + :ref:`NRT <arch-numba-runtime>` or other common symbols from system libraries + (i.e. libc and libm). + +Debugging note: + +- Look for the usage of ``inttoptr`` in the LLVM IR or + ``target_context.add_dynamic_addr()`` in the lowering code in Python. + They indicate potential usage of runtime addresses. Not all uses are + problematic and some are necessary. Only the conversion of constant integers + into pointers will affect caching. +- Misuse of dynamic address or dynamic symbols will likely result in a + segfault. +- Linking order matters because unused symbols are dropped after linking. + Linking should start from the leaf nodes of the dependency graph. + + +Features Compatible with Caching +-------------------------------- + +The following features are explicitly verified to work with caching. + +- ufuncs and gufuncs for the ``cpu`` and ``parallel`` targets +- parallel accelerator features (i.e. ``parallel=True``) + + +Caching Limitations +------------------- + +This is a list of known limitations of the cache: + +- Cache invalidation fails to recognize changes in symbols defined in a + different file. +- Global variables are treated as constants. The cache will remember the value + in the global variable used at compilation. On cache load, the cached + function will not rebind to the new value of the global variable. + + +.. _cache-sharing: + +Cache Sharing +------------- + +It is safe to share and reuse the contents in the cache directory on a +different machine. The cache remembers the CPU model and the available +CPU features during compilation. If the CPU model and the CPU features do +not match exactly, the cache contents will not be considered.
+(Also see :envvar:`NUMBA_CPU_NAME`) + +If the cache directory is shared on a network filesystem, concurrent +read/write of the cache is safe only if file replacement operation is atomic +for the filesystem. Numba always writes to a unique temporary file first, it +then replaces the target cache file path with the temporary file. Numba is +tolerant against lost cache files and lost cache entries. + +.. _cache-clearing: + +Cache Clearing +-------------- + +The cache is invalidated when the corresponding source file is modified. +However, it is necessary sometimes to clear the cache directory manually. +For instance, changes in the compiler will not be recognized because the source +files are not modified. + +To clear the cache, the cache directory can be simply removed. + +Removing the cache directory when a Numba application is running may cause an +``OSError`` exception to be raised at the compilation site. + +Related Environment Variables +----------------------------- + +See :ref:`env-vars for caching `. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/compiler_pass_example.py b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/compiler_pass_example.py new file mode 100644 index 0000000000000000000000000000000000000000..15a91bd0e50bbf0ee0dba2f912949421c87b4d61 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/compiler_pass_example.py @@ -0,0 +1,78 @@ +def ex_compiler_pass(): + + # magictoken.ex_compiler_pass.begin + from numba import njit + from numba.core import ir + from numba.core.compiler import CompilerBase, DefaultPassBuilder + from numba.core.compiler_machinery import FunctionPass, register_pass + from numba.core.untyped_passes import IRProcessing + from numbers import Number + + # Register this pass with the compiler framework, declare that it will not + # mutate the control flow graph and that it is not an analysis_only pass (it + # potentially mutates the IR). 
+ @register_pass(mutates_CFG=False, analysis_only=False) + class ConstsAddOne(FunctionPass): + _name = "consts_add_one" # the common name for the pass + + def __init__(self): + FunctionPass.__init__(self) + + # implement method to do the work, "state" is the internal compiler + # state from the CompilerBase instance. + def run_pass(self, state): + func_ir = state.func_ir # get the FunctionIR object + mutated = False # used to record whether this pass mutates the IR + # walk the blocks + for blk in func_ir.blocks.values(): + # find the assignment nodes in the block and walk them + for assgn in blk.find_insts(ir.Assign): + # if an assignment value is a ir.Consts + if isinstance(assgn.value, ir.Const): + const_val = assgn.value + # if the value of the ir.Const is a Number + if isinstance(const_val.value, Number): + # then add one! + const_val.value += 1 + mutated |= True + return mutated # return True if the IR was mutated, False if not. + # magictoken.ex_compiler_pass.end + + # magictoken.ex_compiler_defn.begin + class MyCompiler(CompilerBase): # custom compiler extends from CompilerBase + + def define_pipelines(self): + # define a new set of pipelines (just one in this case) and for ease + # base it on an existing pipeline from the DefaultPassBuilder, + # namely the "nopython" pipeline + pm = DefaultPassBuilder.define_nopython_pipeline(self.state) + # Add the new pass to run after IRProcessing + pm.add_pass_after(ConstsAddOne, IRProcessing) + # finalize + pm.finalize() + # return as an iterable, any number of pipelines may be defined! + return [pm] + # magictoken.ex_compiler_defn.end + + # magictoken.ex_compiler_call.begin + @njit(pipeline_class=MyCompiler) # JIT compile using the custom compiler + def foo(x): + a = 10 + b = 20.2 + c = x + a + b + return c + + print(foo(100)) # 100 + 10 + 20.2 (+ 1 + 1), extra + 1 + 1 from the rewrite! 
+ # magictoken.ex_compiler_call.end + + # magictoken.ex_compiler_timings.begin + compile_result = foo.overloads[foo.signatures[0]] + nopython_times = compile_result.metadata['pipeline_times']['nopython'] + for k in nopython_times.keys(): + if ConstsAddOne._name in k: + print(nopython_times[k]) + # magictoken.ex_compiler_timings.end + + assert foo(100) == 132.2 + +ex_compiler_pass() diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/contributing.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/contributing.rst new file mode 100644 index 0000000000000000000000000000000000000000..9ec5f9d9ffcefd04d19c0d44b85338bdbf079534 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/contributing.rst @@ -0,0 +1,491 @@ + +Contributing to Numba +===================== + +We welcome people who want to make contributions to Numba, big or small! +Even simple documentation improvements are encouraged. If you have +questions, don't hesitate to ask them (see below). + + +Communication +------------- + +Real-time Chat +'''''''''''''' + +Numba uses Gitter for public real-time chat. To help improve the +signal-to-noise ratio, we have two channels: + +* `numba/numba `_: General Numba discussion, + questions, and debugging help. +* `numba/numba-dev `_: Discussion of PRs, + planning, release coordination, etc. + +Both channels are public, but we may ask that discussions on numba-dev move to +the numba channel. This is simply to ensure that numba-dev is easy for core +developers to keep up with. + +Note that the Github issue tracker is the best place to report bugs. Bug +reports in chat are difficult to track and likely to be lost. + +Forum +..... + +Numba uses Discourse as a forum for longer running threads such as design +discussions and roadmap planning. There are various categories available and it +can be reached at: `numba.discourse.group `_. 
+ +Weekly Meetings +''''''''''''''' + +The core Numba developers have a weekly video conference to discuss roadmap, +feature planning, and outstanding issues. These meetings are entirely public, +details are posted on +`numba.discourse.group Announcements `_ +and everyone is welcome to join the discussion. Minutes will be taken and will +be posted to the +`Numba wiki `_. + +.. _report-numba-bugs: + +Bug tracker +'''''''''''' + +We use the `Github issue tracker `_ +to track both bug reports and feature requests. If you report an issue, +please include specifics: + +* what you are trying to do; +* which operating system you have and which version of Numba you are running; +* how Numba is misbehaving, e.g. the full error traceback, or the unexpected + results you are getting; +* as far as possible, a code snippet that allows full reproduction of your + problem. + +Getting set up +-------------- + +If you want to contribute, we recommend you fork our `Github repository +`_, then create a branch representing +your work. When your work is ready, you should submit it as a pull +request from the Github interface. + +If you want, you can submit a pull request even when you haven't finished +working. This can be useful to gather feedback, or to stress your changes +against the :ref:`continuous integration <continuous_integration_testing>` +platform. In this case, please prepend ``[WIP]`` to your pull request's title. + +.. _buildenv: + +Build environment +''''''''''''''''' + +Numba has a number of dependencies (mostly `NumPy `_ and +`llvmlite `_) with non-trivial build +instructions. Unless you want to build those dependencies yourself, we +recommend you use `conda `_ to create a +dedicated development environment and install precompiled versions of those +dependencies there. Read more about the Numba dependencies here: +`numba-source-install-check`. + +When working with a source checkout of Numba you will also need a development +build of llvmlite. 
These are available from the ``numba/label/dev`` channel on +`anaconda.org `_. + + +Then, to create an environment with a few of the most common dependencies:: + + $ conda create -n numbaenv python=3.10 numba/label/dev::llvmlite numpy scipy jinja2 cffi + +.. note:: + This installs an environment based on Python 3.10, but you can of course + choose another version supported by Numba. To test additional features, + you may also need to install ``tbb`` and/or ``llvm-openmp``. Check the + dependency list above for details. + +To activate the environment for the current shell session:: + + $ conda activate numbaenv + +.. note:: + These instructions are for a standard Linux shell. You may need to + adapt them for other platforms. + +Once the environment is activated, you have a dedicated Python with the +required dependencies:: + + $ python + Python 3.10.3 (main, Mar 28 2022, 04:26:28) [Clang 12.0.0 ] on darwin + Type "help", "copyright", "credits" or "license" for more information. + + >>> import llvmlite + >>> llvmlite.__version__ + 0.39.0dev0+61.gf27ac6f + + +Building Numba +'''''''''''''' + +For a convenient development workflow, we recommend you build Numba inside +its source checkout:: + + $ git clone git@github.com:numba/numba.git + $ cd numba + $ python setup.py build_ext --inplace + +This assumes you have a working C compiler and runtime on your development +system. You will have to run this command again whenever you modify +C files inside the Numba source tree. + +The ``build_ext`` command in Numba's setup also accepts the following +arguments: + +- ``--noopt``: This disables optimization when compiling Numba's CPython + extensions, which makes debugging them much easier. Recommended in + conjunction with the standard ``build_ext`` option ``--debug``. +- ``--werror``: Compiles Numba's CPython extensions with the ``-Werror`` flag. +- ``--wall``: Compiles Numba's CPython extensions with the ``-Wall`` flag. 
+ +Note that Numba's CI and the conda recipe for Linux build with the ``--werror`` +and ``--wall`` flags, so any contributions that change the CPython extensions +should be tested with these flags too. + +Running tests +''''''''''''' + +Numba is validated using a test suite comprised of various kinds of tests +(unit tests, functional tests). The test suite is written using the +standard :py:mod:`unittest` framework. + +The tests can be executed via ``python -m numba.runtests``. If you are +running Numba from a source checkout, you can type ``./runtests.py`` +as a shortcut. Various options are supported to influence test running +and reporting. Pass ``-h`` or ``--help`` to get a glimpse at those options. +Examples: + +* to list all available tests:: + + $ python -m numba.runtests -l + +* to list tests from a specific (sub-)suite:: + + $ python -m numba.runtests -l numba.tests.test_usecases + +* to run those tests:: + + $ python -m numba.runtests numba.tests.test_usecases + +* to run all tests in parallel, using multiple sub-processes:: + + $ python -m numba.runtests -m + +* For a detailed list of all options:: + + $ python -m numba.runtests -h + +The numba test suite can take a long time to complete. When you want to avoid +the long wait, it is useful to focus on the failing tests first with the +following test runner options: + +* The ``--failed-first`` option is added to capture the list of failed tests + and to re-execute them first:: + + $ python -m numba.runtests --failed-first -m -v -b + +* The ``--last-failed`` option is used with ``--failed-first`` to execute + the previously failed tests only:: + + $ python -m numba.runtests --last-failed -m -v -b + +When debugging, it is useful to turn on logging. Numba logs using the +standard ``logging`` module. One can use the standard ways (i.e. +``logging.basicConfig``) to configure the logging behavior. 
To enable logging +in the test runner, there is a ``--log`` flag for convenience:: + + $ python -m numba.runtests --log + +To enable :ref:`runtime type-checking <type_anno_check>`, set the environment +variable ``NUMBA_USE_TYPEGUARD=1`` and use `runtests.py` from the source root +instead. For example:: + + $ NUMBA_USE_TYPEGUARD=1 python runtests.py + + +Running coverage +'''''''''''''''' + +Coverage reports can be produced using `coverage.py +`_. To record coverage +info for the test suite, run:: + + coverage run -m numba.runtests + +Next, combine coverage files (potentially for multiple runs) with:: + + coverage combine + +The combined output can be transformed into various report formats - see the +`coverage CLI usage reference +`_. +For example, to produce an HTML report, run:: + + coverage html + +Following this command, the report can be viewed by opening ``htmlcov/index.html``. + + +Development rules +----------------- + +Code reviews +'''''''''''' + +Any non-trivial change should go through a code review by one or several of +the core developers. The recommended process is to submit a pull request +on github. + +A code review should try to assess the following criteria: + +* general design and correctness +* code structure and maintainability +* coding conventions +* docstrings, comments +* test coverage + +Coding conventions +'''''''''''''''''' + +All Python code should follow :pep:`8`. Our C code doesn't have a +well-defined coding style (would it be nice to follow :pep:`7`?). +Code and documentation should generally fit within 80 columns, for +maximum readability with all existing tools (such as code review UIs). + +Numba uses `Flake8 `_ to ensure a consistent +Python code format throughout the project. ``flake8`` can be installed +with ``pip`` or ``conda`` and then run from the root of the Numba repository:: + + flake8 numba + +Optionally, you may wish to set up `pre-commit hooks `_ +to automatically run ``flake8`` when you make a git commit. 
This can be +done by installing ``pre-commit``:: + + pip install pre-commit + +and then running:: + + pre-commit install + +from the root of the Numba repository. Now ``flake8`` will be run each time +you commit changes. You can skip this check with ``git commit --no-verify``. + +Numba has started the process of using `type hints `_ in its code base. This +will be a gradual process of extending the number of files that use type hints, as well as going from voluntary to +mandatory type hints for new features. `Mypy `_ is used for automated static checking. + +At the moment, only certain files are checked by mypy. The list can be found in ``mypy.ini``. When making changes to +those files, it is necessary to add the required type hints such that mypy tests will pass. Only in exceptional +circumstances should ``type: ignore`` comments be used. + +If you are contributing a new feature, we encourage you to use type hints, even if the file is not currently in the +checklist. If you want to contribute type hints to enable a new file to be in the checklist, please add the file to the +``files`` variable in ``mypy.ini``, and decide what level of compliance you are targeting. Level 3 is basic static +checks, while levels 2 and 1 represent stricter checking. The levels are described in detail in ``mypy.ini``. + +There is potential for confusion between the Numba module ``typing`` and the Python built-in module ``typing`` used for type +hints, as well as between Numba types---such as ``Dict`` or ``Literal``---and ``typing`` types of the same name. +To mitigate the risk of confusion we use a naming convention by which objects of the built-in ``typing`` module are +imported with a ``pt`` prefix. For example, ``typing.Dict`` is imported as ``from typing import Dict as ptDict``. + +Stability +''''''''' + +The repository's ``main`` branch is expected to be stable at all times. 
+This translates into the fact that the test suite passes without errors +on all supported platforms (see below). This also means that a pull request +needs to pass the test suite before it is merged in. + +.. _platform_support: + +Platform support +'''''''''''''''' + +Every commit to the main branch is automatically tested on all of the +platforms Numba supports. This includes ARMv8, POWER8, and NVIDIA GPUs. +The build system however is internal to Anaconda, so we also use +`Azure `_ to provide public continuous +integration information for as many combinations as can be supported by the +service. Azure CI automatically tests all pull requests on Windows, OS X and +Linux, as well as a sampling of different Python and NumPy versions. If you see +problems on platforms you are unfamiliar with, feel free to ask for help in your +pull request. The Numba core developers can help diagnose cross-platform +compatibility issues. Also see the :ref:`continuous integration +<continuous_integration_testing>` section on how public CI is implemented. + +.. _continuous_integration_testing: + +Continuous integration testing +'''''''''''''''''''''''''''''' + +The Numba test suite causes CI systems a lot of grief: + +#. It's huge, 9000+ tests. +#. In part because of 1. and that compilers are pretty involved, the test suite + takes a long time to run. +#. There are sections of the test suite that are deliberately designed to stress + systems almost to the point of failure (tests which concurrently compile and + execute with threads and fork processes etc). +#. The combination of things that Numba has to test well exceeds the capacity of + any public CI system, (Python versions x NumPy versions x Operating systems + x Architectures x feature libraries (e.g. SVML) x threading backends + (e.g. OpenMP, TBB)) and then there's CUDA too and all its version + variants. + +As a result of the above, public CI is implemented as follows: + +#. 
The combination of OS x Python x NumPy x Various Features in the testing + matrix is designed to give a good indicative result for whether "this pull + request is probably ok". +#. When public CI runs it: + + #. Looks for files that contain tests that have been altered by the proposed + change and runs these on the whole testing matrix. + #. Runs a subset of the test suite on each part of the testing matrix. i.e. + slice the test suite up by the number of combinations in the testing + matrix and each combination runs one chunk. This is done for speed, + because public CI cannot cope with the load else. + +If a Pull Request (PR) changes CUDA code or will affect the CUDA target, it +needs to be run on `gpuCI `_. +This can be triggered by one of the Numba maintainers commenting ``run gpuCI +tests`` on the PR discussion. This runs the CUDA testsuite with various CUDA +toolkit versions on Linux, to provide some initial confidence in the +correctness of the changes with respect to CUDA. Following approval, the PR +will also be run on Numba's build farm to test other configurations with CUDA +(including Windows, which is not tested by gpuCI). + +If the PR is not CUDA-related but makes changes to something that the core +developers consider risky, then it will also be run on the Numba farm just to +make sure. The Numba project's private build and test farm will actually +exercise all the applicable tests on all the combinations noted above on real +hardware! + + +.. _type_anno_check: + +Type annotation and runtime type checking +''''''''''''''''''''''''''''''''''''''''' + +Numba is slowly gaining type annotations. To facilitate the review of pull +requests that are incrementally adding type annotations, the test suite uses +`typeguard`_ to perform runtime type checking. This helps verify the validity +of type annotations. 
+ +To enable runtime type checking in the test suite, users can use +`runtests.py`_ in the source root as the test runner and set environment +variable ``NUMBA_USE_TYPEGUARD=1``. For example:: + + $ NUMBA_USE_TYPEGUARD=1 python runtests.py numba.tests + +Things that help with pull requests +''''''''''''''''''''''''''''''''''' + +Even with the mitigating design above public CI can get overloaded which causes +a backlog of builds. It's therefore really helpful when opening pull requests if +you can limit the frequency of pushing changes. Ideally, please squash commits +to reduce the number of patches and/or push as infrequently as possible. Also, +once a pull request review has started, please don't rebase/force push/squash +or do anything that rewrites history of the reviewed code as GitHub cannot track +this and it makes it very hard for reviewers to see what has changed. + +The core developers thank everyone for their cooperation with the above! + +Why is my pull request/issue seemingly being ignored? +''''''''''''''''''''''''''''''''''''''''''''''''''''' + +Numba is an open source project and like many similar projects it has limited +resources. As a result, it is unfortunately necessary for the core developers to +associate a priority with issues/pull requests (PR). A great way to move your +issue/PR up the priority queue is to help out somewhere else in the project so +as to free up core developer time. Examples of ways to help: + +* Perform an initial review on a PR. This often doesn't require compiler + engineering knowledge and just involves checking that the proposed patch is of + good quality, fixes the problem/implements the feature, is well tested and + documented. +* Debug an issue, there are numerous issues which `"need triage" `_ + which essentially involves debugging the reported problem. Even if you cannot + get right to the bottom of a problem, leaving notes about what was discovered + for someone else is also helpful. 
+* Answer questions/provide help for users on `discourse `_ + and/or `gitter.im `_. + +The core developers thank everyone for their understanding with the above! + +Documentation +------------- + +The Numba documentation is split over two repositories: + +* This documentation is in the ``docs`` directory inside the + `Numba repository `_. + +* The `Numba homepage `_ has its sources in a + separate repository at https://github.com/numba/numba-webpage + + +Main documentation +'''''''''''''''''' + +This documentation is under the ``docs`` directory of the `Numba repository`_. +It is built with `Sphinx `_ and +`numpydoc `_, which are available using +conda or pip; i.e. ``conda install sphinx numpydoc``. + +To build the documentation, you need the bootstrap theme:: + + $ pip install sphinx_bootstrap_theme + +You can edit the source files under ``docs/source/``, after which you can +build and check the documentation:: + + $ make html + $ open _build/html/index.html + +Core developers can upload this documentation to the Numba website +at https://numba.pydata.org by using the ``gh-pages.py`` script under ``docs``:: + + $ python gh-pages.py version # version can be 'dev' or '0.16' etc + +then verify the repository under the ``gh-pages`` directory and use +``git push``. + +Web site homepage +''''''''''''''''' + +The Numba homepage on https://numba.pydata.org can be fetched from here: +https://github.com/numba/numba-webpage + +After pushing documentation to a new version, core developers will want to +update the website. Some notable files: + +* ``index.rst`` # Update main page +* ``_templates/sidebar_versions.html`` # Update sidebar links +* ``doc.rst`` # Update after adding a new version for numba docs +* ``download.rst`` # Update after uploading new numba version to pypi + +After updating run:: + + $ make html + +and check out ``_build/html/index.html``. 
To push updates to the Web site:: + + $ python _scripts/gh-pages.py + +then verify the repository under the ``gh-pages`` directory. Make sure the +``CNAME`` file is present and contains a single line for ``numba.pydata.org``. +Finally, use ``git push`` to update the website. + + +.. _typeguard: https://typeguard.readthedocs.io/en/latest/ +.. _runtests.py: https://github.com/numba/numba/blob/main/runtests.py diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/custom_pipeline.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/custom_pipeline.rst new file mode 100644 index 0000000000000000000000000000000000000000..e670acf29ddfd52e7231ff8ce4e8a7d953a31b1d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/custom_pipeline.rst @@ -0,0 +1,173 @@ +.. _arch-pipeline: + +======================== +Customizing the Compiler +======================== + +.. warning:: The custom pipeline feature is for expert use only. Modifying + the compiler behavior can invalidate internal assumptions in the + numba source code. + + +For library developers looking for a way to extend or modify the compiler +behavior, you can do so by defining a custom compiler by inheriting from +``numba.compiler.CompilerBase``. The default Numba compiler is defined +as ``numba.compiler.Compiler``, implementing the ``.define_pipelines()`` +method, which adds the *nopython-mode*, *object-mode* and *interpreted-mode* +pipelines. For convenience these three pipelines are defined in +``numba.compiler.DefaultPassBuilder`` by the methods: + +* ``.define_nopython_pipeline()`` +* ``.define_objectmode_pipeline()`` +* ``.define_interpreted_pipeline()`` + +respectively. + +To use a custom subclass of ``CompilerBase``, supply it as the +``pipeline_class`` keyword argument to the ``@jit`` and ``@generated_jit`` +decorators. By doing so, the effect of the custom pipeline is limited to the +function being decorated. 
+ +Implementing a compiler pass +---------------------------- + +Numba makes it possible to implement a new compiler pass and does so through the +use of an API similar to that of LLVM. The following demonstrates the basic +process involved. + + +Compiler pass classes +##################### + +All passes must inherit from ``numba.compiler_machinery.CompilerPass``, commonly +used subclasses are: + +* ``numba.compiler_machinery.FunctionPass`` for describing a pass that operates + on a function-at-once level and may mutate the IR state. +* ``numba.compiler_machinery.AnalysisPass`` for describing a pass that performs + analysis only. +* ``numba.compiler_machinery.LoweringPass`` for describing a pass that performs + lowering only. + +In this example a new compiler pass will be implemented that will rewrite all +``ir.Const(x)`` nodes, where ``x`` is a subclass of ``numbers.Number``, such +that the value of x is incremented by one. There is no use for this pass other +than to serve as a pedagogical vehicle! + +The ``numba.compiler_machinery.FunctionPass`` is appropriate for the suggested +pass behavior and so is the base class of the new pass. Further, a ``run_pass`` +method is defined to do the work (this method is abstract, all compiler passes +must implement it). + +First the new class: + +.. literalinclude:: compiler_pass_example.py + :language: python + :dedent: 4 + :start-after: magictoken.ex_compiler_pass.begin + :end-before: magictoken.ex_compiler_pass.end + + +Note also that the class must be registered with Numba's compiler machinery +using ``@register_pass``. This in part is to allow the declaration of whether +the pass mutates the control flow graph and whether it is an analysis only pass. + +Next, define a new compiler based on the existing +``numba.compiler.CompilerBase``. The compiler pipeline is defined through the +use of an existing pipeline and the new pass declared above is added to be run +after the ``IRProcessing`` pass. + + +.. 
literalinclude:: compiler_pass_example.py + :language: python + :dedent: 4 + :start-after: magictoken.ex_compiler_defn.begin + :end-before: magictoken.ex_compiler_defn.end + +Finally update the ``@njit`` decorator at the call site to make use of the newly +defined compilation pipeline. + +.. literalinclude:: compiler_pass_example.py + :language: python + :dedent: 4 + :start-after: magictoken.ex_compiler_call.begin + :end-before: magictoken.ex_compiler_call.end + +Debugging compiler passes +------------------------- + +Observing IR Changes +#################### + +It is often useful to be able to see the changes a pass makes to the IR. Numba +conveniently permits this through the use of the environment variable +:envvar:`NUMBA_DEBUG_PRINT_AFTER`. In the case of the above pass, running the +example code with ``NUMBA_DEBUG_PRINT_AFTER="ir_processing,consts_add_one"`` +gives: + + +.. code-block:: none + :emphasize-lines: 4, 7, 24, 27 + + ----------------------------nopython: ir_processing----------------------------- + label 0: + x = arg(0, name=x) ['x'] + $const0.1 = const(int, 10) ['$const0.1'] + a = $const0.1 ['$const0.1', 'a'] + del $const0.1 [] + $const0.2 = const(float, 20.2) ['$const0.2'] + b = $const0.2 ['$const0.2', 'b'] + del $const0.2 [] + $0.5 = x + a ['$0.5', 'a', 'x'] + del x [] + del a [] + $0.7 = $0.5 + b ['$0.5', '$0.7', 'b'] + del b [] + del $0.5 [] + c = $0.7 ['$0.7', 'c'] + del $0.7 [] + $0.9 = cast(value=c) ['$0.9', 'c'] + del c [] + return $0.9 ['$0.9'] + ----------------------------nopython: consts_add_one---------------------------- + label 0: + x = arg(0, name=x) ['x'] + $const0.1 = const(int, 11) ['$const0.1'] + a = $const0.1 ['$const0.1', 'a'] + del $const0.1 [] + $const0.2 = const(float, 21.2) ['$const0.2'] + b = $const0.2 ['$const0.2', 'b'] + del $const0.2 [] + $0.5 = x + a ['$0.5', 'a', 'x'] + del x [] + del a [] + $0.7 = $0.5 + b ['$0.5', '$0.7', 'b'] + del b [] + del $0.5 [] + c = $0.7 ['$0.7', 'c'] + del $0.7 [] + $0.9 = cast(value=c) 
['$0.9', 'c'] + del c [] + return $0.9 ['$0.9'] + +Note the change in the values in the ``const`` nodes. + +Pass execution times +#################### + +Numba has built-in support for timing all compiler passes, the execution times +are stored in the metadata associated with a compilation result. This +demonstrates one way of accessing this information based on the previously +defined function, ``foo``: + +.. literalinclude:: compiler_pass_example.py + :language: python + :dedent: 4 + :start-after: magictoken.ex_compiler_timings.begin + :end-before: magictoken.ex_compiler_timings.end + +the output of which is, for example:: + + pass_timings(init=1.914000677061267e-06, run=4.308700044930447e-05, finalize=1.7400006981915794e-06) + +this displaying the pass initialization, run and finalization times in seconds. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/debugging.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/debugging.rst new file mode 100644 index 0000000000000000000000000000000000000000..544e9a6eb333b5275a6b7620148cdcff47b2fda5 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/debugging.rst @@ -0,0 +1,138 @@ +.. _developer-debugging: + +================== +Notes on Debugging +================== + +This section describes techniques that can be useful in debugging the +compilation and execution of generated code. + +.. seealso:: + :ref:`debugging-jit-compiled-code` + + +Memcheck +-------- + +Memcheck_ is a memory error detector implemented using Valgrind_. It is useful +for detecting memory errors in compiled code, particularly out-of-bounds +accesses and use-after-free errors. Buggy or miscompiled native code can +generate these kinds of errors. The `Memcheck documentation +`_ explains its usage; here, we +discuss only the specifics of using it with Numba. + +.. _Memcheck: https://valgrind.org/docs/manual/mc-manual.html +.. 
_Valgrind: https://valgrind.org/ + +The Python interpreter and some of the libraries used by Numba can generate +false positives with Memcheck - see `this section of the manual +`_ for more +information on why false positives occur. The false positives can make it +difficult to determine when an actual error has occurred, so it is helpful to +suppress known false positives. This can be done by supplying a suppressions +file, which instructs Memcheck to ignore errors that match the suppressions +defined in it. + +The CPython source distribution includes a suppressions file, in the file +``Misc/valgrind-python.supp``. Using this file prevents a lot of spurious errors +generated by Python's memory allocation implementation. Additionally, the Numba +repository includes a suppressions file in ``contrib/valgrind-numba.supp``. + +.. note:: It is important to use the suppressions files from the versions of the + Python interpreter and Numba that you are using - these files evolve over + time, so non-current versions can fail to suppress some errors, or + erroneously suppress actual errors. + +To run the Python interpreter under Memcheck with both suppressions +files, it is invoked with the following command:: + + valgrind --tool=memcheck \ + --suppressions=${CPYTHON_SRC_DIR}/Misc/valgrind-python.supp \ + --suppressions=${NUMBA_SRC_DIR}/contrib/valgrind-numba.supp \ + python ${PYTHON_ARGS} + +where ``${CPYTHON_SRC_DIR}`` is set to the location of the CPython source +distribution, ``${NUMBA_SRC_DIR}`` is the location of the Numba source dir, and +``${PYTHON_ARGS}`` are the arguments to the Python interpreter. + +If there are errors, then messages describing them will be printed to standard +error. 
An example of an error is:: + + ==77113== at 0x24169A: PyLong_FromLong (longobject.c:251) + ==77113== by 0x241881: striter_next (bytesobject.c:3084) + ==77113== by 0x2D3C95: _PyEval_EvalFrameDefault (ceval.c:2809) + ==77113== by 0x21B499: _PyEval_EvalCodeWithName (ceval.c:3930) + ==77113== by 0x26B436: _PyFunction_FastCallKeywords (call.c:433) + ==77113== by 0x2D3605: call_function (ceval.c:4616) + ==77113== by 0x2D3605: _PyEval_EvalFrameDefault (ceval.c:3124) + ==77113== by 0x21B977: _PyEval_EvalCodeWithName (ceval.c:3930) + ==77113== by 0x21C2A4: _PyFunction_FastCallDict (call.c:376) + ==77113== by 0x2D5129: do_call_core (ceval.c:4645) + ==77113== by 0x2D5129: _PyEval_EvalFrameDefault (ceval.c:3191) + ==77113== by 0x21B499: _PyEval_EvalCodeWithName (ceval.c:3930) + ==77113== by 0x26B436: _PyFunction_FastCallKeywords (call.c:433) + ==77113== by 0x2D46DA: call_function (ceval.c:4616) + ==77113== by 0x2D46DA: _PyEval_EvalFrameDefault (ceval.c:3139) + ==77113== + ==77113== Use of uninitialised value of size 8 + +The traceback provided only outlines the C call stack, which can make it +difficult to determine what the Python interpreter was doing at the time of the +error. One can learn more about the state of the stack by looking at the +backtrace in the `GNU Debugger (GDB) `_. +Launch ``valgrind`` with an additional argument, ``--vgdb-error=0`` and attach +to the process using GDB as instructed by the output. Once an error is +encountered, GDB will stop at the error and the stack can be inspected. + +GDB does provide support for backtracing through the Python stack, but this +requires symbols which may not be easily available in your Python distribution. +In this case, it is still possible to determine some information about what was +happening in Python, but this depends on examining the backtrace closely. For +example, in a backtrace corresponding to the above error, we see items such as: + +.. 
code-block:: + + #18 0x00000000002722da in slot_tp_call ( + self=<_wrap_impl(_callable=<_wrap_missing_loc(func=) at remote 0x1d200bd0>, _imp=, + _context=, , , , , , , , , , , , , , , , , , , , , ], attributes=[, , , , , + , identified_types={}) at remote + 0xbb5add0>, name='cuconstRecAlign$7', + data_layout='e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64', + scope=, triple='nvptx64-nvidia-cuda', + globals={'_ZN08NumbaEnv5numba4cuda5tests6cudapy13test_constmem19cuconstRecAlign$247E5ArrayIdLi1E1C7mutable7ali...(truncated), + kwds=0x0) + +We can see some of the arguments, in particular the names of the compiled functions, e.g:: + + _ZN5numba4cuda5tests6cudapy13test_constmem19cuconstRecAlign$247E5ArrayIdLi1E1C7mutable7alignedE5ArrayIdLi1E1C7mutable7alignedE5ArrayIdLi1E1C7mutable7alignedE5ArrayIdLi1E1C7mutable7alignedE5ArrayIdLi1E1C7mutable7alignedE + +We can run this through ``c++filt`` to see a more human-readable representation:: + + numba::cuda::tests::cudapy::test_constmem::cuconstRecAlign$247( + Array, + Array, + Array, + Array, + Array) + +which is the fully qualified name of a jitted function and the types with which +it was called. + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/dispatching.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/dispatching.rst new file mode 100644 index 0000000000000000000000000000000000000000..ca50e6d66141187c279c0e31ba11aa2e76dc7160 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/dispatching.rst @@ -0,0 +1,267 @@ + +======================= +Polymorphic dispatching +======================= + +Functions compiled using :func:`~numba.jit` or :func:`~numba.vectorize` +are open-ended: they can be called with many different input types and +have to select (possibly compile on-the-fly) the right low-level +specialization. 
We hereby explain how this mechanism is implemented. + + +Requirements +============ + +JIT-compiled functions can take several arguments and each of them is +taken into account when selecting a specialization. Thus it is a +form of multiple dispatch, more complex than single dispatch. + +Each argument weighs in the selection based on its :ref:`Numba type +`. Numba types are often more granular than Python types: +for example, Numba types Numpy arrays differently depending on their +dimensionality and their layout (C-contiguous, etc.). + +Once a Numba type is inferred for each argument, a specialization must +be chosen amongst the available ones; or, if no suitable specialization +is found, a new one must be compiled. This is not a trivial decision: +there can be multiple specializations compatible with a given concrete +signature (for example, say a two-argument function has compiled +specializations for ``(float64, float64)`` and ``(complex64, complex64)``, +and it is called with ``(float32, float32)``). + +Therefore, there are two crucial steps in the dispatch mechanism: + +1. infer the Numba types of the concrete arguments +2. select the best available specialization (or choose to compile a new one) + for the inferred Numba types + +Compile-time vs. run-time +------------------------- + +This document discusses dispatching when it is done at runtime, i.e. +when a JIT-compiled function is called from pure Python. In that context, +performance is important. To stay in the realm of normal function call +overhead in Python, the overhead of dispatching should stay under a +microsecond. Of course, *the faster the better*... + +When a JIT-compiled function is called from another JIT-compiled +function (in :term:`nopython mode`), the polymorphism is resolved at +compile-time, using a non-performance critical mechanism, bearing zero +runtime performance overhead. + +.. note:: + In practice, the performance-critical parts described here are coded in C. 
+ + +Type resolution +=============== + +The first step is therefore to infer, at call-time, a Numba type for each +of the function's concrete arguments. Given the finer granularity of +Numba types compared to Python types, one cannot simply look up an object's +class and key a dictionary with it to obtain the corresponding Numba type. + +Instead, there is a machinery to inspect the object and, based on its +Python type, query various properties to infer the appropriate Numba +type. This can be more or less complex: for example, a Python ``int`` +argument will always infer to a Numba ``intp`` (a pointer-sized integer), +but a Python ``tuple`` argument can infer to multiple Numba types (depending +on the tuple's size and the concrete type of each of its elements). + +The Numba type system is high-level and written in pure Python; there is +a pure Python machinery, based on a generic function, to do said inference +(in :mod:`numba.typing.typeof`). That machinery is used for compile-time +inference, e.g. on constants. Unfortunately, it is too slow for run-time +value-based dispatching. It is only used as a fallback for rarely used +(or difficult to infer) types, and exhibits multiple-microsecond overhead. + +Typecodes +--------- + +The Numba type system is really too high-level to be manipulated efficiently +from C code. Therefore, the C dispatching layer uses another representation +based on integer typecodes. Each Numba type gets a unique integer typecode +when constructed; also, an interning system ensures no two instances of the same +type are created. The dispatching layer is therefore able to *eschew* +the overhead of the Numba type system by working with simple integer +typecodes, amenable to well-known optimizations (fast hash tables, etc.). + +The goal of the type resolution step becomes: infer a Numba *typecode* +for each of the function's concrete arguments. Ideally, it doesn't deal +with Numba types anymore... 
+ +Hard-coded fast paths +--------------------- + +While eschewing the abstraction and object-orientation overhead of the type +system, the integer typecodes still have the same conceptual complexity. +Therefore, an important technique to speed up inference is to first go +through checks for the most important types, and hard-code a fast resolution +for each of them. + +Several types benefit from such an optimization, notably: + +* basic Python scalars (``bool``, ``int``, ``float``, ``complex``); +* basic Numpy scalars (the various kinds of integer, floating-point, + complex numbers); +* Numpy arrays of certain dimensionalities and basic element types. + +Each of those fast paths ideally uses a hard-coded result value or a direct +table lookup after a few simple checks. + +However, we can't apply that technique to all argument types; there would +be an explosion of ad-hoc internal caches, and it would become difficult to +maintain. Besides, the recursive application of hard-coded fast paths +would not necessarily combine into a low overhead (in the nested tuple +case, for example). + +Fingerprint-based typecode cache +-------------------------------- + +For not-so-trivial types (imagine a tuple, or a Numpy ``datetime64`` array, +for example), the hard-coded fast paths don't match. Another mechanism +then kicks in, more generic. + +The principle here is to examine each argument value, as the pure Python +machinery would do, and to describe its Numba type unambiguously. The +difference is that *we don't actually compute a Numba type*. Instead, we +compute a simple bytestring, a low-level possible denotation of that +Numba type: a *fingerprint*. The fingerprint format is designed to be +short and extremely simple to compute from C code (in practice, it has +a bytecode-like format). + +Once the fingerprint is computed, it is looked up in a cache mapping +fingerprints to typecodes. 
The cache is a hash table, and the lookup +is fast thanks to the fingerprints being generally very short (rarely +more than 20 bytes). + +If the cache lookup fails, the typecode must first be computed using the +slow pure Python machinery. Luckily, this would only happen once: on +subsequent calls, the cached typecode would be returned for the given +fingerprint. + +In rare cases, a fingerprint cannot be computed efficiently. This is +the case for some types which cannot be easily inspected from C: for +example ``cffi`` function pointers. Then, the slow Pure Python machinery +is invoked at each function call with such an argument. + +.. note:: + Two fingerprints may denote a single Numba type. This does not make + the mechanism incorrect; it only creates more cache entries. + + +Summary +------- + +Type resolution of a function argument involves the following mechanisms +in order: + +* Try a few hard-coded fast paths, for common simple types. +* If the above failed, compute a fingerprint for the argument and lookup + its typecode in a cache. +* If all the above failed, invoke the pure Python machinery which will + determine a Numba type for the argument (and look up its typecode). + + +Specialization selection +======================== + +At the previous step, an integer typecode has been determined for each +concrete argument to the JIT-compiled function. Now it remains to match +that concrete signature against each of the available specializations for +the function. There can be three outcomes: + +* There is a satisfying best match: the corresponding specialization + is then invoked (it will handle argument unboxing and other details). +* There is a tie between two or more "best matches": an exception is raised, + refusing to solve the ambiguity. +* There is no satisfying match: a new specialization is compiled tailored + for the concrete argument types that were inferred. 
+ +The selection works by looping over all available specializations, and +computing the compatibility of each concrete argument type with the +corresponding type in the specialization's intended signature. Specifically, +we are interested in: + +1. Whether the concrete argument type is allowed to convert implicitly to + the specialization's argument type; +2. If so, at what semantic (user-visible) cost the conversion comes. + +Implicit conversion rules +------------------------- + +There are five possible kinds of implicit conversion from a source type +to a destination type (note this is an asymmetric relationship): + +1. *exact match*: the two types are identical; this is the ideal case, + since the specialization would behave exactly as intended; +2. *same-kind promotion*: the two types belong to the same "kind" (for + example ``int32`` and ``int64`` are two integer types), and the source + type can be converted losslessly to the destination type (e.g. from + ``int32`` to ``int64``, but not the reverse); +3. *safe conversion*: the two types belong to different kinds, but the + source type can be reasonably converted to the destination type + (e.g. from ``int32`` to ``float64``, but not the reverse); +4. *unsafe conversion*: a conversion is available from the source type + to the destination type, but it may lose precision, magnitude, or + another desirable quality. +5. *no conversion*: there is no correct or reasonably efficient way to + convert between the two types (for example between an ``int64`` and a + ``datetime64``, or a C-contiguous array and a Fortran-contiguous array). + +When a specialization is examined, the latter two cases eliminate it from +the final choice: i.e. when at least one argument has *no conversion* or +only an *unsafe conversion* to the signature's argument type. + +.. 
note:: + However, if the function is compiled with explicit signatures + in the :func:`~numba.jit` call (and therefore it is not allowed to compile + new specializations), *unsafe conversion* is allowed. + +Candidates and best match +------------------------- + +If a specialization is not eliminated by the rule above, it enters the +list of *candidates* for the final choice. Those candidates are ranked +by an ordered 4-uple of integers: ``(number of unsafe conversions, +number of safe conversions, number of same-kind promotions, number of +exact matches)`` (note the sum of the tuple's elements is equal to the +number of arguments). The best match is then the #1 result in sorted +ascending order, thereby preferring exact matches over promotions, +promotions over safe conversions, safe conversions over unsafe conversions. + +Implementation +-------------- + +The above-described mechanism works on integer typecodes, not on Numba +types. It uses an internal hash table storing the possible conversion +kind for each pair of compatible types. The internal hash table is in part +built at startup (for built-in trivial types such as ``int32``, ``int64`` +etc.), in part filled dynamically (for arbitrarily complex types such +as array types: for example to allow using a C-contiguous 2D array where +a function expects a non-contiguous 2D array). + +Summary +------- + +Selecting the right specialization involves the following steps: + +* Examine each available specialization and match it against the concrete + argument types. +* Eliminate any specialization where at least one argument doesn't offer + sufficient compatibility. +* If there are remaining candidates, choose the best one in terms of + preserving the types' semantics. + + +Miscellaneous +============= + +Some `benchmarks of dispatch performance +`_ +exist in the `Numba benchmarks `_ +repository. 
+ +Some unit tests of specific aspects of the machinery are available +in :mod:`numba.tests.test_typeinfer` and :mod:`numba.tests.test_typeof`. +Higher-level dispatching tests are in :mod:`numba.tests.test_dispatcher`. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/environment.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/environment.rst new file mode 100644 index 0000000000000000000000000000000000000000..39f4bd2341fb7796c8f83ddd77048dffac1b8fad --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/environment.rst @@ -0,0 +1,59 @@ + +================== +Environment Object +================== + +The Environment object (Env) is used to maintain references to python objects +that are needed to support compiled functions for both object-mode and +nopython-mode. + +In nopython-mode, the Env is used for: + +* Storing pyobjects for reconstruction from native values, + such as: + + * for printing native values of NumPy arrays; + * for returning or yielding native values back to the interpreter. + +In object-mode, the Env is used for: + +* storing constant values referenced in the code. +* storing a reference to the function's global dictionary to load global + values. + + +The Implementation +================== + +The Env is implemented in two parts. In ``_dynfunc.c``, the Env is defined +as ``EnvironmentObject`` as a Python C-extension type. In ``lowering.py``, +the ``EnvironmentObject`` (exported as ``_dynfunc.Environment``) is extended +to support necessary operations needed at lowering. + + +Serialization +------------- + +The Env supports being pickled. Compilation cache files and ahead-of-time +compiled modules serialize all the used Envs for recreation at the runtime. + +Usage +----- + +At the start of the lowering for a function or a generator, an Env is created. +Throughout the compilation, the Env is mutated to attach additional +information. 
The compiled code references an Env via a global variable in +the emitted LLVM IR. The global variable is zero-initialized with "common" +linkage, which is the default linkage for C global values. The use of this +linkage allows multiple definitions of the global variable to be merged into +a single definition when the modules are linked together. The name of the +global variable is computed from the name of the function +(see ``FunctionDescriptor.env_name`` and ``.get_env_name()`` of the target +context). + +The Env is initialized when the compiled-function is loaded. The JIT engine +finds the address of the associated global variable for the Env and stores the +address of the Env into it. For cached functions, the same process applies. +For ahead-of-time compiled functions, the module initializer in the generated +library is responsible for initializing the global variables of all the Envs +in the module. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/event_api.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/event_api.rst new file mode 100644 index 0000000000000000000000000000000000000000..31aac33136826d188df66b532ed46974246bea7f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/event_api.rst @@ -0,0 +1,5 @@ +Event API +========= + +.. automodule:: numba.core.event + :members: \ No newline at end of file diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/generators.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/generators.rst new file mode 100644 index 0000000000000000000000000000000000000000..b2936817b744d801bd82104dcf6f34636d787839 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/generators.rst @@ -0,0 +1,307 @@ + +.. _arch-generators: + +=================== +Notes on generators +=================== + +Numba recently gained support for compiling generator functions. 
This +document explains some of the implementation choices. + + +Terminology +=========== + +For clarity, we distinguish between *generator functions* and +*generators*. A generator function is a function containing one or +several ``yield`` statements. A generator (sometimes also called "generator +iterator") is the return value of a generator function; it resumes +execution inside its frame each time :py:func:`next` is called. + +A *yield point* is the place where a ``yield`` statement is called. +A *resumption point* is the place just after a *yield point* where execution +is resumed when :py:func:`next` is called again. + + +Function analysis +================= + +Suppose we have the following simple generator function:: + + def gen(x, y): + yield x + y + yield x - y + +Here is its CPython bytecode, as printed out using :py:func:`dis.dis`:: + + 7 0 LOAD_FAST 0 (x) + 3 LOAD_FAST 1 (y) + 6 BINARY_ADD + 7 YIELD_VALUE + 8 POP_TOP + + 8 9 LOAD_FAST 0 (x) + 12 LOAD_FAST 1 (y) + 15 BINARY_SUBTRACT + 16 YIELD_VALUE + 17 POP_TOP + 18 LOAD_CONST 0 (None) + 21 RETURN_VALUE + +When compiling this function with :envvar:`NUMBA_DUMP_IR` set to 1, the +following information is printed out:: + + ----------------------------------IR DUMP: gen---------------------------------- + label 0: + x = arg(0, name=x) ['x'] + y = arg(1, name=y) ['y'] + $0.3 = x + y ['$0.3', 'x', 'y'] + $0.4 = yield $0.3 ['$0.3', '$0.4'] + del $0.4 [] + del $0.3 [] + $0.7 = x - y ['$0.7', 'x', 'y'] + del y [] + del x [] + $0.8 = yield $0.7 ['$0.7', '$0.8'] + del $0.8 [] + del $0.7 [] + $const0.9 = const(NoneType, None) ['$const0.9'] + $0.10 = cast(value=$const0.9) ['$0.10', '$const0.9'] + del $const0.9 [] + return $0.10 ['$0.10'] + ------------------------------GENERATOR INFO: gen------------------------------- + generator state variables: ['$0.3', '$0.7', 'x', 'y'] + yield point #1: live variables = ['x', 'y'], weak live variables = ['$0.3'] + yield point #2: live variables = [], weak live variables = 
['$0.7'] + + +What does it mean? The first part is the Numba IR, as already seen in +:ref:`arch_generate_numba_ir`. We can see the two yield points (``yield $0.3`` +and ``yield $0.7``). + +The second part shows generator-specific information. To understand it +we have to understand what suspending and resuming a generator means. + +When suspending a generator, we are not merely returning a value to the +caller (the operand of the ``yield`` statement). We also have to save the +generator's *current state* in order to resume execution. In trivial use +cases, perhaps the CPU's register values or stack slots would be preserved +until the next call to next(). However, any non-trivial case will hopelessly +clobber those values, so we have to save them in a well-defined place. + +What are the values we need to save? Well, in the context of the Numba +Intermediate Representation, we must save all *live variables* at each +yield point. These live variables are computed thanks to the control +flow graph. + +Once live variables are saved and the generator is suspended, resuming +the generator simply involves the inverse operation: the live variables +are restored from the saved generator state. + +.. note:: + It is the same analysis which helps insert Numba ``del`` instructions + where appropriate. + +Let's go over the generator info again:: + + generator state variables: ['$0.3', '$0.7', 'x', 'y'] + yield point #1: live variables = ['x', 'y'], weak live variables = ['$0.3'] + yield point #2: live variables = [], weak live variables = ['$0.7'] + +Numba has computed the union of all live variables (denoted as "state +variables"). This will help define the layout of the :ref:`generator +structure `. Also, for each yield point, we have +computed two sets of variables: + +* the *live variables* are the variables which are used by code following + the resumption point (i.e. 
after the ``yield`` statement) + +* the *weak live variables* are variables which are del'ed immediately + after the resumption point; they have to be saved in :term:`object mode`, + to ensure proper reference cleanup + + +.. _generator-structure: + +The generator structure +======================= + +Layout +------ + +Function analysis helps us gather enough information to define the +layout of the generator structure, which will store the entire execution +state of a generator. Here is a sketch of the generator structure's layout, +in pseudo-code:: + + struct gen_struct_t { + int32_t resume_index; + struct gen_args_t { + arg_0_t arg0; + arg_1_t arg1; + ... + arg_N_t argN; + } + struct gen_state_t { + state_0_t state_var0; + state_1_t state_var1; + ... + state_N_t state_varN; + } + } + +Let's describe those fields in order. + +* The first member, the *resume index*, is an integer telling the generator + at which resumption point execution must resume. By convention, it can + have two special values: 0 means execution must start at the beginning of + the generator (i.e. the first time :py:func:`next` is called); -1 means + the generator is exhausted and resumption must immediately raise + StopIteration. Other values indicate the yield point's index starting from 1 + (corresponding to the indices shown in the generator info above). + +* The second member, the *arguments structure* is read-only after it is first + initialized. It stores the values of the arguments the generator function + was called with. In our example, these are the values of ``x`` and ``y``. + +* The third member, the *state structure*, stores the live variables as + computed above. 
+ +Concretely, our example's generator structure (assuming the generator +function is called with floating-point numbers) is then:: + + struct gen_struct_t { + int32_t resume_index; + struct gen_args_t { + double arg0; + double arg1; + } + struct gen_state_t { + double $0.3; + double $0.7; + double x; + double y; + } + } + +Note that here, saving ``x`` and ``y`` is redundant: Numba isn't able to +recognize that the state variables ``x`` and ``y`` have the same value +as ``arg0`` and ``arg1``. + +Allocation +---------- + +How does Numba ensure the generator structure is preserved long enough? +There are two cases: + +* When a Numba-compiled generator function is called from a Numba-compiled + function, the structure is allocated on the stack by the callee. In this + case, generator instantiation is practically costless. + +* When a Numba-compiled generator function is called from regular Python + code, a CPython-compatible wrapper is instantiated that has the right + amount of allocated space to store the structure, and whose + :c:member:`~PyTypeObject.tp_iternext` slot is a wrapper around the + generator's native code. + + +Compiling to native code +======================== + +When compiling a generator function, three native functions are actually +generated by Numba: + +* An initialization function. This is the function corresponding + to the generator function itself: it receives the function arguments and + stores them inside the generator structure (which is passed by pointer). + It also initializes the *resume index* to 0, indicating that the generator + hasn't started yet. + +* A next() function. This is the function called to resume execution + inside the generator. Its single argument is a pointer to the generator + structure and it returns the next yielded value (or a special exit code + is used if the generator is exhausted, for quick checking when called + from Numba-compiled functions). + +* An optional finalizer. 
In object mode, this function ensures that all + live variables stored in the generator state are decref'ed, even if the + generator is destroyed without having been exhausted. + +The next() function +------------------- + +The next() function is the least straight-forward of the three native +functions. It starts with a trampoline which dispatches execution to the +right resume point depending on the *resume index* stored in the generator +structure. Here is how the function start may look like in our example: + +.. code-block:: llvm + + define i32 @"__main__.gen.next"( + double* nocapture %retptr, + { i8*, i32 }** nocapture readnone %excinfo, + i8* nocapture readnone %env, + { i32, { double, double }, { double, double, double, double } }* nocapture %arg.gen) + { + entry: + %gen.resume_index = getelementptr { i32, { double, double }, { double, double, double, double } }* %arg.gen, i64 0, i32 0 + %.47 = load i32* %gen.resume_index, align 4 + switch i32 %.47, label %stop_iteration [ + i32 0, label %B0 + i32 1, label %generator_resume1 + i32 2, label %generator_resume2 + ] + + ; rest of the function snipped + +(uninteresting stuff trimmed from the LLVM IR to make it more readable) + +We recognize the pointer to the generator structure in ``%arg.gen``. +The trampoline switch has three targets (one for each *resume index* 0, 1 +and 2), and a fallback target label named ``stop_iteration``. Label ``B0`` +represents the function's start, ``generator_resume1`` (resp. +``generator_resume2``) is the resumption point after the first +(resp. second) yield point. + +After generation by LLVM, the whole native assembly code for this function +may look like this (on x86-64): + +.. 
code-block:: asm + + .globl __main__.gen.next + .align 16, 0x90 + __main__.gen.next: + movl (%rcx), %eax + cmpl $2, %eax + je .LBB1_5 + cmpl $1, %eax + jne .LBB1_2 + movsd 40(%rcx), %xmm0 + subsd 48(%rcx), %xmm0 + movl $2, (%rcx) + movsd %xmm0, (%rdi) + xorl %eax, %eax + retq + .LBB1_5: + movl $-1, (%rcx) + jmp .LBB1_6 + .LBB1_2: + testl %eax, %eax + jne .LBB1_6 + movsd 8(%rcx), %xmm0 + movsd 16(%rcx), %xmm1 + movaps %xmm0, %xmm2 + addsd %xmm1, %xmm2 + movsd %xmm1, 48(%rcx) + movsd %xmm0, 40(%rcx) + movl $1, (%rcx) + movsd %xmm2, (%rdi) + xorl %eax, %eax + retq + .LBB1_6: + movl $-3, %eax + retq + +Note the function returns 0 to indicate a value is yielded, -3 to indicate +StopIteration. ``%rcx`` points to the start of the generator structure, +where the resume index is stored. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/hashing.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/hashing.rst new file mode 100644 index 0000000000000000000000000000000000000000..b955324413f4ee4161e7319bd153eadf974cf446 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/hashing.rst @@ -0,0 +1,54 @@ + +================ +Notes on Hashing +================ + +Numba supports the built-in :func:`hash` and does so by simply calling the +:func:`__hash__` member function on the supplied argument. This makes it +trivial to add hash support for new types as all that is required is the +application of the extension API :func:`overload_method` decorator to overload +a function for computing the hash value for the new type registered to the +type's :func:`__hash__` method. For example:: + + from numba.extending import overload_method + + @overload_method(myType, '__hash__') + def myType_hash_overload(obj): + # implementation details + + +The Implementation +================== + +The implementation of the Numba hashing functions strictly follows that of +Python 3. 
The only exception to this is that for hashing Unicode and bytes (for +content longer than ``sys.hash_info.cutoff``) the only supported algorithm is +``siphash24`` (default in CPython 3). As a result Numba will match Python 3 +hash values for all supported types under the default conditions described. + +Unicode hash cache differences +------------------------------ + +Both Numba and CPython Unicode string internal representations have a ``hash`` +member for the purposes of caching the string's hash value. This member is +always checked ahead of computing a hash value with the view of simply providing +a value from cache as it is considerably cheaper to do so. The Numba Unicode +string hash caching implementation behaves in a similar way to that of +CPython's. The only notable behavioral change (and its only impact is a minor +potential change in performance) is that Numba always computes and caches the +hash for Unicode strings created in ``nopython mode`` at the time they are boxed +for reuse in Python, this is too eager in some cases in comparison to CPython +which may delay hashing a new Unicode string depending on creation method. It +should also be noted that Numba copies in the ``hash`` member of the CPython +internal representation for Unicode strings when unboxing them to its own +representation so as to not recompute the hash of a string that already has a +hash value associated with it. + +The accommodation of ``PYTHONHASHSEED`` +--------------------------------------- + +The ``PYTHONHASHSEED`` environment variable can be used to seed the CPython +hashing algorithms for e.g. the purposes of reproducibility. The Numba hashing +implementation directly reads the CPython hashing algorithms' internal state and +as a result the influence of ``PYTHONHASHSEED`` is replicated in Numba's +hashing implementations. 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/index.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..2a8cbe53ec81c9ea81e9730c2a1aeb158484ae92 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/index.rst @@ -0,0 +1,32 @@ + +.. _developer-manual: + +Developer Manual +================ + +.. toctree:: + :maxdepth: 2 + + contributing.rst + release.rst + repomap.rst + architecture.rst + dispatching.rst + generators.rst + numba-runtime.rst + rewrites.rst + live_variable_analysis.rst + listings.rst + stencil.rst + custom_pipeline.rst + inlining.rst + environment.rst + hashing.rst + caching.rst + threading_implementation.rst + literal.rst + llvm_timings.rst + debugging.rst + event_api.rst + target_extension.rst + mission.rst diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/inline_example.py b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/inline_example.py new file mode 100644 index 0000000000000000000000000000000000000000..e57ba5c58182e23bfeea3749dc93a4d0c0b6a454 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/inline_example.py @@ -0,0 +1,82 @@ +from numba import njit +import numba +from numba.core import ir + + +@njit(inline='never') +def never_inline(): + return 100 + + +@njit(inline='always') +def always_inline(): + return 200 + + +def sentinel_cost_model(expr, caller_info, callee_info): + # this cost model will return True (i.e. 
do inlining) if either: + # a) the callee IR contains an `ir.Const(37)` + # b) the caller IR contains an `ir.Const(13)` logically prior to the call + # site + + # check the callee + for blk in callee_info.blocks.values(): + for stmt in blk.body: + if isinstance(stmt, ir.Assign): + if isinstance(stmt.value, ir.Const): + if stmt.value.value == 37: + return True + + # check the caller + before_expr = True + for blk in caller_info.blocks.values(): + for stmt in blk.body: + if isinstance(stmt, ir.Assign): + if isinstance(stmt.value, ir.Expr): + if stmt.value == expr: + before_expr = False + if isinstance(stmt.value, ir.Const): + if stmt.value.value == 13: + return True & before_expr + return False + + +@njit(inline=sentinel_cost_model) +def maybe_inline1(): + # Will not inline based on the callee IR with the declared cost model + # The following is ir.Const(300). + return 300 + + +@njit(inline=sentinel_cost_model) +def maybe_inline2(): + # Will inline based on the callee IR with the declared cost model + # The following is ir.Const(37). 
+ return 37 + + +@njit +def foo(): + a = never_inline() # will never inline + b = always_inline() # will always inline + + # will not inline as the function does not contain a magic constant known to + # the cost model, and the IR up to the call site does not contain a magic + # constant either + d = maybe_inline1() + + # declare this magic constant to trigger inlining of maybe_inline1 in a + # subsequent call + magic_const = 13 + + # will inline due to above constant declaration + e = maybe_inline1() + + # will inline as the maybe_inline2 function contains a magic constant known + # to the cost model + c = maybe_inline2() + + return a + b + c + d + e + magic_const + + +foo() diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/inline_overload_example.py b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/inline_overload_example.py new file mode 100644 index 0000000000000000000000000000000000000000..f28f44a557d27c9b9c1199a693e94ac48c305753 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/inline_overload_example.py @@ -0,0 +1,61 @@ +import numba +from numba.extending import overload +from numba import njit, types + + +def bar(x): + """A function stub to overload""" + pass + + +@overload(bar, inline='always') +def ol_bar_tuple(x): + # An overload that will always inline, there is a type guard so that this + # only applies to UniTuples. 
+ if isinstance(x, types.UniTuple): + def impl(x): + return x[0] + return impl + + +def cost_model(expr, caller, callee): + # Only inline if the type of the argument is an Integer + return isinstance(caller.typemap[expr.args[0].name], types.Integer) + + +@overload(bar, inline=cost_model) +def ol_bar_scalar(x): + # An overload that will inline based on a cost model, it only applies to + # scalar values in the numerical domain as per the type guard on Number + if isinstance(x, types.Number): + def impl(x): + return x + 1 + return impl + + +@njit +def foo(): + + # This will resolve via `ol_bar_tuple` as the argument is a types.UniTuple + # instance. It will always be inlined as specified in the decorator for this + # overload. + a = bar((1, 2, 3)) + + # This will resolve via `ol_bar_scalar` as the argument is a types.Number + # instance, hence the cost_model will be used to determine whether to + # inline. + # The function will be inlined as the value 100 is an IntegerLiteral which + # is an instance of a types.Integer as required by the cost_model function. + b = bar(100) + + # This will also resolve via `ol_bar_scalar` as the argument is a + # types.Number instance, again the cost_model will be used to determine + # whether to inline. + # The function will not be inlined as the complex value is not an instance + # of a types.Integer as required by the cost_model function. 
+ c = bar(300j) + + return a + b + c + + +foo() diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/inlining.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/inlining.rst new file mode 100644 index 0000000000000000000000000000000000000000..d48240234c90d5cef24a32a4a79df00f534529bc --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/inlining.rst @@ -0,0 +1,281 @@ + +================= +Notes on Inlining +================= + +There are occasions where it is useful to be able to inline a function at its +call site, at the Numba IR level of representation. The decorators such as +:func:`numba.jit`, :func:`numba.extending.overload` and +:func:`register_jitable` support the keyword argument ``inline``, to facilitate +this behaviour. + +When attempting to inline at this level, it is important to understand what +purpose this serves and what effect this will have. In contrast to the inlining +performed by LLVM, which is aimed at improving performance, the main reason to +inline at the Numba IR level is to allow type inference to cross function +boundaries. + +As an example, consider the following snippet: + +.. code:: python + + from numba import njit + + + @njit + def bar(a): + a.append(10) + + + @njit + def foo(): + z = [] + bar(z) + + + foo() + +This will fail to compile and run, because the type of ``z`` can not be inferred +as it will only be refined within ``bar``. If we now add ``inline='always'`` to the +decorator for ``bar`` the snippet will compile and run. This is because inlining +the call to ``a.append(10)`` will mean that ``z`` will be refined to hold integers +and so type inference will succeed. + +So, to recap, inlining at the Numba IR level is unlikely to have a performance +benefit. Whereas inlining at the LLVM level stands a better chance. 
+ +The ``inline`` keyword argument can be one of three values: + +* The string ``'never'``, this is the default and results in the function not + being inlined under any circumstances. +* The string ``'always'``, this results in the function being inlined at all + call sites. +* A python function that takes three arguments. The first argument is always the + ``ir.Expr`` node that is the ``call`` requesting the inline, this is present + to allow the function to make call contextually aware decisions. The second + and third arguments are: + + * In the case of an untyped inline, i.e. that which occurs when using the + :func:`numba.jit` family of decorators, both arguments are + ``numba.ir.FunctionIR`` instances. The second argument corresponding to the + IR of the caller, the third argument corresponding to the IR of the callee. + + * In the case of a typed inline, i.e. that which occurs when using + :func:`numba.extending.overload`, both arguments are instances of a + ``namedtuple`` with fields (corresponding to their standard use in the + compiler internals): + + * ``func_ir`` - the function's Numba IR. + * ``typemap`` - the function's type map. + * ``calltypes`` - the call types of any calls in the function. + * ``signature`` - the function's signature. + + The second argument holds the information from the caller, the third holds + the information from the callee. + + In all cases the function should return True to inline and return False to not + inline, this essentially permitting custom inlining rules (typical use might + be cost models). +* Recursive functions with ``inline='always'`` will result in a non-terminating + compilation. If you wish to avoid this, supply a function to limit the + recursion depth (see below). + +.. note:: No guarantee is made about the order in which functions are assessed + for inlining or about the order in which they are inlined. 
+ + +Example using :func:`numba.jit` +=============================== + +An example of using all three options to ``inline`` in the :func:`numba.njit` +decorator: + +.. literalinclude:: inline_example.py + +which produces the following when executed (with a print of the IR after the +legalization pass, enabled via the environment variable +``NUMBA_DEBUG_PRINT_AFTER="ir_legalization"``): + +.. code-block:: none + :emphasize-lines: 2, 3, 9, 16, 17, 21, 22, 26, 35 + + label 0: + $0.1 = global(never_inline: CPUDispatcher()) ['$0.1'] + $0.2 = call $0.1(func=$0.1, args=[], kws=(), vararg=None) ['$0.1', '$0.2'] + del $0.1 [] + a = $0.2 ['$0.2', 'a'] + del $0.2 [] + $0.3 = global(always_inline: CPUDispatcher()) ['$0.3'] + del $0.3 [] + $const0.1.0 = const(int, 200) ['$const0.1.0'] + $0.2.1 = $const0.1.0 ['$0.2.1', '$const0.1.0'] + del $const0.1.0 [] + $0.4 = $0.2.1 ['$0.2.1', '$0.4'] + del $0.2.1 [] + b = $0.4 ['$0.4', 'b'] + del $0.4 [] + $0.5 = global(maybe_inline1: CPUDispatcher()) ['$0.5'] + $0.6 = call $0.5(func=$0.5, args=[], kws=(), vararg=None) ['$0.5', '$0.6'] + del $0.5 [] + d = $0.6 ['$0.6', 'd'] + del $0.6 [] + $const0.7 = const(int, 13) ['$const0.7'] + magic_const = $const0.7 ['$const0.7', 'magic_const'] + del $const0.7 [] + $0.8 = global(maybe_inline1: CPUDispatcher()) ['$0.8'] + del $0.8 [] + $const0.1.2 = const(int, 300) ['$const0.1.2'] + $0.2.3 = $const0.1.2 ['$0.2.3', '$const0.1.2'] + del $const0.1.2 [] + $0.9 = $0.2.3 ['$0.2.3', '$0.9'] + del $0.2.3 [] + e = $0.9 ['$0.9', 'e'] + del $0.9 [] + $0.10 = global(maybe_inline2: CPUDispatcher()) ['$0.10'] + del $0.10 [] + $const0.1.4 = const(int, 37) ['$const0.1.4'] + $0.2.5 = $const0.1.4 ['$0.2.5', '$const0.1.4'] + del $const0.1.4 [] + $0.11 = $0.2.5 ['$0.11', '$0.2.5'] + del $0.2.5 [] + c = $0.11 ['$0.11', 'c'] + del $0.11 [] + $0.14 = a + b ['$0.14', 'a', 'b'] + del b [] + del a [] + $0.16 = $0.14 + c ['$0.14', '$0.16', 'c'] + del c [] + del $0.14 [] + $0.18 = $0.16 + d ['$0.16', '$0.18', 'd'] + del d [] + 
del $0.16 [] + $0.20 = $0.18 + e ['$0.18', '$0.20', 'e'] + del e [] + del $0.18 [] + $0.22 = $0.20 + magic_const ['$0.20', '$0.22', 'magic_const'] + del magic_const [] + del $0.20 [] + $0.23 = cast(value=$0.22) ['$0.22', '$0.23'] + del $0.22 [] + return $0.23 ['$0.23'] + + +Things to note in the above: + +1. The call to the function ``never_inline`` remains as a call. +2. The ``always_inline`` function has been inlined, note its + ``const(int, 200)`` in the caller body. +3. There is a call to ``maybe_inline1`` before the ``const(int, 13)`` + declaration, the cost model prevented this from being inlined. +4. After the ``const(int, 13)`` the subsequent call to ``maybe_inline1`` has + been inlined as shown by the ``const(int, 300)`` in the caller body. +5. The function ``maybe_inline2`` has been inlined as demonstrated by + ``const(int, 37)`` in the caller body. +6. That dead code elimination has not been performed and as a result there are + superfluous statements present in the IR. + + +Example using :func:`numba.extending.overload` +============================================== + +An example of using inlining with the :func:`numba.extending.overload` +decorator. It is most interesting to note that if a function is supplied as the +argument to ``inline`` a lot more information is available via the supplied +function arguments for use in decision making. Also that different +``@overload`` s can have different inlining behaviours, with multiple ways to +achieve this: + +.. literalinclude:: inline_overload_example.py + +which produces the following when executed (with a print of the IR after the +legalization pass, enabled via the environment variable +``NUMBA_DEBUG_PRINT_AFTER="ir_legalization"``): + +.. 
code-block:: none + :emphasize-lines: 2, 3, 4, 5, 6, 15, 16, 17, 18, 19, 20, 21, 22, 28, 29, 30 + + label 0: + $const0.2 = const(tuple, (1, 2, 3)) ['$const0.2'] + x.0 = $const0.2 ['$const0.2', 'x.0'] + del $const0.2 [] + $const0.2.2 = const(int, 0) ['$const0.2.2'] + $0.3.3 = getitem(value=x.0, index=$const0.2.2) ['$0.3.3', '$const0.2.2', 'x.0'] + del x.0 [] + del $const0.2.2 [] + $0.4.4 = $0.3.3 ['$0.3.3', '$0.4.4'] + del $0.3.3 [] + $0.3 = $0.4.4 ['$0.3', '$0.4.4'] + del $0.4.4 [] + a = $0.3 ['$0.3', 'a'] + del $0.3 [] + $const0.5 = const(int, 100) ['$const0.5'] + x.5 = $const0.5 ['$const0.5', 'x.5'] + del $const0.5 [] + $const0.2.7 = const(int, 1) ['$const0.2.7'] + $0.3.8 = x.5 + $const0.2.7 ['$0.3.8', '$const0.2.7', 'x.5'] + del x.5 [] + del $const0.2.7 [] + $0.4.9 = $0.3.8 ['$0.3.8', '$0.4.9'] + del $0.3.8 [] + $0.6 = $0.4.9 ['$0.4.9', '$0.6'] + del $0.4.9 [] + b = $0.6 ['$0.6', 'b'] + del $0.6 [] + $0.7 = global(bar: ) ['$0.7'] + $const0.8 = const(complex, 300j) ['$const0.8'] + $0.9 = call $0.7($const0.8, func=$0.7, args=[Var($const0.8, inline_overload_example.py (56))], kws=(), vararg=None) ['$0.7', '$0.9', '$const0.8'] + del $const0.8 [] + del $0.7 [] + c = $0.9 ['$0.9', 'c'] + del $0.9 [] + $0.12 = a + b ['$0.12', 'a', 'b'] + del b [] + del a [] + $0.14 = $0.12 + c ['$0.12', '$0.14', 'c'] + del c [] + del $0.12 [] + $0.15 = cast(value=$0.14) ['$0.14', '$0.15'] + del $0.14 [] + return $0.15 ['$0.15'] + +Things to note in the above: + +1. The first highlighted section is the always inlined overload for the + ``UniTuple`` argument type. +2. The second highlighted section is the overload for the ``Number`` argument + type that has been inlined as the cost model function decided to do so as the + argument was an ``Integer`` type instance. +3. The third highlighted section is the overload for the ``Number`` argument + type that has not inlined as the cost model function decided to reject it as + the argument was an ``Complex`` type instance. +4. 
That dead code elimination has not been performed and as a result there are + superfluous statements present in the IR. + +Using a function to limit the inlining depth of a recursive function +==================================================================== + +When using recursive inlines, you can terminate the compilation by using +a cost model. + +.. code:: python + + from numba import njit + import numpy as np + + class CostModel(object): + def __init__(self, max_inlines): + self._count = 0 + self._max_inlines = max_inlines + + def __call__(self, expr, caller, callee): + ret = self._count < self._max_inlines + self._count += 1 + return ret + + @njit(inline=CostModel(3)) + def factorial(n): + if n <= 0: + return 1 + return n * factorial(n - 1) + + factorial(5) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/listings.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/listings.rst new file mode 100644 index 0000000000000000000000000000000000000000..1c4b1873d6a10dcdf0eba41b4e906b170199c9a8 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/listings.rst @@ -0,0 +1,29 @@ +Listings +======== + +This shows listings from compiler internal registries (e.g. lowering +definitions). The information is provided as developer reference. +When possible, links to source code are provided via github links. + +New style listings +------------------ + +The following listings are generated from ``numba.help.inspector.write_listings()``. Users can run ``python -m numba.help.inspector --format=rst <package>`` to recreate the documentation. + +.. toctree:: + :maxdepth: 2 + + autogen_builtins_listing.rst + autogen_math_listing.rst + autogen_cmath_listing.rst + autogen_numpy_listing.rst + + +Old style listings +------------------ + +.. 
toctree:: + :maxdepth: 2 + + autogen_lower_listing.rst + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/literal.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/literal.rst new file mode 100644 index 0000000000000000000000000000000000000000..dd6d8d1872376811bc7664113dd35e7499fea62d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/literal.rst @@ -0,0 +1,82 @@ +.. _developer-literally: + +====================== +Notes on Literal Types +====================== + +.. note:: This document describes an advanced feature designed to overcome + some limitations of the compilation mechanism relating to types. + +Some features need to specialize based on the literal value during +compilation to produce type stable code necessary for successful compilation in +Numba. This can be achieved by propagating the literal value through the type +system. Numba recognizes inline literal values as :class:`numba.types.Literal`. +For example:: + + def foo(x): + a = 123 + return bar(x, a) + +Numba will infer the type of ``a`` as ``Literal[int](123)``. The definition of +``bar()`` can subsequently specialize its implementation knowing that the +second argument is an ``int`` with the value ``123``. + +``Literal`` Type +---------------- + +Classes and methods related to the ``Literal`` type. + +.. autoclass:: numba.types.Literal + +.. autofunction:: numba.types.literal + +.. autofunction:: numba.types.unliteral + +.. autofunction:: numba.types.maybe_literal + +Specifying for Literal Typing +----------------------------- + +To specify a value as a ``Literal`` type in code scheduled for JIT compilation, +use the following function: + +.. autofunction:: numba.literally + +Code Example +~~~~~~~~~~~~ + +.. 
literalinclude:: ../../../numba/tests/doc_examples/test_literally_usage.py + :language: python + :caption: from ``test_literally_usage`` of ``numba/tests/doc_examples/test_literally_usage.py`` + :start-after: magictoken.ex_literally_usage.begin + :end-before: magictoken.ex_literally_usage.end + :dedent: 4 + :linenos: + + +Internal Details +~~~~~~~~~~~~~~~~ + +Internally, the compiler raises a ``ForceLiteralArg`` exception to signal +the dispatcher to wrap specified arguments using the ``Literal`` type. + +.. autoclass:: numba.errors.ForceLiteralArg + :members: __init__, combine, __or__ + + +Inside Extensions +----------------- + +``@overload`` extensions can use ``literally`` inside the implementation body +like in normal jit-code. + +Explicit handling of literal requirements is possible through use of the +following: + +.. autoclass:: numba.extending.SentryLiteralArgs + :members: + +.. autoclass:: numba.extending.BoundLiteralArgs + :members: + +.. autofunction:: numba.extending.sentry_literal_args diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/live_variable_analysis.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/live_variable_analysis.rst new file mode 100644 index 0000000000000000000000000000000000000000..fdeedfa6366b98e9080fbcbbb9279ac444c73456 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/live_variable_analysis.rst @@ -0,0 +1,86 @@ +.. _live variable analysis: + +====================== +Live Variable Analysis +====================== + +(Related issue https://github.com/numba/numba/pull/1611) + +Numba uses reference-counting for garbage collection, a technique that +requires cooperation by the compiler. The Numba IR encodes the locations +where a decref must be inserted. These locations are determined by live +variable analysis. The corresponding source code is the ``_insert_var_dels()`` +method in https://github.com/numba/numba/blob/main/numba/interpreter.py. 
+ + +In Python semantics, once a variable is defined inside a function, it is alive +until the variable is explicitly deleted or the function scope is ended. +However, Numba analyzes the code to determine the minimum bound of the lifetime +of each variable by its definition and usages during compilation. +As soon as a variable is unreachable, a ``del`` instruction is inserted at the +closest basic-block (either at the start of the next block(s) or at the +end of the current block). This means variables can be released earlier than in +regular Python code. + +The behavior of the live variable analysis affects memory usage of the compiled +code. Internally, Numba does not differentiate temporary variables and user +variables. Since each operation generates at least one temporary variable, +a function can accumulate a high number of temporary variables if they are not +released as soon as possible. +Our generator implementation can benefit from early releasing of variables, +which reduces the size of the state to suspend at each yield point. + + +Notes on behavior of the live variable analysis +================================================ + + +Variable deleted before definition +----------------------------------- + +(Related issue: https://github.com/numba/numba/pull/1738) + +When a variable lifetime is confined within the loop body (its definition and +usage does not escape the loop body), like: + +.. code-block:: python + + def f(arr): + # BB 0 + res = 0 + # BB 1 + for i in (0, 1): + # BB 2 + t = arr[i] + if t[i] > 1: + # BB 3 + res += t[i] + # BB 4 + return res + + +Variable ``t`` is never referenced outside of the loop. +A ``del`` instruction is emitted for ``t`` at the head of the loop (BB 1) +before the variable is defined. The reason is obvious once we know the control +flow graph:: + + +------------------------------> BB4 + | + | + BB 0 --> BB 1 --> BB 2 ---> BB 3 + ^ | | + | V V + +---------------------+ + + +Variable ``t`` is defined in BB 2. 
In BB 2, the evaluation of +``t[i] > 1`` uses ``t``, which is the last use if execution takes the false +branch and goto BB 1. In BB 3, ``t`` is only used in ``res += t[i]``, which is +the last use if execution takes the true branch. Because BB 3, an outgoing +branch of BB 2 uses ``t``, ``t`` must be deleted at the common predecessor. +The closest point is BB 1, which does not have ``t`` defined from the incoming +edge of BB 0. + +Alternatively, if ``t`` is deleted at BB 4, we will still have to delete the +variable before its definition because BB4 can be executed without executing +the loop body (BB 2 and BB 3), where the variable is defined. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/llvm_timings.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/llvm_timings.rst new file mode 100644 index 0000000000000000000000000000000000000000..f25a58451ce5eb845d13be86130384a021cb89dd --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/llvm_timings.rst @@ -0,0 +1,107 @@ +.. _developer-llvm-timings: + +==================== +Notes on timing LLVM +==================== + + +Getting LLVM Pass Timings +------------------------- + +The dispatcher stores LLVM pass timings in the dispatcher object metadata under +the ``llvm_pass_timings`` key when :envvar:`NUMBA_LLVM_PASS_TIMINGS` is +enabled or ``numba.config.LLVM_PASS_TIMINGS`` is set to truthy. +The timings information contains details on how much time +has been spent in each pass. The pass timings are also grouped by their purpose. +For example, there will be pass timings for function-level pre-optimizations, +module-level optimizations, and object code generation. + + +Code Example +~~~~~~~~~~~~ + +.. 
literalinclude:: ../../../numba/tests/doc_examples/test_llvm_pass_timings.py + :language: python + :caption: from ``test_pass_timings`` of ``numba/tests/doc_examples/test_llvm_pass_timings.py`` + :start-after: magictoken.ex_llvm_pass_timings.begin + :end-before: magictoken.ex_llvm_pass_timings.end + :dedent: 16 + :linenos: + +Example output: + +.. code-block:: text + + Printing pass timings for JITCodeLibrary('DocsLLVMPassTimings.test_pass_timings..foo') + Total time: 0.0376 + == #0 Function passes on '_ZN5numba5tests12doc_examples22test_llvm_pass_timings19DocsLLVMPassTimings17test_pass_timings12$3clocals$3e7foo$241Ex' + Percent: 4.8% + Total 0.0018s + Top timings: + 0.0015s ( 81.6%) SROA #3 + 0.0002s ( 9.3%) Early CSE #2 + 0.0001s ( 4.0%) Simplify the CFG #9 + 0.0000s ( 1.5%) Prune NRT refops #4 + 0.0000s ( 1.1%) Post-Dominator Tree Construction #5 + == #1 Function passes on '_ZN7cpython5numba5tests12doc_examples22test_llvm_pass_timings19DocsLLVMPassTimings17test_pass_timings12$3clocals$3e7foo$241Ex' + Percent: 0.8% + Total 0.0003s + Top timings: + 0.0001s ( 30.4%) Simplify the CFG #10 + 0.0001s ( 24.1%) Early CSE #3 + 0.0001s ( 17.8%) SROA #4 + 0.0000s ( 8.8%) Prune NRT refops #5 + 0.0000s ( 5.6%) Post-Dominator Tree Construction #6 + == #2 Function passes on 'cfunc._ZN5numba5tests12doc_examples22test_llvm_pass_timings19DocsLLVMPassTimings17test_pass_timings12$3clocals$3e7foo$241Ex' + Percent: 0.5% + Total 0.0002s + Top timings: + 0.0001s ( 27.7%) Early CSE #4 + 0.0001s ( 26.8%) Simplify the CFG #11 + 0.0000s ( 13.8%) Prune NRT refops #6 + 0.0000s ( 7.4%) Post-Dominator Tree Construction #7 + 0.0000s ( 6.7%) Dominator Tree Construction #29 + == #3 Module passes (cheap optimization for refprune) + Percent: 3.7% + Total 0.0014s + Top timings: + 0.0007s ( 52.0%) Combine redundant instructions + 0.0001s ( 5.4%) Function Integration/Inlining + 0.0001s ( 4.9%) Prune NRT refops #2 + 0.0001s ( 4.8%) Natural Loop Information + 0.0001s ( 4.6%) Post-Dominator Tree 
Construction #2 + == #4 Module passes (full optimization) + Percent: 43.9% + Total 0.0165s + Top timings: + 0.0032s ( 19.5%) Combine redundant instructions #9 + 0.0022s ( 13.5%) Combine redundant instructions #7 + 0.0010s ( 6.1%) Induction Variable Simplification + 0.0008s ( 4.8%) Unroll loops #2 + 0.0007s ( 4.5%) Loop Vectorization + == #5 Finalize object + Percent: 46.3% + Total 0.0174s + Top timings: + 0.0060s ( 34.6%) X86 DAG->DAG Instruction Selection #2 + 0.0019s ( 11.0%) Greedy Register Allocator #2 + 0.0013s ( 7.4%) Machine Instruction Scheduler #2 + 0.0012s ( 7.1%) Loop Strength Reduction + 0.0004s ( 2.3%) Induction Variable Users + + +API for custom analysis +~~~~~~~~~~~~~~~~~~~~~~~ + +It is possible to get more details than the summary text in the above example. +The pass timings are stored in a +:class:`numba.misc.llvm_pass_timings.PassTimingsCollection`, which contains +methods for accessing the individual records for each pass. + +.. autoclass:: numba.misc.llvm_pass_timings.PassTimingsCollection + :members: get_total_time, list_longest_first, summary, __getitem__, __len__ + +.. autoclass:: numba.misc.llvm_pass_timings.ProcessedPassTimings + :members: get_raw_data, get_total_time, list_records, list_top, summary + +.. autoclass:: numba.misc.llvm_pass_timings.PassTimingRecord diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/mission.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/mission.rst new file mode 100644 index 0000000000000000000000000000000000000000..07fb3c2d3fdb03ee1c2e4217f2e104b9864ecdb8 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/mission.rst @@ -0,0 +1,82 @@ +Numba Mission Statement +======================= + +Introduction +------------ + +This document is the mission statement for the Numba project. It exists to +provide a clear description of the purposes and goals of the project. 
As such, +this document provides background on Numba's users and use-cases, and outlines +the project's overall goals. + +This is a living document: + +=========================== ============= +The first revision date is: May 2022 +The last updated date is: May 2022 +The next review date is: November 2022 +=========================== ============= + +Background +---------- + +The Numba project provides tools to improve the performance of Python software. +It comprises numerous facilities including just-in-time (JIT) compilation, +extension points for library authors, and a compiler toolkit on which new +computational acceleration technologies can be explored and built. + +The range of use-cases and applications that can be targeted by Numba includes, +but is not limited to: + +* Scientific Computing +* Computationally intensive tasks +* Numerically oriented applications +* Data science utilities and programs + +The user base of Numba includes anyone needing to perform intensive +computational work, including users from a wide range of disciplines, examples +include: + +* The most common use case, a user wanting to JIT compile some numerical + functions. +* Users providing JIT accelerated libraries for domain specific use cases e.g. + scientific researchers. +* Users providing JIT accelerated libraries for use as part of the numerical + Python ecosystem. +* Those writing more advanced JIT accelerated libraries containing their own + domain specific data types etc. +* Compiler engineers who explore new compiler use-cases and/or need a custom + compiler. +* Hardware vendors looking to extend Numba to provide Python support for their + custom silicon or new hardware. + +Project Goals +------------- + +The primary aims of the Numba project are: + +* To make it easier for Python users to write high performance code. +* To have a core package with a well defined and pragmatically selected feature + scope that meets the needs of the user base without being overly complex. 
+* To provide a compiler toolkit for Python that is extensible and can be + customized to meet the needs of the user base. This comes with the expectation + that users potentially need to invest time and effort to extend and/or + customize the software themselves. +* To support both the Python core language/standard libraries and NumPy. +* To consistently produce high quality software: + + * Feature stability across versions. + * Well established and tested public APIs. + * Clearly documented deprecation cycles. + * Internally stable code base. + * Externally tested release candidates. + * Regular releases with a predictable and published release cycle. + * Maintain suitable infrastructure for both testing and releasing. With as + much in public as feasible. + +* To make it as easy as possible for people to contribute. +* To have a maintained public roadmap which will also include areas under active + development. +* To have a governance document in place and it working in practice. +* To ensure that Numba receives timely updates for its core dependencies: LLVM, + NumPy and Python. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/numba-runtime.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/numba-runtime.rst new file mode 100644 index 0000000000000000000000000000000000000000..2eb290062e5f9098b377bae02bcf5b2aa0b3ade4 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/numba-runtime.rst @@ -0,0 +1,200 @@ +.. _arch-numba-runtime: + +====================== +Notes on Numba Runtime +====================== + + +The *Numba Runtime (NRT)* provides the language runtime to the *nopython mode* +Python subset. NRT is a standalone C library with a Python binding. This +allows :term:`NPM` runtime feature to be used without the GIL. Currently, the +only language feature implemented in NRT is memory management. 
+ + +Memory Management +================= + +NRT implements memory management for :term:`NPM` code. It uses *atomic +reference count* for threadsafe, deterministic memory management. NRT maintains +a separate ``MemInfo`` structure for storing information about each allocation. + +Cooperating with CPython +------------------------ + +For NRT to cooperate with CPython, the NRT python binding provides adaptors for +converting python objects that export a memory region. When such an +object is used as an argument to a :term:`NPM` function, a new ``MemInfo`` is +created and it acquires a reference to the Python object. When a :term:`NPM` +value is returned to the Python interpreter, the associated ``MemInfo`` +(if any) is checked. If the ``MemInfo`` references a Python object, the +underlying Python object is released and returned instead. Otherwise, the +``MemInfo`` is wrapped in a Python object and returned. Additional processing +may be required depending on the type. + +The current implementation supports NumPy arrays and any buffer-exporting types. + + +Compiler-side Cooperation +------------------------- + +NRT reference counting requires the compiler to emit incref/decref operations +according to the usage. When the reference count drops to zero, the compiler +must call the destructor routine in NRT. + + +.. _nrt-refct-opt-pass: + +Optimizations +------------- + +The compiler is allowed to emit incref/decref operations naively. It relies +on an optimization pass to remove redundant reference count operations. + +A new optimization pass is implemented in version 0.52.0 to remove reference +count operations that fall into the following four categories of control-flow +structure---per basic-block, diamond, fanout, fanout+raise. See the documentation +for :envvar:`NUMBA_LLVM_REFPRUNE_FLAGS` for their descriptions. + +The old optimization pass runs at block level to avoid control flow analysis. 
+It depends on LLVM function optimization pass to simplify the control flow, +stack-to-register, and simplify instructions. It works by matching and +removing incref and decref pairs within each block. The old pass can be +enabled by setting :envvar:`NUMBA_LLVM_REFPRUNE_PASS` to `0`. + +Important assumptions +--------------------- + +Both the old (pre-0.52.0) and the new (post-0.52.0) optimization passes assume +that the only function that can consume a reference is ``NRT_decref``. +It is important that there are no other functions that will consume references. +Since the passes operate on LLVM IR, the "functions" here are referring to any +callee in a LLVM call instruction. + +To summarize, all functions exposed to the refcount optimization pass +**must not** consume counted references unless done so via ``NRT_decref``. + + +Quirks of the old optimization pass +----------------------------------- + +Since the pre-0.52.0 `refcount optimization pass `_ +requires the LLVM function optimization pass, the pass works on the LLVM IR as +text. The optimized IR is then materialized again as a new LLVM in-memory +bitcode object. + + +Debugging Leaks +--------------- + +To debug reference leaks in NRT MemInfo, each MemInfo python object has a +``.refcount`` attribute for inspection. To get the MemInfo from a ndarray +allocated by NRT, use the ``.base`` attribute. + +To debug memory leaks in NRT, the ``numba.core.runtime.rtsys`` defines +``.get_allocation_stats()``. It returns a namedtuple containing the +number of allocation and deallocation since the start of the program. +Checking that the allocation and deallocation counters are matching is the +simplest way to know if the NRT is leaking. + + +Debugging Leaks in C +-------------------- + +The start of `numba/core/runtime/nrt.h +`_ +has these lines: + +.. 
code-block:: C + + /* Debugging facilities - enabled at compile-time */ + /* #undef NDEBUG */ + #if 0 + # define NRT_Debug(X) X + #else + # define NRT_Debug(X) if (0) { X; } + #endif + +Undefining NDEBUG (uncomment the ``#undef NDEBUG`` line) enables the assertion +check in NRT. + +Enabling the NRT_Debug (replace ``#if 0`` with ``#if 1``) turns on +debug print inside NRT. + + +Recursion Support +================= + +During the compilation of a pair of mutually recursive functions, one of the +functions will contain unresolved symbol references since the compiler handles +one function at a time. The memory for the unresolved symbols is allocated and +initialized to the address of the *unresolved symbol abort* function +(``nrt_unresolved_abort``) just before the machine code is +generated by LLVM. These symbols are tracked and resolved as new functions are +compiled. If a bug prevents the resolution of these symbols, +the abort function will be called, raising a ``RuntimeError`` exception. + +The *unresolved symbol abort* function is defined in the NRT with a zero-argument +signature. The caller is safe to call it with an arbitrary number of +arguments. Therefore, it is safe to be used in place of the intended callee. + +Using the NRT from C code +========================= + +Externally compiled C code should use the ``NRT_api_functions`` struct as a +function table to access the NRT API. The struct is defined in +:ghfile:`numba/core/runtime/nrt_external.h`. Users can use the utility function +``numba.extending.include_path()`` to determine the include directory for +Numba provided C headers. + +.. literalinclude:: ../../../numba/core/runtime/nrt_external.h + :language: C + :caption: `numba/core/runtime/nrt_external.h` + +Inside Numba compiled code, the ``numba.core.unsafe.nrt.NRT_get_api()`` +intrinsic can be used to obtain a pointer to the ``NRT_api_functions``. + +Here is an example that uses the ``nrt_external.h``: + +.. 
code-block:: C + + #include <stdlib.h> + #include "numba/core/runtime/nrt_external.h" + + void my_dtor(void *ptr) { + free(ptr); + } + + NRT_MemInfo* my_allocate(NRT_api_functions *nrt) { + /* heap allocate some memory */ + void * data = malloc(10); + /* wrap the allocated memory; yield a new reference */ + NRT_MemInfo *mi = nrt->manage_memory(data, my_dtor); + /* acquire reference */ + nrt->acquire(mi); + /* release reference */ + nrt->release(mi); + return mi; + } + +It is important to ensure that the NRT is initialized prior to making calls to +it; calling ``numba.core.runtime.nrt.rtsys.initialize(context)`` from Python +will have the desired effect. Similarly the code snippet: + +.. code-block:: Python + + from numba.core.registry import cpu_target # Get the CPU target singleton + cpu_target.target_context # Access the target_context property to initialize + +will achieve the same specifically for Numba's CPU target (the default). Failure +to initialize the NRT will result in access violations as function pointers for +various internal atomic operations will be missing in the ``NRT_MemSys`` struct. + +Future Plan +=========== + +The plan for NRT is to make a standalone shared library that can be linked to +Numba compiled code, including use within the Python interpreter and without +the Python interpreter. To make that work, we will be doing some refactoring: + +* numba :term:`NPM` code references statically compiled code in "helperlib.c". + Those functions should be moved to NRT. 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/release.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/release.rst new file mode 100644 index 0000000000000000000000000000000000000000..b547ee2bbc5f42e83aaf8d2ea2eb7faf147ada0b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/release.rst @@ -0,0 +1,49 @@ +Numba Release Process +===================== + +The goal of the Numba release process -- from a high level perspective -- is to +publish source and binary artifacts that correspond to a given version +number. This usually involves a sequence of individual tasks that must be +performed in the correct order and with diligence. Numba and llvmlite are +commonly released in lockstep since there is usually a one-to-one mapping +between a Numba version and a corresponding llvmlite version. + +This section contains various notes and templates that can be used to create a +Numba release checklist on the Numba GitHub issue tracker. This is an aid for +the maintainers during the release process and helps to ensure that all tasks +are completed in the correct order and that no tasks are accidentally omitted. + +If new or additional items do appear during release, please do remember to add +them to the checklist templates. Also note that the release process itself is +always a work in progress. This means that some of the information here may be +outdated. If you notice this please do remember to submit a pull-request to +update this document. + +All release checklists are available as GitHub issue templates. To create a new +release checklist simply open a new issue and select the correct template. + + +Primary Release Candidate Checklist +----------------------------------- + +This is for the first/primary release candidate for a minor release i.e. the +first release of every series. It is special, because during this release, the +release branch will have to be created. 
Release candidate indexing begins at 1. + +.. literalinclude:: ../../../.github/ISSUE_TEMPLATE/first_rc_checklist.md + :language: md + :lines: 9- + +`Open a primary release checklist <https://github.com/numba/numba/issues/new?template=first_rc_checklist.md>`_. + +Subsequent Release Candidates, Final Releases and Patch Releases +---------------------------------------------------------------- + +Releases subsequent to the first release in a series usually involve a series +of cherry-picks; the recipe is therefore slightly different. + +.. literalinclude:: ../../../.github/ISSUE_TEMPLATE/sub_rc_checklist.md + :language: md + :lines: 9- + +`Open a subsequent release checklist <https://github.com/numba/numba/issues/new?template=sub_rc_checklist.md>`_. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/repomap.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/repomap.rst new file mode 100644 index 0000000000000000000000000000000000000000..12bfd1b166f727f26796780996babcbc9f998422 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/repomap.rst @@ -0,0 +1,582 @@ +A Map of the Numba Repository +============================= + +The Numba repository is quite large, and due to age has functionality spread +around many locations. To help orient developers, this document will try to +summarize where different categories of functionality can be found. + + +Support Files +------------- + +Build and Packaging +''''''''''''''''''' + +- :ghfile:`setup.py` - Standard Python distutils/setuptools script +- :ghfile:`MANIFEST.in` - Distutils packaging instructions +- :ghfile:`requirements.txt` - Pip package requirements, not used by conda +- :ghfile:`versioneer.py` - Handles automatic setting of version in + installed package from git tags +- :ghfile:`.flake8` - Preferences for code formatting. Files should be + fixed and removed from the exception list as time allows. +- :ghfile:`.pre-commit-config.yaml` - Configuration file for pre-commit hooks. +- :ghfile:`.readthedocs.yml` - Configuration file for Read the Docs. 
+- :ghfile:`buildscripts/condarecipe.local` - Conda build recipe +- :ghfile:`buildscripts/condarecipe_clone_icc_rt` - Recipe to build a + standalone icc_rt package. + + +Continuous Integration +'''''''''''''''''''''' +- :ghfile:`azure-pipelines.yml` - Azure Pipelines CI config (active: + Win/Mac/Linux) +- :ghfile:`buildscripts/azure/` - Azure Pipeline configuration for specific + platforms +- :ghfile:`buildscripts/appveyor/` - Appveyor build scripts +- :ghfile:`buildscripts/incremental/` - Generic scripts for building Numba + on various CI systems +- :ghfile:`codecov.yml` - Codecov.io coverage reporting + + +Documentation +''''''''''''' +- :ghfile:`LICENSE` - License for Numba +- :ghfile:`LICENSES.third-party` - License for third party code vendored + into Numba +- :ghfile:`README.rst` - README for repo, also uploaded to PyPI +- :ghfile:`CONTRIBUTING.md` - Documentation on how to contribute to project + (out of date, should be updated to point to Sphinx docs) +- :ghfile:`CHANGE_LOG` - History of Numba releases, also directly embedded + into Sphinx documentation +- :ghfile:`docs/` - Documentation source +- :ghfile:`docs/_templates/` - Directory for templates (to override defaults + with Sphinx theme) +- :ghfile:`docs/Makefile` - Used to build Sphinx docs with ``make`` +- :ghfile:`docs/source` - ReST source for Numba documentation +- :ghfile:`docs/_static/` - Static CSS and image assets for Numba docs +- :ghfile:`docs/gh-pages.py` - Utility script to update Numba docs (stored + as gh-pages) +- :ghfile:`docs/make.bat` - Not used (remove?) +- :ghfile:`docs/requirements.txt` - Pip package requirements for building docs + with Read the Docs. +- :ghfile:`numba/scripts/generate_lower_listing.py` - Dump all registered + implementations decorated with ``@lower*`` for reference + documentation. Currently misses implementations from the higher + level extension API. + + +Numba Source Code +----------------- + +Numba ships with both the source code and tests in one package. 
+ +- :ghfile:`numba/` - all of the source code and tests + + +Public API +'''''''''' + +These define aspects of the public Numba interface. + +- :ghfile:`numba/core/decorators.py` - User-facing decorators for compiling + regular functions on the CPU +- :ghfile:`numba/core/extending.py` - Public decorators for extending Numba + (``overload``, ``intrinsic``, etc) + - :ghfile:`numba/experimental/structref.py` - Public API for defining a mutable struct +- :ghfile:`numba/core/ccallback.py` - ``@cfunc`` decorator for compiling + functions to a fixed C signature. Used to make callbacks. +- :ghfile:`numba/np/ufunc/decorators.py` - ufunc/gufunc compilation + decorators +- :ghfile:`numba/core/config.py` - Numba global config options and environment + variable handling +- :ghfile:`numba/core/annotations` - Gathering and printing type annotations of + Numba IR +- :ghfile:`numba/core/annotations/pretty_annotate.py` - Code highlighting of + Numba functions and types (both ANSI terminal and HTML) +- :ghfile:`numba/core/event.py` - A simple event system for applications to + listen to specific compiler events. + + +Dispatching +''''''''''' + +- :ghfile:`numba/core/dispatcher.py` - Dispatcher objects are compiled functions + produced by ``@jit``. A dispatcher has different implementations + for different type signatures. +- :ghfile:`numba/_dispatcher.cpp` - C++ dispatcher implementation (for speed on + common data types) +- :ghfile:`numba/core/retarget.py` - Support for dispatcher objects to switch + target via a specific with-context. + + +Compiler Pipeline +''''''''''''''''' + +- :ghfile:`numba/core/compiler.py` - Compiler pipelines and flags +- :ghfile:`numba/core/errors.py` - Numba exception and warning classes +- :ghfile:`numba/core/ir.py` - Numba IR data structure objects +- :ghfile:`numba/core/bytecode.py` - Bytecode parsing and function identity (??) 
+- :ghfile:`numba/core/interpreter.py` - Translate Python interpreter bytecode to + Numba IR +- :ghfile:`numba/core/analysis.py` - Utility functions to analyze Numba IR + (variable lifetime, prune branches, etc) +- :ghfile:`numba/core/dataflow.py` - Dataflow analysis for Python bytecode (used + in analysis.py) +- :ghfile:`numba/core/controlflow.py` - Control flow analysis of Numba IR and + Python bytecode +- :ghfile:`numba/core/typeinfer.py` - Type inference algorithm +- :ghfile:`numba/core/transforms.py` - Numba IR transformations +- :ghfile:`numba/core/rewrites` - Rewrite passes used by compiler +- :ghfile:`numba/core/rewrites/__init__.py` - Loads all rewrite passes so they + are put into the registry +- :ghfile:`numba/core/rewrites/registry.py` - Registry object for collecting + rewrite passes +- :ghfile:`numba/core/rewrites/ir_print.py` - Write print() calls into special + print nodes in the IR +- :ghfile:`numba/core/rewrites/static_raise.py` - Converts exceptions with + static arguments into a special form that can be lowered +- :ghfile:`numba/core/rewrites/static_getitem.py` - Rewrites getitem and setitem + with constant arguments to allow type inference +- :ghfile:`numba/core/rewrites/static_binop.py` - Rewrites binary operations + (specifically ``**``) with constant arguments so faster code can be + generated +- :ghfile:`numba/core/inline_closurecall.py` - Inlines body of closure functions + to call site. Support for array comprehensions, reduction inlining, + and stencil inlining. 
+- :ghfile:`numba/core/postproc.py` - Postprocessor for Numba IR that computes + variable lifetime, inserts del operations, and handles generators +- :ghfile:`numba/core/lowering.py` - General implementation of lowering Numba IR + to LLVM +- :ghfile:`numba/core/environment.py` - Runtime environment object +- :ghfile:`numba/core/withcontexts.py` - General scaffolding for implementing + context managers in nopython mode, and the objectmode context + manager +- :ghfile:`numba/core/pylowering.py` - Lowering of Numba IR in object mode +- :ghfile:`numba/core/pythonapi.py` - LLVM IR code generation to interface with + CPython API +- :ghfile:`numba/core/targetconfig.py` - Utils for target configurations such + as compiler flags. + + +Type Management +''''''''''''''' + +- :ghfile:`numba/core/typeconv/` - Implementation of type casting and type + signature matching in both C++ and Python +- :ghfile:`numba/capsulethunk.h` - Used by typeconv +- :ghfile:`numba/core/types/` - Definition of the Numba type hierarchy, used + everywhere in compiler to select implementations +- :ghfile:`numba/core/consts.py` - Constant inference (used to make constant + values available during codegen when possible) +- :ghfile:`numba/core/datamodel` - LLVM IR representations of data types in + different contexts +- :ghfile:`numba/core/datamodel/models.py` - Models for most standard types +- :ghfile:`numba/core/datamodel/registry.py` - Decorator to register new data + models +- :ghfile:`numba/core/datamodel/packer.py` - Pack typed values into a data + structure +- :ghfile:`numba/core/datamodel/testing.py` - Data model tests (this should + move??) +- :ghfile:`numba/core/datamodel/manager.py` - Map types to data models + + +Compiled Extensions +''''''''''''''''''' + +Numba uses a small amount of compiled C/C++ code for core +functionality, like dispatching and type matching where performance +matters, and it is more convenient to encapsulate direct interaction +with CPython APIs. 
+ +- :ghfile:`numba/_arraystruct.h` - Struct for holding NumPy array + attributes. Used in helperlib and the Numba Runtime. +- :ghfile:`numba/_helperlib.c` - C functions required by Numba compiled code + at runtime. Linked into ahead-of-time compiled modules +- :ghfile:`numba/_helpermod.c` - Python extension module with pointers to + functions from ``_helperlib.c`` and ``_npymath_exports.c`` +- :ghfile:`numba/_npymath_exports.c` - Export function pointer table to + NumPy C math functions +- :ghfile:`numba/_dynfuncmod.c` - Python extension module exporting + _dynfunc.c functionality +- :ghfile:`numba/_dynfunc.c` - C level Environment and Closure objects (keep + in sync with numba/target/base.py) +- :ghfile:`numba/mathnames.h` - Macros for defining names of math functions +- :ghfile:`numba/_pymodule.h` - C macros for Python 2/3 portable naming of C + API functions +- :ghfile:`numba/mviewbuf.c` - Handles Python memoryviews +- :ghfile:`numba/_typeof.{h,c}` - C implementation of type fingerprinting, + used by dispatcher +- :ghfile:`numba/_numba_common.h` - Portable C macro for marking symbols + that can be shared between object files, but not outside the + library. + + + +Misc Support +'''''''''''' + +- :ghfile:`numba/_version.py` - Updated by versioneer +- :ghfile:`numba/core/runtime` - Language runtime. Currently manages + reference-counted memory allocated on the heap by Numba-compiled + functions +- :ghfile:`numba/core/ir_utils.py` - Utility functions for working with Numba IR + data structures +- :ghfile:`numba/core/cgutils.py` - Utility functions for generating common code + patterns in LLVM IR +- :ghfile:`numba/core/utils.py` - Python 2 backports of Python 3 functionality + (also imports local copy of ``six``) +- :ghfile:`numba/core/overload_glue.py` - Functions for wrapping split typing + and lowering API use cases into overloads. 
+- :ghfile:`numba/misc/appdirs.py` - Vendored package for determining application + config directories on every platform +- :ghfile:`numba/core/compiler_lock.py` - Global compiler lock because Numba's + usage of LLVM is not thread-safe +- :ghfile:`numba/misc/special.py` - Python stub implementations of special Numba + functions (prange, gdb*) +- :ghfile:`numba/core/itanium_mangler.py` - Python implementation of Itanium C++ + name mangling +- :ghfile:`numba/misc/findlib.py` - Helper function for locating shared + libraries on all platforms +- :ghfile:`numba/core/debuginfo.py` - Helper functions to construct LLVM IR + debug + info +- :ghfile:`numba/core/unsafe/refcount.py` - Read reference count of object +- :ghfile:`numba/core/unsafe/eh.py` - Exception handling helpers +- :ghfile:`numba/core/unsafe/nrt.py` - Numba runtime (NRT) helpers +- :ghfile:`numba/cpython/unsafe/tuple.py` - Replace a value in a tuple slot +- :ghfile:`numba/np/unsafe/ndarray.py` - NumPy array helpers +- :ghfile:`numba/core/unsafe/bytes.py` - Copying and dereferencing data from + void pointers +- :ghfile:`numba/misc/dummyarray.py` - Used by GPU backends to hold array + information on the host, but not the data. +- :ghfile:`numba/core/callwrapper.py` - Handles argument unboxing and releasing + the GIL when moving from Python to nopython mode +- :ghfile:`numba/np/numpy_support.py` - Helper functions for working with NumPy + and translating Numba types to and from NumPy dtypes. 
+- :ghfile:`numba/core/tracing.py` - Decorator for tracing Python calls and + emitting log messages +- :ghfile:`numba/core/funcdesc.py` - Classes for describing function metadata + (used in the compiler) +- :ghfile:`numba/core/sigutils.py` - Helper functions for parsing and + normalizing Numba type signatures +- :ghfile:`numba/core/serialize.py` - Support for pickling compiled functions +- :ghfile:`numba/core/caching.py` - Disk cache for compiled functions +- :ghfile:`numba/np/npdatetime.py` - Helper functions for implementing NumPy + datetime64 support +- :ghfile:`numba/misc/llvm_pass_timings.py` - Helper to record timings of + LLVM passes. +- :ghfile:`numba/cloudpickle` - Vendored cloudpickle subpackage + +Core Python Data Types +'''''''''''''''''''''' + +- :ghfile:`numba/_hashtable.{h,c}` - Adaptation of the Python 3.7 hash table + implementation +- :ghfile:`numba/cext/dictobject.{h,c}` - C level implementation of typed + dictionary +- :ghfile:`numba/typed/dictobject.py` - Nopython mode wrapper for typed + dictionary +- :ghfile:`numba/cext/listobject.{h,c}` - C level implementation of typed list +- :ghfile:`numba/typed/listobject.py` - Nopython mode wrapper for typed list +- :ghfile:`numba/typed/typedobjectutils.py` - Common utilities for typed + dictionary and list +- :ghfile:`numba/cpython/unicode.py` - Unicode strings (Python 3.5 and later) +- :ghfile:`numba/typed` - Python interfaces to statically typed containers +- :ghfile:`numba/typed/typeddict.py` - Python interface to typed dictionary +- :ghfile:`numba/typed/typedlist.py` - Python interface to typed list +- :ghfile:`numba/experimental/jitclass` - Implementation of experimental JIT + compilation of Python classes +- :ghfile:`numba/core/generators.py` - Support for lowering Python generators + + +Math +'''' + +- :ghfile:`numba/_random.c` - Reimplementation of NumPy / CPython random + number generator +- :ghfile:`numba/_lapack.c` - Wrappers for calling BLAS and LAPACK functions + (requires SciPy) + + 
+ParallelAccelerator +''''''''''''''''''' + +Code transformation passes that extract parallelizable code from +a function and convert it into multithreaded gufunc calls. + +- :ghfile:`numba/parfors/parfor.py` - General ParallelAccelerator +- :ghfile:`numba/parfors/parfor_lowering.py` - gufunc lowering for + ParallelAccelerator +- :ghfile:`numba/parfors/array_analysis.py` - Array analysis passes used in + ParallelAccelerator + + +Stencil +''''''' + +Implementation of ``@stencil``: + +- :ghfile:`numba/stencils/stencil.py` - Stencil function decorator (implemented + without ParallelAccelerator) +- :ghfile:`numba/stencils/stencilparfor.py` - ParallelAccelerator implementation + of stencil + + +Debugging Support +''''''''''''''''' + +- :ghfile:`numba/misc/gdb_hook.py` - Hooks to jump into GDB from nopython + mode +- :ghfile:`numba/misc/cmdlang.gdb` - Commands to setup GDB for setting + explicit breakpoints from Python + + +Type Signatures (CPU) +''''''''''''''''''''' + +Some (usually older) Numba supported functionality separates the +declaration of allowed type signatures from the definition of +implementations. This package contains registries of type signatures +that must be matched during type inference. 
+ +- :ghfile:`numba/core/typing` - Type signature module +- :ghfile:`numba/core/typing/templates.py` - Base classes for type signature + templates +- :ghfile:`numba/core/typing/cmathdecl.py` - Python complex math (``cmath``) + module +- :ghfile:`numba/core/typing/bufproto.py` - Interpreting objects supporting the + buffer protocol +- :ghfile:`numba/core/typing/mathdecl.py` - Python ``math`` module +- :ghfile:`numba/core/typing/listdecl.py` - Python lists +- :ghfile:`numba/core/typing/builtins.py` - Python builtin global functions and + operators +- :ghfile:`numba/core/typing/randomdecl.py` - Python and NumPy ``random`` + modules +- :ghfile:`numba/core/typing/setdecl.py` - Python sets +- :ghfile:`numba/core/typing/npydecl.py` - NumPy ndarray (and operators), NumPy + functions +- :ghfile:`numba/core/typing/arraydecl.py` - Python ``array`` module +- :ghfile:`numba/core/typing/context.py` - Implementation of typing context + (class that collects methods used in type inference) +- :ghfile:`numba/core/typing/collections.py` - Generic container operations and + namedtuples +- :ghfile:`numba/core/typing/ctypes_utils.py` - Typing ctypes-wrapped function + pointers +- :ghfile:`numba/core/typing/enumdecl.py` - Enum types +- :ghfile:`numba/core/typing/cffi_utils.py` - Typing of CFFI objects +- :ghfile:`numba/core/typing/typeof.py` - Implementation of typeof operations + (maps Python object to Numba type) +- :ghfile:`numba/core/typing/asnumbatype.py` - Implementation of + ``as_numba_type`` operations (maps Python types to Numba type) +- :ghfile:`numba/core/typing/npdatetime.py` - Datetime dtype support for NumPy + arrays + + +Target Implementations (CPU) +'''''''''''''''''''''''''''' + +Implementations of Python / NumPy functions and some data models. +These modules are responsible for generating LLVM IR during lowering. 
+Note that some of these modules do not have counterparts in the typing +package because newer Numba extension APIs (like overload) allow +typing and implementation to be specified together. + +- :ghfile:`numba/core/cpu.py` - Context for code gen on CPU +- :ghfile:`numba/core/base.py` - Base class for all target contexts +- :ghfile:`numba/core/codegen.py` - Driver for code generation +- :ghfile:`numba/core/boxing.py` - Boxing and unboxing for most data + types +- :ghfile:`numba/core/intrinsics.py` - Utilities for converting LLVM + intrinsics to other math calls +- :ghfile:`numba/core/callconv.py` - Implements different calling + conventions for Numba-compiled functions +- :ghfile:`numba/core/options.py` - Container for options that control + lowering +- :ghfile:`numba/core/optional.py` - Special type representing value or + ``None`` +- :ghfile:`numba/core/registry.py` - Registry object for collecting + implementations for a specific target +- :ghfile:`numba/core/imputils.py` - Helper functions for lowering +- :ghfile:`numba/core/externals.py` - Registers external C functions + needed to link generated code +- :ghfile:`numba/core/fastmathpass.py` - Rewrite pass to add fastmath + attributes to function call sites and binary operations +- :ghfile:`numba/core/removerefctpass.py` - Rewrite pass to remove + unnecessary incref/decref pairs +- :ghfile:`numba/core/descriptors.py` - empty base class for all target + descriptors (is this needed?) 
+- :ghfile:`numba/cpython/builtins.py` - Python builtin functions and + operators +- :ghfile:`numba/cpython/cmathimpl.py` - Python complex math module +- :ghfile:`numba/cpython/enumimpl.py` - Enum objects +- :ghfile:`numba/cpython/hashing.py` - Hashing algorithms +- :ghfile:`numba/cpython/heapq.py` - Python ``heapq`` module +- :ghfile:`numba/cpython/iterators.py` - Iterable data types and iterators +- :ghfile:`numba/cpython/listobj.py` - Python lists +- :ghfile:`numba/cpython/mathimpl.py` - Python ``math`` module +- :ghfile:`numba/cpython/numbers.py` - Numeric values (int, float, etc) +- :ghfile:`numba/cpython/printimpl.py` - Print function +- :ghfile:`numba/cpython/randomimpl.py` - Python and NumPy ``random`` + modules +- :ghfile:`numba/cpython/rangeobj.py` - Python `range` objects +- :ghfile:`numba/cpython/slicing.py` - Slice objects, and index calculations + used in slicing +- :ghfile:`numba/cpython/setobj.py` - Python set type +- :ghfile:`numba/cpython/tupleobj.py` - Tuples (statically typed as + immutable struct) +- :ghfile:`numba/misc/cffiimpl.py` - CFFI functions +- :ghfile:`numba/misc/quicksort.py` - Quicksort implementation used with + list and array objects +- :ghfile:`numba/misc/mergesort.py` - Mergesort implementation used with + array objects +- :ghfile:`numba/np/arraymath.py` - Math operations on arrays (both + Python and NumPy) +- :ghfile:`numba/np/arrayobj.py` - Array operations (both NumPy and + buffer protocol) +- :ghfile:`numba/np/linalg.py` - NumPy linear algebra operations +- :ghfile:`numba/np/npdatetime.py` - NumPy datetime operations +- :ghfile:`numba/np/npyfuncs.py` - Kernels used in generating some + NumPy ufuncs +- :ghfile:`numba/np/npyimpl.py` - Implementations of most NumPy ufuncs +- :ghfile:`numba/np/polynomial.py` - ``numpy.roots`` function +- :ghfile:`numba/np/ufunc_db.py` - Big table mapping types to ufunc + implementations + + +Ufunc Compiler and Runtime +'''''''''''''''''''''''''' + +- :ghfile:`numba/np/ufunc` - ufunc compiler 
implementation +- :ghfile:`numba/np/ufunc/_internal.{h,c}` - Python extension module with + helper functions that use CPython & NumPy C API +- :ghfile:`numba/np/ufunc/_ufunc.c` - Used by `_internal.c` +- :ghfile:`numba/np/ufunc/deviceufunc.py` - Custom ufunc dispatch for + non-CPU targets +- :ghfile:`numba/np/ufunc/gufunc_scheduler.{h,cpp}` - Schedule work chunks + to threads +- :ghfile:`numba/np/ufunc/dufunc.py` - Special ufunc that can compile new + implementations at call time +- :ghfile:`numba/np/ufunc/ufuncbuilder.py` - Top-level orchestration of + ufunc/gufunc compiler pipeline +- :ghfile:`numba/np/ufunc/sigparse.py` - Parser for generalized ufunc + indexing signatures +- :ghfile:`numba/np/ufunc/parallel.py` - Codegen for ``parallel`` target +- :ghfile:`numba/np/ufunc/array_exprs.py` - Rewrite pass for turning array + expressions in regular functions into ufuncs +- :ghfile:`numba/np/ufunc/wrappers.py` - Wrap scalar function kernel with + loops +- :ghfile:`numba/np/ufunc/workqueue.{h,c}` - Threading backend based on + pthreads/Windows threads and queues +- :ghfile:`numba/np/ufunc/omppool.cpp` - Threading backend based on OpenMP +- :ghfile:`numba/np/ufunc/tbbpool.cpp` - Threading backend based on TBB + + + +Unit Tests (CPU) +'''''''''''''''' + +CPU unit tests (GPU target unit tests are listed in later sections) + +- :ghfile:`runtests.py` - Convenience script that launches test runner and + turns on full compiler tracebacks +- :ghfile:`.coveragerc` - Coverage.py configuration +- :ghfile:`numba/runtests.py` - Entry point to unittest runner +- :ghfile:`numba/testing/_runtests.py` - Implementation of custom test runner + command line interface +- :ghfile:`numba/tests/test_*` - Test cases +- :ghfile:`numba/tests/*_usecases.py` - Python functions compiled by some + unit tests +- :ghfile:`numba/tests/support.py` - Helper functions for testing and + special TestCase implementation +- :ghfile:`numba/tests/dummy_module.py` - Module used in + ``test_dispatcher.py`` +- 
:ghfile:`numba/tests/npyufunc` - ufunc / gufunc compiler tests +- :ghfile:`numba/testing` - Support code for testing +- :ghfile:`numba/testing/loader.py` - Find tests on disk +- :ghfile:`numba/testing/notebook.py` - Support for testing notebooks +- :ghfile:`numba/testing/main.py` - Numba test runner + + +Command Line Utilities +'''''''''''''''''''''' +- :ghfile:`bin/numba` - Command line stub, delegates to main in + ``numba_entry.py`` +- :ghfile:`numba/misc/numba_entry.py` - Main function for ``numba`` command line + tool +- :ghfile:`numba/pycc` - Ahead of time compilation of functions to shared + library extension +- :ghfile:`numba/pycc/__init__.py` - Main function for ``pycc`` command line + tool +- :ghfile:`numba/pycc/cc.py` - User-facing API for tagging functions to + compile ahead of time +- :ghfile:`numba/pycc/compiler.py` - Compiler pipeline for creating + standalone Python extension modules +- :ghfile:`numba/pycc/llvm_types.py` - Aliases to LLVM data types used by + ``compiler.py`` +- :ghfile:`numba/pycc/pycc` - Stub to call main function. Is this still + used? +- :ghfile:`numba/pycc/modulemixin.c` - C file compiled into every compiled + extension. Pulls in C source from Numba core that is needed to make + extension standalone +- :ghfile:`numba/pycc/platform.py` - Portable interface to platform-specific + compiler toolchains +- :ghfile:`numba/pycc/decorators.py` - Deprecated decorators for tagging + functions to compile. Use ``cc.py`` instead. + + +CUDA GPU Target +''''''''''''''' + +Note that the CUDA target does reuse some parts of the CPU target. 
+ +- :ghfile:`numba/cuda/` - The implementation of the CUDA (NVIDIA GPU) target + and associated unit tests +- :ghfile:`numba/cuda/decorators.py` - Compiler decorators for CUDA kernels + and device functions +- :ghfile:`numba/cuda/dispatcher.py` - Dispatcher for CUDA JIT functions +- :ghfile:`numba/cuda/printimpl.py` - Special implementation of device printing +- :ghfile:`numba/cuda/libdevice.py` - Registers libdevice functions +- :ghfile:`numba/cuda/kernels/` - Custom kernels for reduction and transpose +- :ghfile:`numba/cuda/device_init.py` - Initializes the CUDA target when + imported +- :ghfile:`numba/cuda/compiler.py` - Compiler pipeline for CUDA target +- :ghfile:`numba/cuda/intrinsic_wrapper.py` - CUDA device intrinsics + (shuffle, ballot, etc) +- :ghfile:`numba/cuda/initialize.py` - Deferred initialization of the CUDA + device and subsystem. Called only when user imports ``numba.cuda`` +- :ghfile:`numba/cuda/simulator_init.py` - Initializes the CUDA simulator + subsystem (only when user requests it with env var) +- :ghfile:`numba/cuda/random.py` - Implementation of random number generator +- :ghfile:`numba/cuda/api.py` - User facing APIs imported into ``numba.cuda.*`` +- :ghfile:`numba/cuda/stubs.py` - Python placeholders for functions that + only can be used in GPU device code +- :ghfile:`numba/cuda/simulator/` - Simulate execution of CUDA kernels in + Python interpreter +- :ghfile:`numba/cuda/vectorizers.py` - Subclasses of ufunc/gufunc compilers + for CUDA +- :ghfile:`numba/cuda/args.py` - Management of kernel arguments, including + host<->device transfers +- :ghfile:`numba/cuda/target.py` - Typing and target contexts for GPU +- :ghfile:`numba/cuda/cudamath.py` - Type signatures for math functions in + CUDA Python +- :ghfile:`numba/cuda/errors.py` - Validation of kernel launch configuration +- :ghfile:`numba/cuda/nvvmutils.py` - Helper functions for generating + NVVM-specific IR +- :ghfile:`numba/cuda/testing.py` - Support code for creating CUDA unit + 
tests and capturing standard out +- :ghfile:`numba/cuda/cudadecl.py` - Type signatures of CUDA API (threadIdx, + blockIdx, atomics) in Python on GPU +- :ghfile:`numba/cuda/cudaimpl.py` - Implementations of CUDA API functions + on GPU +- :ghfile:`numba/cuda/codegen.py` - Code generator object for CUDA target +- :ghfile:`numba/cuda/cudadrv/` - Wrapper around CUDA driver API +- :ghfile:`numba/cuda/tests/` - CUDA unit tests, skipped when CUDA is not + detected +- :ghfile:`numba/cuda/tests/cudasim/` - Tests of CUDA simulator +- :ghfile:`numba/cuda/tests/nocuda/` - Tests for NVVM functionality when + CUDA not present +- :ghfile:`numba/cuda/tests/cudapy/` - Tests of compiling Python functions + for GPU +- :ghfile:`numba/cuda/tests/cudadrv/` - Tests of Python wrapper around CUDA + API + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/rewrites.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/rewrites.rst new file mode 100644 index 0000000000000000000000000000000000000000..ff162c6118567d895d374fc0357cc0e43be16ab8 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/rewrites.rst @@ -0,0 +1,397 @@ +===================================================== +Using the Numba Rewrite Pass for Fun and Optimization +===================================================== + +Overview +======== + +This section introduces intermediate representation (IR) rewrites, and +how they can be used to implement optimizations. + +As discussed earlier in ":ref:`rewrite-typed-ir`", rewriting the Numba +IR allows us to perform optimizations that would be much more +difficult to perform at the lower LLVM level. Similar to the Numba +type and lowering subsystems, the rewrite subsystem is user +extensible. This extensibility affords Numba the possibility of +supporting a wide variety of domain-specific optimizations (DSO's). 
+ +The remaining subsections detail the mechanics of implementing a +rewrite, registering a rewrite with the rewrite registry, and provide +examples of adding new rewrites, as well as internals of the array +expression optimization pass. We conclude by reviewing some use cases +exposed in the examples, as well as reviewing any points where +developers should take care. + + +Rewriting Passes +================ + +Rewriting passes have a simple :func:`~Rewrite.match` and +:func:`~Rewrite.apply` interface. The division between matching and +rewriting follows how one would define a term rewrite in a declarative +domain-specific language (DSL). In such DSL's, one may write a +rewrite as follows:: + + <match> => <replacement> + + +The ``<match>`` and ``<replacement>`` symbols represent IR term +expressions, where the left-hand side presents a pattern to match, and +the right-hand side an IR term constructor to build upon matching. +Whenever the rewrite matches an IR pattern, any free variables in the +left-hand side are bound within a custom environment. When applied, +the rewrite uses the pattern matching environment to bind any free +variables in the right-hand side. + +As Python is not commonly used in a declarative capacity, Numba uses +object state to handle the transfer of information between the +matching and application steps. + + +The :class:`Rewrite` Base Class +------------------------------- + +.. class:: Rewrite + + The :class:`Rewrite` class simply defines an abstract base class + for Numba rewrites. Developers should define rewrites as + subclasses of this base type, overloading the + :func:`~Rewrite.match` and :func:`~Rewrite.apply` methods. + + .. attribute:: pipeline + + The pipeline attribute contains the + :class:`numba.compiler.Pipeline` instance that is currently + compiling the function under consideration for rewriting. + + .. method:: __init__(self, pipeline, *args, **kws) + + The base constructor for rewrites simply stashes its arguments + into attributes of the same name. 
Unless being used in + debugging or testing, rewrites should only be constructed by + the :class:`RewriteRegistry` in the + :func:`RewriteRegistry.apply` method, and the construction + interface should remain stable (though the pipeline will + commonly contain just about everything there is to know). + + .. method:: match(self, func_ir, block, typemap, callmap) + + The :func:`~Rewrite.match` method takes four arguments other + than *self*: + + * *func_ir*: This is an instance of :class:`numba.ir.FunctionIR` for the + function being rewritten. + + * *block*: This is an instance of :class:`numba.ir.Block`. The + matching method should iterate over the instructions contained + in the :attr:`numba.ir.Block.body` member. + + * *typemap*: This is a Python :class:`dict` instance mapping + from symbol names in the IR, represented as strings, to Numba + types. + + * *callmap*: This is another :class:`dict` instance mapping from + calls, represented as :class:`numba.ir.Expr` instances, to + their corresponding call site type signatures, represented as + a :class:`numba.typing.templates.Signature` instance. + + The :func:`~Rewrite.match` method should return a :class:`bool` + result. A :obj:`True` result should indicate that one or more + matches were found, and the :func:`~Rewrite.apply` method will + return a new replacement :class:`numba.ir.Block` instance. A + :obj:`False` result should indicate that no matches were found, and + subsequent calls to :func:`~Rewrite.apply` will return undefined + or invalid results. + + .. method:: apply(self) + + The :func:`~Rewrite.apply` method should only be invoked + following a successful call to :func:`~Rewrite.match`. This + method takes no additional parameters other than *self*, and + should return a replacement :class:`numba.ir.Block` instance. + + As mentioned above, the behavior of calling + :func:`~Rewrite.apply` is undefined unless + :func:`~Rewrite.match` has already been called and returned + :obj:`True`. 
+ + +Subclassing :class:`Rewrite` +---------------------------- + +Before going into the expectations for the overloaded methods any +:class:`Rewrite` subclass must have, let's step back a minute to +review what is taking place here. By providing an extensible +compiler, Numba opens itself to user-defined code generators which may +be incomplete, or worse, incorrect. When a code generator goes awry, +it can cause abnormal program behavior or early termination. +User-defined rewrites add a new level of complexity because they must +not only generate correct code, but the code they generate should +ensure that the compiler does not get stuck in a match/apply loop. +Non-termination by the compiler will directly lead to non-termination +of user function calls. + +There are several ways to help ensure that a rewrite terminates: + +* *Typing*: A rewrite should generally attempt to decompose composite + types, and avoid composing new types. If the rewrite is matching a + specific type, changing expression types to a lower-level type will + ensure they will no longer match after the rewrite is applied. + +* *Special instructions*: A rewrite may synthesize custom operators or + use special functions in the target IR. This technique again + generates code that is no longer within the domain of the original + match, and the rewrite will terminate. + +In the ":ref:`case-study-array-expressions`" subsection, below, we'll +see how the array expression rewriter uses both of these techniques. + + +Overloading :func:`Rewrite.match` +--------------------------------- + +Every rewrite developer should seek to have their implementation of +:func:`~Rewrite.match` return a :obj:`False` value as quickly as +possible. Numba is a just-in-time compiler, and adding compilation +time ultimately adds to the user's run time. 
When a rewrite returns +:obj:`False` for a given block, the registry will no longer process that +block with that rewrite, and the compiler is that much closer to +proceeding to lowering. + +This need for timeliness has to be balanced against collecting the +necessary information to make a match for a rewrite. Rewrite +developers should be comfortable adding dynamic attributes to their +subclasses, and then having these new attributes guide construction of +the replacement basic block. + + +Overloading :func:`Rewrite.apply` +----------------------------------- + +The :func:`~Rewrite.apply` method should return a replacement +:class:`numba.ir.Block` instance to replace the basic block that +contained a match for the rewrite. As mentioned above, the IR built +by :func:`~Rewrite.apply` methods should preserve the semantics of the +user's code, but also seek to avoid generating another match for the +same rewrite or set of rewrites. + + +The Rewrite Registry +==================== + +When you want to include a rewrite in the rewrite pass, you should +register it with the rewrite registry. The :mod:`numba.rewrites` +module provides both the abstract base class and a class decorator for +hooking into the Numba rewrite subsystem. The following illustrates a +stub definition of a new rewrite:: + + from numba import rewrites + + @rewrites.register_rewrite + class MyRewrite(rewrites.Rewrite): + + def match(self, func_ir, block, typemap, calltypes): + raise NotImplementedError("FIXME") + + def apply(self): + raise NotImplementedError("FIXME") + + +Developers should note that using the class decorator as shown above +will register a rewrite at import time. It is the developer's +responsibility to ensure their extensions are loaded before +compilation starts. + + +.. _`case-study-array-expressions`: + +Case study: Array Expressions +============================= + +This subsection looks at the array expression rewriter in more depth. 
+The array expression rewriter, and most of its support functionality, +are found in the :mod:`numba.npyufunc.array_exprs` module. The +rewriting pass itself is implemented in the :class:`RewriteArrayExprs` +class. In addition to the rewriter, the +:mod:`~numba.npyufunc.array_exprs` module includes a function for +lowering array expressions, +:func:`~numba.npyufunc.array_exprs._lower_array_expr`. The overall +optimization process is as follows: + +* :func:`RewriteArrayExprs.match`: The rewrite pass looks for two or + more array operations that form an array expression. + +* :func:`RewriteArrayExprs.apply`: Once an array expression is found, + the rewriter replaces the individual array operations with a new + kind of IR expression, the ``arrayexpr``. + +* :func:`numba.npyufunc.array_exprs._lower_array_expr`: During + lowering, the code generator calls + :func:`~numba.npyufunc.array_exprs._lower_array_expr` whenever it + finds an ``arrayexpr`` IR expression. + +More details on each step of the optimization are given below. + + +The :func:`RewriteArrayExprs.match` method +------------------------------------------ + +The array expression optimization pass starts by looking for array +operations, including calls to supported :class:`~numpy.ufunc`\'s and +user-defined :class:`~numba.DUFunc`\'s. Numba IR follows the +conventions of a static single assignment (SSA) language, meaning that +the search for array operators begins with looking for assignment +instructions. + +When the rewriting pass calls the :func:`RewriteArrayExprs.match` +method, it first checks to see if it can trivially reject the basic +block. If the method determines the block to be a candidate for +matching, it sets up the following state variables in the rewrite +object: + +* *crnt_block*: The current basic block being matched. + +* *typemap*: The *typemap* for the function being matched. + +* *matches*: A list of variable names that reference array expressions. 
+ +* *array_assigns*: A map from assignment variable names to the actual + assignment instructions that define the given variable. + +* *const_assigns*: A map from assignment variable names to the + constant valued expression that defines the constant variable. + +At this point, the match method iterates over the assignment +instructions in the input basic block. For each assignment +instruction, the matcher looks for one of two things: + +* Array operations: If the right-hand side of the assignment + instruction is an expression, and the result of that expression is + an array type, the matcher checks to see if the expression is either + a known array operation, or a call to a universal function. If an + array operator is found, the matcher stores the left-hand variable + name and the whole instruction in the *array_assigns* member. + Finally, the matcher tests to see if any operands of the array + operation have also been identified as targets of other array + operations. If one or more operands are also targets of array + operations, then the matcher will also append the left-hand side + variable name to the *matches* member. + +* Constants: Constants (even scalars) can be operands to array + operations. Without worrying about the constant being a part of an + array expression, the matcher stores constant names and values in + the *const_assigns* member. + +The end of the matching method simply checks for a non-empty *matches* +list, returning :obj:`True` if there were one or more matches, and +:obj:`False` when *matches* is empty. + + +The :func:`RewriteArrayExprs.apply` method +------------------------------------------ + +When one or more matching array expressions are found by +:func:`RewriteArrayExprs.match`, the rewriting pass will call +:func:`RewriteArrayExprs.apply`. The apply method works in two +passes. The first pass iterates over the matches found, and builds a +map from instructions in the old basic block to new instructions in +the new basic block. 
The second pass iterates over the instructions +in the old basic block, copying instructions that are not changed by +the rewrite, and replacing or deleting instructions that were +identified by the first pass. + +The :func:`RewriteArrayExprs._handle_matches` implements the first +pass of the code generation portion of the rewrite. For each match, +this method builds a special IR expression that contains an expression +tree for the array expression. To compute the leaves of the +expression tree, the :func:`~RewriteArrayExprs._handle_matches` method +iterates over the operands of the identified root operation. If the +operand is another array operation, it is translated into an +expression sub-tree. If the operand is a constant, +:func:`~RewriteArrayExprs._handle_matches` copies the constant value. +Otherwise, the operand is marked as being used by an array expression. +As the method builds array expression nodes, it builds a map from old +instructions to new instructions (*replace_map*), as well as sets of +variables that may have moved (*used_vars*), and variables that should +be removed altogether (*dead_vars*). These three data structures are +returned back to the calling :func:`RewriteArrayExprs.apply` method. + +The remaining part of the :func:`RewriteArrayExprs.apply` method +iterates over the instructions in the old basic block. For each +instruction, this method either replaces, deletes, or duplicates that +instruction based on the results of +:func:`RewriteArrayExprs._handle_matches`. The following list +describes how the optimization handles individual instructions: + +* When an instruction is an assignment, + :func:`~RewriteArrayExprs.apply` checks to see if it is in the + replacement instruction map. When an assignment instruction is found + in the instruction map, :func:`~RewriteArrayExprs.apply` must then + check to see if the replacement instruction is also in the replacement + map. 
The optimizer continues this check until it either arrives at a + :obj:`None` value or an instruction that isn't in the replacement map. + Instructions that have a replacement that is :obj:`None` are deleted. + Instructions that have a non-:obj:`None` replacement are replaced. + Assignment instructions not in the replacement map are appended to the + new basic block with no changes made. + +* When the instruction is a delete instruction, the rewrite checks to + see if it deletes a variable that may still be used by a later array + expression, or if it deletes a dead variable. Delete instructions for + used variables are added to a map of deferred delete instructions that + :func:`~RewriteArrayExprs.apply` uses to move them past any uses of + that variable. The loop copies delete instructions for non-dead + variables, and ignores delete instructions for dead variables + (effectively removing them from the basic block). + +* All other instructions are appended to the new basic block. + +Finally, the :func:`~RewriteArrayExprs.apply` method returns the new +basic block for lowering. + + +The :func:`~numba.npyufunc.array_exprs._lower_array_expr` function +------------------------------------------------------------------ + +If we left things at just the rewrite, then the lowering stage of the +compiler would fail, complaining it doesn't know how to lower +``arrayexpr`` operations. We start by hooking a lowering function +into the target context whenever the :class:`RewriteArrayExprs` class +is instantiated by the compiler. This hook causes the lowering pass to +call :func:`~numba.npyufunc.array_exprs._lower_array_expr` whenever it +encounters an ``arrayexpr`` operator. + +This function has two steps: + +* Synthesize a Python function that implements the array expression: + This new Python function essentially behaves like a Numpy + :class:`~numpy.ufunc`, returning the result of the expression on + scalar values in the broadcasted array arguments. 
The lowering + function accomplishes this by translating from the array expression + tree into a Python AST. + +* Compile the synthetic Python function into a kernel: At this point, + the lowering function relies on existing code for lowering ufunc and + DUFunc kernels, calling + :func:`numba.targets.numpyimpl.numpy_ufunc_kernel` after defining + how to lower calls to the synthetic function. + +The end result is similar to loop lifting in Numba's object mode. + + +Conclusions and Caveats +======================= + +We have seen how to implement rewrites in Numba, starting with the +interface, and ending with an actual optimization. The key points of +this section are: + +* When writing a good plug-in, the matcher should try to get a + go/no-go result as soon as possible. + +* The rewrite application portion can be more computationally + expensive, but should still generate code that won't cause infinite + loops in the compiler. + +* We use object state to communicate any results of matching to the + rewrite application pass. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/stencil.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/stencil.rst new file mode 100644 index 0000000000000000000000000000000000000000..f27447a7fb898f45741691f117a880924b2c7181 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/stencil.rst @@ -0,0 +1,170 @@ +.. Copyright (c) 2017 Intel Corporation + SPDX-License-Identifier: BSD-2-Clause + +.. _arch-stencil: + +================= +Notes on stencils +================= + +Numba provides the :ref:`@stencil decorator <numba-stencil>` to +represent stencil computations. This document explains how this +feature is implemented in the several different modes available in +Numba. Currently, calls to the stencil from non-jitted code are +supported as well as calls from jitted code, either with or without +the :ref:`parallel=True <parallel_jit_option>` option. 
+ +The stencil decorator +===================== + +The stencil decorator itself just returns a ``StencilFunc`` object. +This object encapsulates the original stencil kernel function +as specified in the program and the options passed to the +stencil decorator. Also of note is that after the first compilation +of the stencil, the computed neighborhood of the stencil is +stored in the ``StencilFunc`` object in the ``neighborhood`` attribute. + +Handling the three modes +======================== + +As mentioned above, Numba supports the calling of stencils +from inside or outside a ``@jit`` compiled function, with or +without the :ref:`parallel=True <parallel_jit_option>` option. + +Outside jit context +------------------- + +``StencilFunc`` overrides the ``__call__`` method so that calls +to ``StencilFunc`` objects execute the stencil:: + + def __call__(self, *args, **kwargs): + result = kwargs.get('out') + + new_stencil_func = self._stencil_wrapper(result, None, *args) + + if result is None: + return new_stencil_func.entry_point(*args) + else: + return new_stencil_func.entry_point(*args, result) + +First, the presence of the optional :ref:`out <stencil-function-out>` +parameter is checked. If it is present then the output array is +stored in ``result``. Then, the call to ``_stencil_wrapper`` +generates the stencil function given the result and argument types +and finally the generated stencil function is executed and its result +returned. + +Jit without ``parallel=True`` +----------------------------- + +When constructed, a ``StencilFunc`` inserts itself into the typing +context's set of user functions and provides the ``_type_me`` +callback. In this way, the standard Numba compiler is able to +determine the output type and signature of a ``StencilFunc``. +Each ``StencilFunc`` maintains a cache of previously seen combinations +of input argument types and keyword types. If previously seen, +the ``StencilFunc`` returns the computed signature. 
If not previously +computed, the ``StencilFunc`` computes the return type of the stencil +by running the Numba compiler frontend on the stencil kernel and +then performing type inference on the :term:`Numba IR` (IR) to get the scalar +return type of the kernel. From that, a Numpy array type is constructed +whose element type matches that scalar return type. + +After computing the signature of the stencil for a previously +unseen combination of input and keyword types, the ``StencilFunc`` +then :ref:`creates the stencil function <arch-stencil-create-function>` itself. +``StencilFunc`` then installs the new stencil function's definition +in the target context so that jitted code is able to call it. + +Thus, in this mode, the generated stencil function is a stand-alone +function called like a normal function from within jitted code. + +Jit with ``parallel=True`` +-------------------------- + +When calling a ``StencilFunc`` from a jitted context with ``parallel=True``, +a separate stencil function as generated by :ref:`arch-stencil-create-function` +is not used. Instead, `parfors` (:ref:`parallel-accelerator`) are +created within the current function that implements the stencil. +This code again starts with the stencil kernel and does a similar kernel +size computation but then rather than standard Python looping syntax, +corresponding `parfors` are created so that the execution of the stencil +will take place in parallel. + +The stencil to `parfor` translations can also be selectively disabled +by setting ``parallel={'stencil': False}``, among other sub-options +described in :ref:`parallel-accelerator`. + +.. 
_arch-stencil-create-function: + +Creating the stencil function +============================= + +Conceptually, a stencil function is created from the user-specified +stencil kernel by adding looping code around the kernel, transforming +the relative kernel indices into absolute array indices based on the +loop indices, and replacing the kernel's ``return`` statement with +a statement to assign the computed value into the output array. + +To accomplish this transformation, first, a copy of the stencil +kernel IR is created so that subsequent modifications of the IR +for different stencil signatures will not affect each other. + +Then, an approach similar to how GUFunc's are created for `parfors` +is employed. In a text buffer, a Python function is created with +a unique name. The input array parameter is added to the function +definition and if the ``out`` argument type is present then an +``out`` parameter is added to the stencil function definition. +If the ``out`` argument is not present then first an output array +is created with ``numpy.zeros`` having the same shape as the +input array. + +The kernel is then analyzed to compute the stencil size and the +shape of the boundary (or the ``neighborhood`` stencil decorator +argument is used for this purpose if present). +Then, one ``for`` loop for each dimension of the input array is +added to the stencil function definition. The range of each +loop is controlled by the stencil kernel size previously computed +so that the boundary of the output image is not modified but instead +left as is. The body of the innermost ``for`` loop is a single +``sentinel`` statement that is easily recognized in the IR. +A call to ``exec`` with the text buffer is used to force the +stencil function into existence and an ``eval`` is used to get +access to the corresponding function on which ``run_frontend`` is +used to get the stencil function IR. 
+ +Various renaming and relabeling is performed on the stencil function +IR and the kernel IR so that the two can be combined without conflict. +The relative indices in the kernel IR (i.e., ``getitem`` calls) are +replaced with expressions where the corresponding loop index variables +are added to the relative indices. The ``return`` statement in the +kernel IR is replaced with a ``setitem`` for the corresponding element +in the output array. +The stencil function IR is then scanned for the sentinel and the +sentinel replaced with the modified kernel IR. + +Next, ``compile_ir`` is used to compile the combined stencil function +IR. The resulting compile result is cached in the ``StencilFunc`` so that +other calls to the same stencil do not need to undertake this process +again. + +Exceptions raised +================= + +Various checks are performed during stencil compilation to make sure +that user-specified options do not conflict with each other or with +other runtime parameters. For example, if the user has manually +specified a ``neighborhood`` to the stencil decorator, the length of +that neighborhood must match the dimensionality of the input array. +If this is not the case, a ``ValueError`` is raised. + +If the neighborhood has not been specified then it must be inferred +and a requirement to infer the kernel is that all indices are constant +integers. If they are not, a ``ValueError`` is raised indicating that +kernel indices may not be non-constant. + +Finally, the stencil implementation detects the output array type +by running Numba type inference on the stencil kernel. If the +return type of this kernel does not match the type of the value +passed to the ``cval`` stencil decorator option then a ``ValueError`` +is raised. 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/target_extension.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/target_extension.rst new file mode 100644 index 0000000000000000000000000000000000000000..bebd574ad27754f12af8cf4b4c0a411b73f9ad41 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/target_extension.rst @@ -0,0 +1,61 @@ +========================== +Notes on Target Extensions +========================== + +.. warning:: All features and APIs described in this page are in-development and + may change at any time without deprecation notices being issued. + + +Inheriting compiler flags from the caller +========================================= + +Compiler flags, i.e. options such as ``fastmath``, ``nrt`` in +``@jit(nrt=True, fastmath=True)`` are specified per-function but their +effects are not well-defined---some flags affect the entire callgraph, some +flags affect only the current function. Sometimes it is necessary for callees +to inherit flags from the caller; for example the ``fastmath`` flag should be +infectious. + +To address the problem, the following are needed: + +1. Better definitions for the semantics of compiler flags. Preferably, all flags should + limit their effect to the current function. (TODO) +2. Allow compiler flags to be inherited from the caller. (Done) +3. Consider compiler flags in function resolution. (TODO) + +:class:`numba.core.targetconfig.ConfigStack` is used to propagate the compiler flags +throughout the compiler. At the start of the compilation, the flags are pushed +into the ``ConfigStack``, which maintains a thread-local stack for the +compilation. Thus, callees can check the flags in the caller. + +.. autoclass:: numba.core.targetconfig.ConfigStack + :members: + +Compiler flags +-------------- + +`Compiler flags`_ are defined as a subclass of ``TargetConfig``: + +.. 
_Compiler flags: https://github.com/numba/numba/blob/7e8538140ce3f8d01a5273a39233b5481d8b20b1/numba/core/compiler.py#L39 + +.. autoclass:: numba.core.targetconfig.TargetConfig + :members: + + +These are internal compiler flags and they are different from the user-facing +options used in the jit decorators. + +Internally, `the user-facing options are mapped to the internal compiler flags `_ +by :class:`numba.core.options.TargetOptions`. Each target can override the +default compiler flags and control the flag inheritance in +``TargetOptions.finalize``. `The CPU target overrides it. +`_ + +.. autoclass:: numba.core.options.TargetOptions + :members: finalize + + +In :meth:`numba.core.options.TargetOptions.finalize`, +use :meth:`numba.core.targetconfig.TargetConfig.inherit_if_not_set` +to request a compiler flag from the caller if it is not set for the current +function. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/threading_implementation.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/threading_implementation.rst new file mode 100644 index 0000000000000000000000000000000000000000..487bc2894f62335c6abd0ca8ee2eb25c0b012df5 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/developer/threading_implementation.rst @@ -0,0 +1,249 @@ +========================================= +Notes on Numba's threading implementation +========================================= + +The execution of the work presented by the Numba ``parallel`` targets is +undertaken by the Numba threading layer. Practically, the "threading layer" +is a Numba built-in library that can perform the required concurrent execution. +At the time of writing there are three threading layers available, each +implemented via a different lower level native threading library. 
More +information on the threading layers and appropriate selection of a threading +layer for a given application/system can be found in the +:ref:`threading layer documentation `. + +The pertinent information to note for the following sections is that the +function in the threading library that performs the parallel execution is the +``parallel_for`` function. The job of this function is to both orchestrate and +execute the parallel tasks. + +The relevant source files referenced in this document are + +- ``numba/np/ufunc/tbbpool.cpp`` +- ``numba/np/ufunc/omppool.cpp`` +- ``numba/np/ufunc/workqueue.c`` + + These files contain the TBB, OpenMP, and workqueue threadpool + implementations, respectively. Each includes the functions + ``set_num_threads()``, ``get_num_threads()``, and ``get_thread_id()``, as + well as the relevant logic for thread masking in their respective + schedulers. Note that the basic thread local variable logic is duplicated in + each of these files, and not shared between them. + +- ``numba/np/ufunc/parallel.py`` + + This file contains the Python and JIT compatible wrappers for + ``set_num_threads()``, ``get_num_threads()``, and ``get_thread_id()``, as + well as the code that loads the above libraries into Python and launches the + threadpool. + +- ``numba/parfors/parfor_lowering.py`` + + This file contains the main logic for generating code for the parallel + backend. The thread mask is accessed in this file in the code that generates + scheduler code, and passed to the relevant backend scheduler function (see + below). + +Thread masking +-------------- + +As part of its design, Numba never launches new threads beyond the threads +that are launched initially with ``numba.np.ufunc.parallel._launch_threads()`` +when the first parallel execution is run. This is due to the way threads were +already implemented in Numba prior to thread masking being implemented. 
This +restriction was kept to keep the design simple, although it could be removed +in the future. Consequently, it's possible to programmatically set the number +of threads, but only to less than or equal to the total number that have +already been launched. This is done by "masking" out unused threads, causing +them to do no work. For example, on a 16 core machine, if the user were to +call ``set_num_threads(4)``, Numba would always have 16 threads present, but +12 of them would sit idle for parallel computations. A further call to +``set_num_threads(16)`` would cause those same threads to do work in later +computations. + +:ref:`Thread masking <numba-threading-layer-thread-masking>` was added to make +it possible for a user to programmatically alter the number of threads +performing work in the threading layer. Thread masking proved challenging to +implement as it required the development of a programming model that is suitable +for users, easy to reason about, and could be implemented safely, with +consistent behavior across the various threading layers. + +Programming model +~~~~~~~~~~~~~~~~~ + +The programming model chosen is similar to that found in OpenMP. The reasons +for this choice were that it is familiar to a lot of users, restricted in +scope and also simple. The number of threads in use is specified by calling +``set_num_threads`` and the number of threads in use can be queried by calling +``get_num_threads``. These two functions are synonymous with their OpenMP +counterparts (with the above restriction that the mask must be less than or +equal to the number of launched threads). The execution semantics are also +similar to OpenMP in that once a parallel region is launched, altering the +thread mask has no impact on the currently executing region, but will have an +impact on parallel regions executed subsequently. 
+ +The Implementation +~~~~~~~~~~~~~~~~~~ + +So as to place no further restrictions on user code other than those that +already existed in the threading layer libraries, careful consideration of the +design of thread masking was required. The "thread mask" cannot be stored in a +global value as concurrent use of the threading layer may result in classic +forms of race conditions on the value itself. Numerous designs were discussed +involving various types of mutex on such a global value, all of which were +eventually broken through thought experiment alone. It eventually transpired +that, following some OpenMP implementations, the "thread mask" is best +implemented as a ``thread local``. This means each thread that executes a Numba +parallel function will have a thread local storage (TLS) slot that contains the +value of the thread mask to use when scheduling threads in the ``parallel_for`` +function. + +The above notion of TLS use for a thread mask is relatively easy to implement, +``get_num_threads`` and ``set_num_threads`` simply need to address the TLS slot +in a given threading layer. This also means that the execution schedule for a +parallel region can be derived from a run time call to ``get_num_threads``. This +is achieved via a well known and relatively easy to implement pattern of a ``C`` +library function registration and wrapping it in the internal Numba +implementation. + +In addition to satisfying the original upfront thread masking requirements, a +few more complicated scenarios needed consideration as follows. + +Nested parallelism +****************** + +In all threading layers a "main thread" will invoke the ``parallel_for`` +function and then in the parallel region, depending on the threading layer, +some number of additional threads will assist in doing the actual work. +If the work contains a call to another parallel function (i.e. 
nested +parallelism) it is necessary for the thread making the call to know what the +"thread mask" of the main thread is so that it can propagate it into the +``parallel_for`` call it makes when executing the nested parallel function. +The implementation of this behavior is threading layer specific but the general +principle is for the "main thread" to always "send" the value of the thread mask +from its TLS slot to all threads in the threading layer that are active in the +parallel region. These active threads then update their TLS slots with this +value prior to performing any work. The net result of this implementation detail +is that: + +* thread masks correctly propagate into nested functions +* it's still possible for each thread in a parallel region to safely have a + different mask with which to call nested functions, if it's not set explicitly + then the inherited mask from the "main thread" is used +* threading layers which have dynamic scheduling with threads potentially + joining and leaving the active pool during a ``parallel_for`` execution are + successfully accommodated +* any "main thread" thread mask is entirely decoupled from the in-flux nature + of the thread masks of the threads in the active thread pool + +Python threads independently invoking parallel functions +******************************************************** + +The threading layer launch sequence is heavily guarded to ensure that the +launch is both thread and process safe and run once per process. In a system +with numerous Python ``threading`` module threads all using Numba, the first +thread through the launch sequence will get its thread mask set appropriately, +but no further threads can run the launch sequence. This means that other +threads will need their initial thread mask set some other way. This is +achieved when ``get_num_threads`` is called and no thread mask is present, in +this case the thread mask will be set to the default. 
In the implementation, +"no thread mask is present" is represented by the value ``-1`` and the "default +thread mask" (unset) is represented by the value ``0``. The implementation also +immediately calls ``set_num_threads(NUMBA_NUM_THREADS)`` after doing this, so +if either ``-1`` or ``0`` is encountered as a result from ``get_num_threads()`` it +indicates a bug in the above processes. + +OS ``fork()`` calls +******************* + +The use of TLS was also in part driven by Linux (the most popular +platform for Numba use by far) having a ``fork(2, 3P)`` call that will do TLS +propagation into child processes, see ``clone(2)``\ 's ``CLONE_SETTLS``. + +Thread ID +********* + +A private ``get_thread_id()`` function was added to each threading backend, +which returns a unique ID for each thread. This can be accessed from Python by +``numba.np.ufunc.parallel._get_thread_id()`` (it can also be used inside a +JIT compiled function). The thread ID function is useful for testing that the +thread masking behavior is correct, but it should not be used outside of the +tests. For example, one can call ``set_num_threads(4)`` and then collect all +unique ``_get_thread_id()``\ s in a parallel region to verify that only 4 +threads are run. + +Caveats +~~~~~~~ + +Some caveats to be aware of when testing thread masking: + +- The TBB backend may choose to schedule fewer than the given mask number of + threads. Thus a test such as the one described above may return fewer than 4 + unique threads. + +- The workqueue backend is not threadsafe, so attempts to do multithreading + nested parallelism with it may result in deadlocks or other undefined + behavior. The workqueue backend will raise a SIGABRT signal if it detects + nested parallelism. + +- Certain backends may reuse the main thread for computation, but this + behavior shouldn't be relied upon (for instance, if propagating exceptions). 
+ +Use in Code Generation +~~~~~~~~~~~~~~~~~~~~~~ + +The general pattern for using ``get_num_threads`` in code generation is + +.. code:: python + + from llvmlite import ir as llvmir + + get_num_threads = cgutils.get_or_insert_function(builder.module, + llvmir.FunctionType(llvmir.IntType(types.intp.bitwidth), []), + name="get_num_threads") + + num_threads = builder.call(get_num_threads, []) + + with cgutils.if_unlikely(builder, builder.icmp_signed('<=', num_threads, + num_threads.type(0))): + cgutils.printf(builder, "num_threads: %d\n", num_threads) + context.call_conv.return_user_exc(builder, RuntimeError, + ("Invalid number of threads. " + "This likely indicates a bug in Numba.",)) + + # Pass num_threads through to the appropriate backend function here + +See the code in ``numba/parfors/parfor_lowering.py``. + +The guard against ``num_threads`` being <= 0 is not strictly necessary, but it +can protect against accidentally incorrect behavior in case the thread masking +logic contains a bug. + +The ``num_threads`` variable should be passed through to the appropriate +backend function, such as ``do_scheduling`` or ``parallel_for``. If it's used +in some way other than passing it through to the backend function, the above +considerations should be taken into account to ensure the use of the +``num_threads`` variable is safe. It would probably be better to keep such +logic in the threading backends, rather than trying to do it in code +generation. + +.. _chunk-details-label: + +Parallel Chunksize Details +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There are some cases in which the actual parallel work chunk sizes may differ +from the +chunk size that is requested through :func:`numba.set_parallel_chunksize`. +First, if the number of required chunks based on the specified chunk size +is less than the number of configured threads then Numba will use all of the configured +threads to execute the parallel region. 
In this case, the actual chunk size will be +less than the requested chunk size. Second, due to truncation, in cases where the +iteration count is slightly less than a multiple of the chunk size +(e.g., 14 iterations and a specified chunk size of 5), the actual chunk size will be +larger than the specified chunk size. As in the given example, the number of chunks +would be 2 and the actual chunk size would be 7 (i.e. 14 / 2). Lastly, since Numba +divides an N-dimensional iteration space into N-dimensional (hyper)rectangular chunks, +it may be the case there are not N integer factors whose product is equal to the chunk +size. In this case, some chunks will have an area/volume larger than the chunk size +whereas others will be less than the specified chunk size. + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/entrypoints.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/entrypoints.rst new file mode 100644 index 0000000000000000000000000000000000000000..143c2e090156d2d25e06e7607aa92366f2be20e1 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/entrypoints.rst @@ -0,0 +1,65 @@ +Registering Extensions with Entry Points +======================================== + +Often, third party packages will have a user-facing API as well as define +extensions to the Numba compiler. In those situations, the new types and +overloads can be registered with Numba when the package is imported by the user. +However, there are situations where a Numba extension would not normally be +imported directly by the user, but must still be registered with the Numba +compiler. An example of this is the `numba-scipy +`_ package, which adds support for some +SciPy functions to Numba. The end user does not need to ``import +numba_scipy`` to enable compiler support for SciPy, the extension only needs +to be installed in the Python environment. 
+ +Numba discovers extensions using the `entry points +`_ +feature of ``setuptools``. This allows a Python package to register an +initializer function that will be called before ``numba`` compiles for the +first time. The delay ensures that the cost of importing extensions is +deferred until it is necessary. + + +Adding Support for the "Init" Entry Point +----------------------------------------- + +A package can register an initialization function with Numba by adding the +``entry_points`` argument to the ``setup()`` function call in ``setup.py``: + +.. code-block:: python + + setup( + ..., + entry_points={ + "numba_extensions": [ + "init = numba_scipy:_init_extension", + ], + }, + ... + ) + +Numba currently only looks for the ``init`` entry point in the +``numba_extensions`` group. The entry point should be a function (any name, +as long as it matches what is listed in ``setup.py``) that takes no arguments, +and the return value is ignored. This function should register types, +overloads, or call other Numba extension APIs. The order of initialization of +extensions is undefined. + +Testing your Entry Point +------------------------ + +Numba loads all entry points when the first function is compiled. To test your +entry point, it is not sufficient to just ``import numba``; you have to define +and run a small function, like this: + +.. code-block:: python + + import numba; numba.njit(lambda x: x + 1)(123) + +It is not necessary to import your module: entry points are identified by the +``entry_points.txt`` file in your library's ``*.egg-info`` directory. + +The ``setup.py build`` command does not create eggs, but ``setup.py sdist`` +(for testing in a local directory) and ``setup.py install`` do. All entry points +registered in eggs that are on the Python path are loaded. Be sure to check for +stale ``entry_points.txt`` when debugging. 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/high-level.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/high-level.rst new file mode 100644 index 0000000000000000000000000000000000000000..4e4877336f3df16570e0c9374132a8f8eeaf3d13 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/high-level.rst @@ -0,0 +1,254 @@ + +.. _high-level-extending: + +High-level extension API +======================== + +This extension API is exposed through the :mod:`numba.extending` module. + +To aid debugging extensions to Numba, it's recommended to set the following +environment variable:: + + NUMBA_CAPTURED_ERRORS="new_style" + +this makes it easy to differentiate between errors in implementation and +acceptable errors that can take part in e.g. type inference. For more +information see :envvar:`NUMBA_CAPTURED_ERRORS`. + +Implementing functions +---------------------- + +The ``@overload`` decorator allows you to implement arbitrary functions +for use in :term:`nopython mode` functions. The function decorated with +``@overload`` is called at compile-time with the *types* of the function's +runtime arguments. It should return a callable representing the +*implementation* of the function for the given types. The returned +implementation is compiled by Numba as if it were a normal function +decorated with ``@jit``. Additional options to ``@jit`` can be passed as +dictionary using the ``jit_options`` argument. + +For example, let's pretend Numba doesn't support the :func:`len` function +on tuples yet. Here is how to implement it using ``@overload``:: + + from numba import types + from numba.extending import overload + + @overload(len) + def tuple_len(seq): + if isinstance(seq, types.BaseTuple): + n = len(seq) + def len_impl(seq): + return n + return len_impl + + +You might wonder, what happens if :func:`len()` is called with something +else than a tuple? 
If a function decorated with ``@overload`` doesn't +return anything (i.e. returns None), other definitions are tried until +one succeeds. Therefore, multiple libraries may overload :func:`len()` +for different types without conflicting with each other. + +Implementing methods +-------------------- + +The ``@overload_method`` decorator similarly allows implementing a +method on a type well-known to Numba. + +.. autofunction:: numba.core.extending.overload_method + +Implementing classmethods +------------------------- + +The ``@overload_classmethod`` decorator similarly allows implementing a +classmethod on a type well-known to Numba. + +.. autofunction:: numba.core.extending.overload_classmethod + + +Implementing attributes +----------------------- + +The ``@overload_attribute`` decorator allows implementing a data +attribute (or property) on a type. Only reading the attribute is +possible; writable attributes are only supported through the +:ref:`low-level API `. + +The following example implements the :attr:`~numpy.ndarray.nbytes` attribute +on Numpy arrays:: + + @overload_attribute(types.Array, 'nbytes') + def array_nbytes(arr): + def get(arr): + return arr.size * arr.itemsize + return get + +.. _cython-support: + +Importing Cython Functions +-------------------------- + +The function ``get_cython_function_address`` obtains the address of a +C function in a Cython extension module. The address can be used to +access the C function via a :func:`ctypes.CFUNCTYPE` callback, thus +allowing use of the C function inside a Numba jitted function. 
For +example, suppose that you have the file ``foo.pyx``:: + + from libc.math cimport exp + + cdef api double myexp(double x): + return exp(x) + +You can access ``myexp`` from Numba in the following way:: + + import ctypes + from numba.extending import get_cython_function_address + + addr = get_cython_function_address("foo", "myexp") + functype = ctypes.CFUNCTYPE(ctypes.c_double, ctypes.c_double) + myexp = functype(addr) + +The function ``myexp`` can now be used inside jitted functions, for +example:: + + @njit + def double_myexp(x): + return 2*myexp(x) + +One caveat is that if your function uses Cython's fused types, then +the function's name will be mangled. To find out the mangled name of +your function you can check the extension module's ``__pyx_capi__`` +attribute. + +Implementing intrinsics +----------------------- + +The ``@intrinsic`` decorator is used for marking a function *func* as typing and +implementing the function in ``nopython`` mode using the +`llvmlite IRBuilder API `_. +This is an escape hatch for expert users to build custom LLVM IR that will be +inlined into the caller, there is no safety net! + +The first argument to *func* is the typing context. The rest of the arguments +correspond to the type of arguments of the decorated function. These arguments +are also used as the formal argument of the decorated function. If *func* has +the signature ``foo(typing_context, arg0, arg1)``, the decorated function will +have the signature ``foo(arg0, arg1)``. + +The return values of *func* should be a 2-tuple of expected type signature, and +a code-generation function that will be passed to +:func:`~numba.targets.imputils.lower_builtin`. For an unsupported operation, +return ``None``. 
+ +Here is an example that casts any integer to a byte pointer:: + + from numba import types + from numba.extending import intrinsic + + @intrinsic + def cast_int_to_byte_ptr(typingctx, src): + # check for accepted types + if isinstance(src, types.Integer): + # create the expected type signature + result_type = types.CPointer(types.uint8) + sig = result_type(types.uintp) + # defines the custom code generation + def codegen(context, builder, signature, args): + # llvm IRBuilder code here + [src] = args + rtype = signature.return_type + llrtype = context.get_value_type(rtype) + return builder.inttoptr(src, llrtype) + return sig, codegen + +it may be used as follows:: + + from numba import njit + + @njit('void(int64)') + def foo(x): + y = cast_int_to_byte_ptr(x) + + foo.inspect_types() + +and the output of ``.inspect_types()`` demonstrates the cast (note the +``uint8*``):: + + def foo(x): + + # x = arg(0, name=x) :: int64 + # $0.1 = global(cast_int_to_byte_ptr: ) :: Function() + # $0.3 = call $0.1(x, func=$0.1, args=[Var(x, check_intrin.py (24))], kws=(), vararg=None) :: (uint64,) -> uint8* + # del x + # del $0.1 + # y = $0.3 :: uint8* + # del y + # del $0.3 + # $const0.4 = const(NoneType, None) :: none + # $0.5 = cast(value=$const0.4) :: none + # del $const0.4 + # return $0.5 + + y = cast_int_to_byte_ptr(x) + + +Implementing mutable structures +------------------------------- + +.. warning:: This is an experimental feature, the API may change without warning. + +The ``numba.experimental.structref`` module provides utilities for defining +mutable pass-by-reference structures, a ``StructRef``. The following example +demonstrates how to define a basic mutable structure: + +Defining a StructRef +'''''''''''''''''''' + +.. 
literalinclude:: ../../../numba/tests/doc_examples/test_structref_usage.py + :language: python + :caption: from ``numba/tests/doc_examples/test_structref_usage.py`` + :start-after: magictoken.ex_structref_type_definition.begin + :end-before: magictoken.ex_structref_type_definition.end + :dedent: 0 + :linenos: + +The following demonstrates using the above mutable struct definition: + +.. literalinclude:: ../../../numba/tests/doc_examples/test_structref_usage.py + :language: python + :caption: from ``test_type_definition`` of ``numba/tests/doc_examples/test_structref_usage.py`` + :start-after: magictoken.ex_structref_type_definition_test.begin + :end-before: magictoken.ex_structref_type_definition_test.end + :dedent: 8 + :linenos: + + +Defining a method on StructRef +'''''''''''''''''''''''''''''' + +Methods and attributes can be attached using ``@overload_*`` as shown in the +previous sections. + +The following demonstrates the use of ``@overload_method`` to insert a +method for instances of ``MyStructType``: + +.. literalinclude:: ../../../numba/tests/doc_examples/test_structref_usage.py + :language: python + :caption: from ``test_overload_method`` of ``numba/tests/doc_examples/test_structref_usage.py`` + :start-after: magictoken.ex_structref_method.begin + :end-before: magictoken.ex_structref_method.end + :dedent: 8 + :linenos: + + +``numba.experimental.structref`` API Reference +'''''''''''''''''''''''''''''''''''''''''''''' + +.. automodule:: numba.experimental.structref + :members: + +Determining if a function is already wrapped by a ``jit`` family decorator +-------------------------------------------------------------------------- + +The following function is provided for this purpose. + +.. 
automethod:: numba.extending.is_jitted diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/index.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..fb6cd5160050dc86149db6a8d48327736d4ea3d6 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/index.rst @@ -0,0 +1,30 @@ + +Extending Numba +=============== + +.. module:: numba.extending + +This chapter describes how to extend Numba to make it recognize and support +additional operations, functions or types. Numba provides two categories +of APIs to this end: + +* The high-level APIs provide abstracted entry points which are sufficient + for simple uses. They require little knowledge of Numba's internal + compilation chain. + +* The low-level APIs reflect Numba's internal compilation chain and allow + flexible interaction with its various layers, but require more effort + and experience with Numba internals. + +It may be helpful for readers of this chapter to also read some of the +documents in the :doc:`developer manual <../developer/index>`, especially +the :doc:`architecture document <../developer/architecture>`. + + +.. 
toctree:: + high-level.rst + low-level.rst + interval-example.rst + overloading-guide.rst + entrypoints.rst + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/interval-example.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/interval-example.rst new file mode 100644 index 0000000000000000000000000000000000000000..e561ee9bd8a7722a1a59af8e3583df00bd3eaf55 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/interval-example.rst @@ -0,0 +1,345 @@ + +Example: an interval type +========================= + +We will extend the Numba frontend to support a class that it does not +currently support so as to allow: + +* Passing an instance of the class to a Numba function +* Accessing attributes of the class in a Numba function +* Constructing and returning a new instance of the class from a Numba function + +(all the above in :term:`nopython mode`) + +We will mix APIs from the :ref:`high-level extension API ` +and the :ref:`low-level extension API `, depending on what is +available for a given task. + +The starting point for our example is the following pure Python class:: + + class Interval(object): + """ + A half-open interval on the real number line. + """ + def __init__(self, lo, hi): + self.lo = lo + self.hi = hi + + def __repr__(self): + return 'Interval(%f, %f)' % (self.lo, self.hi) + + @property + def width(self): + return self.hi - self.lo + + +Extending the typing layer +"""""""""""""""""""""""""" + +Creating a new Numba type +------------------------- + +As the ``Interval`` class is not known to Numba, we must create a new Numba +type to represent instances of it. Numba does not deal with Python types +directly: it has its own type system that allows a different level of +granularity as well as various meta-information not available with regular +Python types. 
+ +We first create a type class ``IntervalType`` and, since we don't need the +type to be parametric, we instantiate a single type instance ``interval_type``:: + + from numba import types + + class IntervalType(types.Type): + def __init__(self): + super(IntervalType, self).__init__(name='Interval') + + interval_type = IntervalType() + + +Type inference for Python values +-------------------------------- + +In itself, creating a Numba type doesn't do anything. We must teach Numba +how to infer some Python values as instances of that type. In this example, +it is trivial: any instance of the ``Interval`` class should be treated as +belonging to the type ``interval_type``:: + + from numba.extending import typeof_impl + + @typeof_impl.register(Interval) + def typeof_index(val, c): + return interval_type + +Function arguments and global values will thusly be recognized as belonging +to ``interval_type`` whenever they are instances of ``Interval``. + + +Type inference for Python annotations +------------------------------------- + +While ``typeof`` is used to infer the Numba type of Python objects, +``as_numba_type`` is used to infer the Numba type of Python types. For simple +cases, we can simply register that the Python type ``Interval`` corresponds with +the Numba type ``interval_type``:: + + from numba.extending import as_numba_type + + as_numba_type.register(Interval, interval_type) + +Note that ``as_numba_type`` is only used to infer types from type annotations at +compile time. The ``typeof`` registry above is used to infer the type of +objects at runtime. + + +Type inference for operations +----------------------------- + +We want to be able to construct interval objects from Numba functions, so +we must teach Numba to recognize the two-argument ``Interval(lo, hi)`` +constructor. 
The arguments should be floating-point numbers:: + + from numba.extending import type_callable + + @type_callable(Interval) + def type_interval(context): + def typer(lo, hi): + if isinstance(lo, types.Float) and isinstance(hi, types.Float): + return interval_type + return typer + + +The :func:`type_callable` decorator specifies that the decorated function +should be invoked when running type inference for the given callable object +(here the ``Interval`` class itself). The decorated function must simply +return a typer function that will be called with the argument types. The +reason for this seemingly convoluted setup is for the typer function to have +*exactly* the same signature as the typed callable. This allows handling +keyword arguments correctly. + +The *context* argument received by the decorated function is useful in +more sophisticated cases where computing the callable's return type +requires resolving other types. + + +Extending the lowering layer +"""""""""""""""""""""""""""" + +We have finished teaching Numba about our type inference additions. +We must now teach Numba how to actually generate code and data for +the new operations. + + +Defining the data model for native intervals +-------------------------------------------- + +As a general rule, :term:`nopython mode` does not work on Python objects +as they are generated by the CPython interpreter. The representations +used by the interpreter are far too inefficient for fast native code. +Each type supported in :term:`nopython mode` therefore has to define +a tailored native representation, also called a *data model*. + +A common case of data model is an immutable struct-like data model, that +is akin to a C ``struct``. 
Our interval datatype conveniently falls in +that category, and here is a possible data model for it:: + + from numba.extending import models, register_model + + @register_model(IntervalType) + class IntervalModel(models.StructModel): + def __init__(self, dmm, fe_type): + members = [ + ('lo', types.float64), + ('hi', types.float64), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +This instructs Numba that values of type ``IntervalType`` (or any instance +thereof) are represented as a structure of two fields ``lo`` and ``hi``, +each of them a double-precision floating-point number (``types.float64``). + +.. note:: + Mutable types need more sophisticated data models to be able to + persist their values after modification. They typically cannot be + stored and passed on the stack or in registers like immutable types do. + + +Exposing data model attributes +------------------------------ + +We want the data model attributes ``lo`` and ``hi`` to be exposed under +the same names for use in Numba functions. Numba provides a convenience +function to do exactly that:: + + from numba.extending import make_attribute_wrapper + + make_attribute_wrapper(IntervalType, 'lo', 'lo') + make_attribute_wrapper(IntervalType, 'hi', 'hi') + +This will expose the attributes in read-only mode. As mentioned above, +writable attributes don't fit in this model. + + +Exposing a property +------------------- + +As the ``width`` property is computed rather than stored in the structure, +we cannot simply expose it like we did for ``lo`` and ``hi``. We have to +re-implement it explicitly:: + + from numba.extending import overload_attribute + + @overload_attribute(IntervalType, "width") + def get_width(interval): + def getter(interval): + return interval.hi - interval.lo + return getter + +You might ask why we didn't need to expose a type inference hook for this +attribute? 
The answer is that ``@overload_attribute`` is part of the +high-level API: it combines type inference and code generation in a +single API. + + +Implementing the constructor +---------------------------- + +Now we want to implement the two-argument ``Interval`` constructor:: + + from numba.extending import lower_builtin + from numba.core import cgutils + + @lower_builtin(Interval, types.Float, types.Float) + def impl_interval(context, builder, sig, args): + typ = sig.return_type + lo, hi = args + interval = cgutils.create_struct_proxy(typ)(context, builder) + interval.lo = lo + interval.hi = hi + return interval._getvalue() + + +There is a bit more going on here. ``@lower_builtin`` decorates the +implementation of the given callable or operation (here the ``Interval`` +constructor) for some specific argument types. This allows defining +type-specific implementations of a given operation, which is important +for heavily overloaded functions such as :func:`len`. + +``types.Float`` is the class of all floating-point types (``types.float64`` +is an instance of ``types.Float``). It is generally more future-proof +to match argument types on their class rather than on specific instances +(however, when *returning* a type -- chiefly during the type inference +phase --, you must usually return a type instance). + +``cgutils.create_struct_proxy()`` and ``interval._getvalue()`` are a bit +of boilerplate due to how Numba passes values around. Values are passed +as instances of :class:`llvmlite.ir.Value`, which can be too limited: +LLVM structure values especially are quite low-level. A struct proxy +is a temporary wrapper around a LLVM structure value allowing to easily +get or set members of the structure. The ``_getvalue()`` call simply +gets the LLVM value out of the wrapper. + + +Boxing and unboxing +------------------- + +If you try to use an ``Interval`` instance at this point, you'll certainly +get the error *"cannot convert Interval to native value"*. 
This is because +Numba doesn't yet know how to make a native interval value from a Python +``Interval`` instance. Let's teach it how to do it:: + + from numba.extending import unbox, NativeValue + + @unbox(IntervalType) + def unbox_interval(typ, obj, c): + """ + Convert a Interval object to a native interval structure. + """ + lo_obj = c.pyapi.object_getattr_string(obj, "lo") + hi_obj = c.pyapi.object_getattr_string(obj, "hi") + interval = cgutils.create_struct_proxy(typ)(c.context, c.builder) + interval.lo = c.pyapi.float_as_double(lo_obj) + interval.hi = c.pyapi.float_as_double(hi_obj) + c.pyapi.decref(lo_obj) + c.pyapi.decref(hi_obj) + is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred()) + return NativeValue(interval._getvalue(), is_error=is_error) + +*Unbox* is the other name for "convert a Python object to a native value" +(it fits the idea of a Python object as a sophisticated box containing +a simple native value). The function returns a ``NativeValue`` object +which gives its caller access to the computed native value, the error bit +and possibly other information. + +The snippet above makes abundant use of the ``c.pyapi`` object, which +gives access to a subset of the +`Python interpreter's C API `_. +Note the use of ``c.pyapi.err_occurred()`` to detect any errors that +may have happened when unboxing the object (try passing ``Interval('a', 'b')`` +for example). + +We also want to do the reverse operation, called *boxing*, so as to return +interval values from Numba functions:: + + from numba.extending import box + + @box(IntervalType) + def box_interval(typ, val, c): + """ + Convert a native interval structure to an Interval object. 
+ """ + interval = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) + lo_obj = c.pyapi.float_from_double(interval.lo) + hi_obj = c.pyapi.float_from_double(interval.hi) + class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Interval)) + res = c.pyapi.call_function_objargs(class_obj, (lo_obj, hi_obj)) + c.pyapi.decref(lo_obj) + c.pyapi.decref(hi_obj) + c.pyapi.decref(class_obj) + return res + + +Using it +"""""""" + +:term:`nopython mode` functions are now able to make use of Interval objects +and the various operations you have defined on them. You can try for +example the following functions:: + + from numba import jit + + @jit(nopython=True) + def inside_interval(interval, x): + return interval.lo <= x < interval.hi + + @jit(nopython=True) + def interval_width(interval): + return interval.width + + @jit(nopython=True) + def sum_intervals(i, j): + return Interval(i.lo + j.lo, i.hi + j.hi) + + +Conclusion +"""""""""" + +We have shown how to do the following tasks: + +* Define a new Numba type class by subclassing the ``Type`` class +* Define a singleton Numba type instance for a non-parametric type +* Teach Numba how to infer the Numba type of Python values of a certain class, + using ``typeof_impl.register`` +* Teach Numba how to infer the Numba type of the Python type itself, using + ``as_numba_type.register`` +* Define the data model for a Numba type using ``StructModel`` + and ``register_model`` +* Implementing a boxing function for a Numba type using the ``@box`` decorator +* Implementing an unboxing function for a Numba type using the ``@unbox`` decorator + and the ``NativeValue`` class +* Type and implement a callable using the ``@type_callable`` and + ``@lower_builtin`` decorators +* Expose a read-only structure attribute using the ``make_attribute_wrapper`` + convenience function +* Implement a read-only property using the ``@overload_attribute`` decorator diff --git 
a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/low-level.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/low-level.rst new file mode 100644 index 0000000000000000000000000000000000000000..8eba72b3ecbc100d57ac92f55108934070644a01 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/low-level.rst @@ -0,0 +1,194 @@ + +.. _low-level-extending: + +Low-level extension API +======================= + +This extension API is available through the :mod:`numba.extending` module. +It allows you to hook directly into the Numba compilation chain. As such, +it distinguishes between several compilation phases: + +* The :term:`typing` phase deduces the types of variables in a compiled + function by looking at the operations performed. + +* The :term:`lowering` phase converts high-level Python operations into + low-level LLVM code. This phase exploits the typing information derived + by the typing phase. + +* *Boxing* and *unboxing* convert Python objects into native values, and + vice-versa. They occur at the boundaries of calling a Numba function + from the Python interpreter. + + +Typing +------ + +.. XXX the API described here can be insufficient for some use cases. + Should we describe the whole templates menagerie? + +Type inference -- or simply *typing* -- is the process of assigning +Numba types to all values involved in a function, so as to enable +efficient code generation. Broadly speaking, typing comes in two flavours: +typing plain Python *values* (e.g. function arguments or global variables) +and typing *operations* (or *functions*) on known value types. + +.. decorator:: typeof_impl.register(cls) + + Register the decorated function as typing Python values of class *cls*. + The decorated function will be called with the signature ``(val, c)`` + where *val* is the Python value being typed and *c* is a context + object. + + +.. 
decorator:: type_callable(func) + + Register the decorated function as typing the callable *func*. + *func* can be either an actual Python callable or a string denoting + an operation internally known to Numba (for example ``'getitem'``). + The decorated function is called with a single *context* argument + and must return a typer function. The typer function should have + the same signature as the function being typed, and it is called + with the Numba *types* of the function arguments; it should return + either the Numba type of the function's return value, or ``None`` + if inference failed. + +.. function:: as_numba_type.register(py_type, numba_type) + + Register that the Python type *py_type* corresponds with the Numba type + *numba_type*. This can be used to register a new type or overwrite the + existing default (e.g. to treat ``float`` as ``numba.float32`` instead of + ``numba.float64``). + +.. decorator:: as_numba_type.register + + Register the decorated function as a type inference function used by + ``as_numba_type`` when trying to infer the Numba type of a Python type. + The decorated function is called with a single *py_type* argument + and returns either a corresponding Numba type, or None if it cannot infer + that *py_type*. + + +Lowering +-------- + +The following decorators all take a type specification of some kind. +A type specification is usually a type class (such as ``types.Float``) +or a specific type instance (such as ``types.float64``). Some values +have a special meaning: + +* ``types.Any`` matches any type; this allows doing your own dispatching + inside the implementation + +* ``types.VarArg(<some type>)`` matches any number of arguments of the + given type; it can only appear as the last type specification when + describing a function's arguments. 
+ +A *context* argument in the following APIs is a target context providing +various utility methods for code generation (such as creating a constant, +converting from one type to another, looking up the implementation of a +specific function, etc.). A *builder* argument is a +:class:`llvmlite.ir.IRBuilder` instance for the LLVM code being generated. + +A *signature* is an object specifying the concrete type of an operation. +The ``args`` attribute of the signature is a tuple of the argument types. +The ``return_type`` attribute of the signature is the type that the +operation should return. + +.. note:: + Numba always reasons on Numba types, but the values being passed + around during lowering are LLVM values: they don't hold the required + type information, which is why Numba types are passed explicitly too. + + LLVM has its own, very low-level type system: you can access the LLVM + type of a value by looking up its ``.type`` attribute. + + +Native operations +''''''''''''''''' + +.. decorator:: lower_builtin(func, typespec, ...) + + Register the decorated function as implementing the callable *func* + for the arguments described by the given Numba *typespecs*. + As with :func:`type_callable`, *func* can be either an actual Python + callable or a string denoting an operation internally known to Numba + (for example ``'getitem'``). + + The decorated function is called with four arguments + ``(context, builder, sig, args)``. ``sig`` is the concrete signature + the callable is being invoked with. ``args`` is a tuple of the values + of the arguments the callable is being invoked with; each value in + ``args`` corresponds to a type in ``sig.args``. The function + must return a value compatible with the type ``sig.return_type``. + +.. decorator:: lower_getattr(typespec, name) + + Register the decorated function as implementing the attribute *name* + of the given *typespec*. The decorated function is called with four + arguments ``(context, builder, typ, value)``. 
*typ* is the concrete + type the attribute is being looked up on. *value* is the value the + attribute is being looked up on. + +.. decorator:: lower_getattr_generic(typespec) + + Register the decorated function as a fallback for attribute lookup + on a given *typespec*. Any attribute that does not have a corresponding + :func:`lower_getattr` declaration will go through + :func:`lower_getattr_generic`. The decorated function is called with + five arguments ``(context, builder, typ, value, name)``. *typ* + and *value* are as in :func:`lower_getattr`. *name* is the name + of the attribute being looked up. + +.. decorator:: lower_cast(fromspec, tospec) + + Register the decorated function as converting from types described by + *fromspec* to types described by *tospec*. The decorated function + is called with five arguments ``(context, builder, fromty, toty, value)``. + *fromty* and *toty* are the concrete types being converted from and to, + respectively. *value* is the value being converted. The function + must return a value compatible with the type ``toty``. + + +Constants +''''''''' + +.. decorator:: lower_constant(typespec) + + Register the decorated function as implementing the creation of + constants for the Numba *typespec*. The decorated function + is called with four arguments ``(context, builder, ty, pyval)``. + *ty* is the concrete type to create a constant for. *pyval* + is the Python value to convert into a LLVM constant. + The function must return a value compatible with the type ``ty``. + + +Boxing and unboxing +''''''''''''''''''' + +In these functions, *c* is a convenience object with several attributes: + +* its ``context`` attribute is a target context as above +* its ``builder`` attribute is a :class:`llvmlite.ir.IRBuilder` as above +* its ``pyapi`` attribute is an object giving access to a subset of the + `Python interpreter's C API `_ + +An object, as opposed to a native value, is a ``PyObject *`` pointer. 
+Such pointers can be produced or processed by the methods in the ``pyapi`` +object. + +.. decorator:: box(typespec) + + Register the decorated function as boxing values matching the *typespec*. + The decorated function is called with three arguments ``(typ, val, c)``. + *typ* is the concrete type being boxed. *val* is the value being + boxed. The function should return a Python object, or NULL to signal + an error. + +.. decorator:: unbox(typespec) + + Register the decorated function as unboxing values matching the *typespec*. + The decorated function is called with three arguments ``(typ, obj, c)``. + *typ* is the concrete type being unboxed. *obj* is the Python object + (a ``PyObject *`` pointer, in C terms) being unboxed. The function + should return a ``NativeValue`` object giving the unboxing result value + and an optional error bit. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/mynorm.py b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/mynorm.py new file mode 100644 index 0000000000000000000000000000000000000000..884634ea31cd6638432fe66274fc1d6cf8755a78 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/mynorm.py @@ -0,0 +1,72 @@ +import numpy as np +from numba import njit, types +from numba.extending import overload, register_jitable +from numba.core.errors import TypingError + +import scipy.linalg + + +@register_jitable +def _oneD_norm_2(a): + # re-usable implementation of the 2-norm + val = np.abs(a) + return np.sqrt(np.sum(val * val)) + + +@overload(scipy.linalg.norm) +def jit_norm(a, ord=None): + if isinstance(ord, types.Optional): + ord = ord.type + # Reject non integer, floating-point or None types for ord + if not isinstance(ord, (types.Integer, types.Float, types.NoneType)): + raise TypingError("'ord' must be either integer or floating-point") + # Reject non-ndarray types + if not isinstance(a, types.Array): + raise TypingError("Only accepts NumPy ndarray") + # Reject 
ndarrays with non integer or floating-point dtype + if not isinstance(a.dtype, (types.Integer, types.Float)): + raise TypingError("Only integer and floating point types accepted") + # Reject ndarrays with unsupported dimensionality + if not (0 <= a.ndim <= 2): + raise TypingError('3D and beyond are not allowed') + # Implementation for scalars/0d-arrays + elif a.ndim == 0: + return a.item() + # Implementation for vectors + elif a.ndim == 1: + def _oneD_norm_x(a, ord=None): + if ord == 2 or ord is None: + return _oneD_norm_2(a) + elif ord == np.inf: + return np.max(np.abs(a)) + elif ord == -np.inf: + return np.min(np.abs(a)) + elif ord == 0: + return np.sum(a != 0) + elif ord == 1: + return np.sum(np.abs(a)) + else: + return np.sum(np.abs(a)**ord)**(1. / ord) + return _oneD_norm_x + # Implementation for matrices + elif a.ndim == 2: + def _two_D_norm_2(a, ord=None): + return _oneD_norm_2(a.ravel()) + return _two_D_norm_2 + + +if __name__ == "__main__": + @njit + def use(a, ord=None): + # simple test function to check that the overload works + return scipy.linalg.norm(a, ord) + + # spot check for vectors + a = np.arange(10) + print(use(a)) + print(scipy.linalg.norm(a)) + + # spot check for matrices + b = np.arange(9).reshape((3, 3)) + print(use(b)) + print(scipy.linalg.norm(b)) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/overloading-guide.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/overloading-guide.rst new file mode 100644 index 0000000000000000000000000000000000000000..a5645067ea0159ad788da0efbd9cd62d1de30bab --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/overloading-guide.rst @@ -0,0 +1,192 @@ + +.. 
_overloading-guide: + +============================== +A guide to using ``@overload`` +============================== + + +As mentioned in the :ref:`high-level extension API <high-level-extending>`, you +can use the ``@overload`` decorator to create a Numba implementation of a +function that can be used in :term:`nopython mode` functions. A common use case +is to re-implement NumPy functions so that they can be called in ``@jit`` +decorated code. This section discusses how and when to use the ``@overload`` +decorator and what contributing such a function to the Numba code base might +entail. This should help you get started when needing to use the ``@overload`` +decorator or when attempting to contribute new functions to Numba itself. + +The ``@overload`` decorator and its variants are useful when you have a +third-party library that you do not control and you wish to provide Numba +compatible implementations for specific functions from that library. + +Concrete Example +================ + +Let's assume that you are working on a minimization algorithm that makes use of +|scipy.linalg.norm|_ to find different vector norms and the `Frobenius +norm <https://en.wikipedia.org/wiki/Matrix_norm#Frobenius_norm>`_ for matrices. +You know that only integer and real numbers will be involved. (While this may +sound like an artificial example, especially because a Numba implementation of +``numpy.linalg.norm`` exists, it is largely pedagogical and serves to +illustrate how and when to use ``@overload``). + +.. |scipy.linalg.norm| replace:: ``scipy.linalg.norm`` +.. _scipy.linalg.norm: https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.norm.html + +The skeleton might look something like this:: + + def algorithm(): + # setup + v = ... + while True: + # take a step + d = scipy.linalg.norm(v) + if d < tolerance: + break + +Now, let's further assume, that you have heard of Numba and you now wish to use +it to accelerate your function. 
However, after adding the +``jit(nopython=True)`` +decorator, Numba complains that ``scipy.linalg.norm`` isn't supported. From +looking at the documentation, you realize that a norm is probably fairly easy +to implement using NumPy. A good starting point is the following template. + +.. literalinclude:: template.py + +After some deliberation and tinkering, you end up with the following code: + +.. literalinclude:: mynorm.py + +As you can see, the implementation only supports what you need right now: + +* Only supports integer and floating-point types +* All vector norms +* Only the Frobenius norm for matrices +* Code sharing between vector and matrix implementations using + ``@register_jitable``. +* Norms are implemented using NumPy syntax. (This is possible because + Numba is very aware of NumPy and many functions are supported.) + +So what actually happens here? The ``overload`` decorator registers a suitable +implementation for ``scipy.linalg.norm`` in case a call to this is encountered +in code that is being JIT-compiled, for example when you decorate your +``algorithm`` function with ``@jit(nopython=True)``. In that case, the function +``jit_norm`` will be called with the currently encountered types and will then +return either ``_oneD_norm_x`` in the vector case or ``_two_D_norm_2`` in the matrix case. + +You can download the example code here: :download:`mynorm.py </extending/mynorm.py>` + +Implementing ``@overload`` for NumPy functions +============================================== + +Numba supports NumPy through the provision of ``@jit`` compatible +re-implementations of NumPy functions. In such cases ``@overload`` is a very +convenient option for writing such implementations, however there are a few +additional things to watch out for. + +* The Numba implementation should match the NumPy implementation as closely as + feasible with respect to accepted types, arguments, raised exceptions and + algorithmic complexity (Big-O / Landau order). 
+ +* When implementing supported argument types, bear in mind that, due to + duck typing, NumPy does tend to accept a multitude of argument types beyond + NumPy arrays such as scalar, list, tuple, set, iterator, generator etc. + You will need to account for that during type inference and subsequently as + part of the tests. + +* A NumPy function may return a scalar, array or a data structure + which matches one of its inputs, you need to be aware of type + unification problems and dispatch to appropriate implementations. For + example, |np.corrcoef|_ may return an array or a scalar depending on its + inputs. + +.. |np.corrcoef| replace:: ``np.corrcoef`` +.. _np.corrcoef: https://docs.scipy.org/doc/numpy/reference/generated/numpy.corrcoef.html + +* If you are implementing a new function, you should always update the + `documentation + `_. + The sources can be found in ``docs/source/reference/numpysupported.rst``. Be + sure to mention any limitations that your implementation has, e.g. no support + for the ``axis`` keyword. + +* When writing tests for the functionality itself, it's useful to include + handling of non-finite values, arrays with different shapes and layouts, + complex inputs, scalar inputs, inputs with types for which support is not + documented (e.g. a function which the NumPy docs say requires a float or int + input might also 'work' if given a bool or complex input). + +* When writing tests for exceptions, for example if adding tests to + ``numba/tests/test_np_functions.py``, you may encounter the following error + message: + + .. 
code:: + + ====================================================================== + FAIL: test_foo (numba.tests.test_np_functions.TestNPFunctions) + ---------------------------------------------------------------------- + Traceback (most recent call last): + File "/numba/numba/tests/support.py", line 645, in tearDown + self.memory_leak_teardown() + File "/numba/numba/tests/support.py", line 619, in memory_leak_teardown + self.assert_no_memory_leak() + File "/numba/numba/tests/support.py", line 628, in assert_no_memory_leak + self.assertEqual(total_alloc, total_free) + AssertionError: 36 != 35 + + This occurs because raising exceptions from jitted code leads to reference + leaks. Ideally, you will place all exception testing in a separate test + method and then add a call in each test to ``self.disable_leak_check()`` to + disable the leak-check (inherit from ``numba.tests.support.TestCase`` to make + that available). + +* For many of the functions that are available in NumPy, there are + corresponding methods defined on the NumPy ``ndarray`` type. For example, the + function ``repeat`` is available as a NumPy module level function and a + member function on the ``ndarray`` class. + + .. code:: python + + import numpy as np + a = np.arange(10) + # function + np.repeat(a, 10) + # method + a.repeat(10) + + Once you have written the function implementation, you can easily use + ``@overload_method`` and reuse it. Just be sure to check that NumPy doesn't + diverge in the implementations of its function/method. + + As an example, the ``repeat`` function/method: + + .. 
code:: python + + @extending.overload_method(types.Array, 'repeat') + def array_repeat(a, repeats): + def array_repeat_impl(a, repeat): + # np.repeat has already been overloaded + return np.repeat(a, repeat) + + return array_repeat_impl + +* If you need to create ancillary functions, for example to re-use a small + utility function or to split your implementation across functions for the + sake of readability, you can make use of the ``@register_jitable`` decorator. + This will make those functions available from within your ``@jit`` and + ``@overload`` decorated functions. + +* The Numba continuous integration (CI) set up tests a wide variety of NumPy + versions, you'll sometimes be alerted to a change in behaviour from some + previous NumPy version. If you can find supporting evidence in the NumPy + change log / repository, then you'll need to decide whether to create + branches and attempt to replicate the logic across versions, or use a version + gate (with associated wording in the documentation) to advertise that Numba + replicates NumPy from some particular version onwards. + +* You can look at the Numba source code for inspiration, many of the overloaded + NumPy functions and methods are in ``numba/targets/arrayobj.py``. Below, you + will find a list of implementations to look at that are well implemented in + terms of accepted types and test coverage. 
+ + * ``np.repeat`` diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/template.py b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/template.py new file mode 100644 index 0000000000000000000000000000000000000000..19b98cc015be288a0dea04ff4fe25d297ce228e5 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/extending/template.py @@ -0,0 +1,21 @@ +# Declare that function `myfunc` is going to be overloaded (have a +# substitutable Numba implementation) +@overload(myfunc) +# Define the overload function with formal arguments +# these arguments must be matched in the inner function implementation +def jit_myfunc(arg0, arg1, arg2, ...): + # This scope is for typing, access is available to the *type* of all + # arguments. This information can be used to change the behaviour of the + # implementing function and check that the types are actually supported + # by the implementation. + + print(arg0) # this will show the Numba type of arg0 + + # This is the definition of the function that implements the `myfunc` work. + # It does whatever algorithm is needed to implement myfunc. + def myfunc_impl(arg0, arg1, arg2, ...): # match arguments to jit_myfunc + # < Implementation goes here > + return # whatever needs to be returned by the algorithm + + # return the implementation + return myfunc_impl diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/glossary.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/glossary.rst new file mode 100644 index 0000000000000000000000000000000000000000..35b690ca34e229ff32df624a1a0eb7948700bf8b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/glossary.rst @@ -0,0 +1,106 @@ + +Glossary +======== + +.. glossary:: + + ahead-of-time compilation + AOT compilation + AOT + Compilation of a function in a separate step before running the + program code, producing an on-disk binary object which can be distributed + independently. 
This is the traditional kind of compilation known + in languages such as C, C++ or Fortran. + + bytecode + Python bytecode + The original form in which Python functions are executed. Python + bytecode describes a stack-machine executing abstract (untyped) + operations using operands from both the function stack and the + execution environment (e.g. global variables). + + compile-time constant + An expression whose value Numba can infer and freeze at compile-time. + Global variables and closure variables are compile-time constants. + + just-in-time compilation + JIT compilation + JIT + Compilation of a function at execution time, as opposed to + :term:`ahead-of-time compilation`. + + JIT function + Shorthand for "a function :term:`JIT-compiled ` with Numba using + the :ref:`@jit ` decorator." + + loop-lifting + loop-jitting + A feature of compilation in :term:`object mode` where a loop can be + automatically extracted and compiled in :term:`nopython mode`. This + allows functions with operations unsupported in nopython mode to see + significant performance improvements if they contain loops with only + nopython-supported operations. + + lowering + The act of translating :term:`Numba IR` into LLVM IR. The term + "lowering" stems from the fact that LLVM IR is low-level and + machine-specific while Numba IR is high-level and abstract. + + NPM + nopython mode + A Numba compilation mode that generates code that does not access the + Python C API. This compilation mode produces the highest performance + code, but requires that the native types of all values in the function + can be :term:`inferred `. Unless otherwise instructed, + the ``@jit`` decorator will automatically fall back to :term:`object + mode` if nopython mode cannot be used. + + Numba IR + Numba intermediate representation + A representation of a piece of Python code which is more amenable + to analysis and transformations than the original Python + :term:`bytecode`. 
+ + object mode + A Numba compilation mode that generates code that handles all values + as Python objects and uses the Python C API to perform all operations + on those objects. Code compiled in object mode will often run + no faster than Python interpreted code, unless the Numba compiler can + take advantage of :term:`loop-jitting`. + + ``OptionalType`` + An ``OptionalType`` is effectively a type union of a ``type`` and ``None``. + They typically occur in practice due to a variable being set to ``None`` + and then in a branch the variable being set to some other value. It's + often not possible at compile time to determine if the branch will execute + so to permit :term:`type inference` to complete, the type of the variable + becomes the union of a ``type`` (from the value) and ``None``, + i.e. ``OptionalType(type)``. + + type inference + The process by which Numba determines the specialized types of all + values within a function being compiled. Type inference can fail + if arguments or globals have Python types unknown to Numba, or if + functions are used that are not recognized by Numba. Successful + type inference is a prerequisite for compilation in + :term:`nopython mode`. + + typing + The act of running :term:`type inference` on a value or operation. + + ufunc + A NumPy `universal function `_. + Numba can create new compiled ufuncs with + the :ref:`@vectorize ` decorator. + + reflection + In numba, when a mutable container is passed as argument to a nopython + function from the Python interpreter, the container object and all its + contained elements are converted into nopython values. To match the + semantics of Python, any mutation on the container inside the nopython + function must be visible in the Python interpreter. To do so, Numba + must update the container and its elements and convert them back into + Python objects during the transition back into the interpreter. 
+ + Not to be confused with Python's "reflection" in the context of binary + operators (see https://docs.python.org/3.5/reference/datamodel.html). diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/index.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..fa757b5a86925476b8bd5c08d3c679c3da405c75 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/index.rst @@ -0,0 +1,37 @@ +.. Numba documentation master file, created by + sphinx-quickstart on Tue Dec 30 11:55:40 2014. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Numba documentation +=================== + +This is the Numba documentation. Unless you are already acquainted +with Numba, we suggest you start with the :doc:`User manual `. + + +.. toctree:: + :caption: For all users + :maxdepth: 2 + + user/index.rst + reference/index.rst + + +.. toctree:: + :caption: For CUDA users + :maxdepth: 2 + + cuda/index.rst + cuda-reference/index.rst + + +.. toctree:: + :caption: For advanced users & developers + :maxdepth: 2 + + extending/index.rst + developer/index.rst + proposals/index.rst + glossary.rst + release-notes.rst diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/cfunc.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/cfunc.rst new file mode 100644 index 0000000000000000000000000000000000000000..7650dfb6d098ec55fab044587f87e8cf61cbe8fb --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/cfunc.rst @@ -0,0 +1,147 @@ +============================ +NBEP 4: Defining C callbacks +============================ + +:Author: Antoine Pitrou +:Date: April 2016 +:Status: Draft + + +Interfacing with some native libraries (for example written in C +or C++) can necessitate writing native callbacks to provide business logic +to the library. 
Some Python-facing libraries may also provide the +alternative of passing a ctypes-wrapped native callback instead of a +Python callback for better performance. A simple example is the +``scipy.integrate`` package where the user passes the function to be +integrated as a callback. + +Users of those libraries may want to benefit from the performance advantage +of running purely native code, while writing their code in Python. +This proposal outlines a scheme to provide such functionality in +Numba. + + +Basic usage +=========== + +We propose adding a new decorator, ``@cfunc``, importable from the main +package. This decorator allows defining a callback as in the following +example:: + + from numba import cfunc + from numba.types import float64 + + # A callback with the C signature `double(double)` + + @cfunc(float64(float64), nopython=True) + def integrand(x): + return 1 / x + + +The ``@cfunc`` decorator returns a "C function" object holding the +resources necessary to run the given compiled function (for example its +LLVM module). This object has several attributes and methods: + +* the ``ctypes`` attribute is a ctypes function object representing + the native function. + +* the ``address`` attribute is the address of the native function code, as + an integer (note this can also be computed from the ``ctypes`` attribute). + +* the ``native_name`` attribute is the symbol under which the function + can be looked up inside the current process. + +* the ``inspect_llvm()`` method returns the IR for the LLVM module + in which the function is compiled. It is expected that the ``native_name`` + attribute corresponds to the function's name in the LLVM IR. + +The general signature of the decorator is ``cfunc(signature, **options)``. + +The ``signature`` must specify the argument types and return type of the +function using Numba types. Contrary to ``@jit``, the return type cannot +be omitted. + +The ``options`` are keyword-only parameters specifying compilation options.
+We are expecting that the standard ``@jit`` options (``nopython``, +``forceobj``, ``cache``) can be made to work with ``@cfunc``. + + +Calling from Numba-compiled functions +------------------------------------- + +While the intended use is to pass a callback's address to foreign C +code expecting a function pointer, it should be made possible to call +the C callback from a Numba-compiled function. + + +Passing array data +================== + +Native platform ABIs as used by C or C++ don't have the notion of a shaped +array as in Numpy. One common solution is to pass a raw data pointer and +one or several size arguments (depending on dimensionality). Numba must +provide a way to rebuild an array view of this data inside the callback. + +:: + + from numba import cfunc, carray + from numba.types import float64, CPointer, void, intp + + # A callback with the C signature `void(double *, double *, size_t)` + + @cfunc(void(CPointer(float64), CPointer(float64), intp)) + def invert(in_ptr, out_ptr, n): + in_ = carray(in_ptr, (n,)) + out = carray(out_ptr, (n,)) + for i in range(n): + out[i] = 1 / in_[i] + + +The ``carray`` function takes ``(pointer, shape, dtype)`` arguments +(``dtype`` being optional) and returns a C-layout array view over the +data *pointer*, with the given *shape* and *dtype*. *pointer* must +be a ctypes pointer object (not a Python integer). The array's +dimensionality corresponds to the *shape* tuple's length. If *dtype* +is not given, the array's dtype corresponds to the *pointer*'s pointee +type. + +The ``farray`` function is similar except that it returns a F-layout +array view. + + +Error handling +============== + +There is no standard mechanism in C for error reporting. Unfortunately, +Numba currently doesn't handle ``try..except`` blocks, which makes it more +difficult for the user to implement the required error reporting scheme. 
+The current stance of this proposal is to let users guard against invalid +arguments where necessary, and do whatever is required to inform the caller +of the error. + +Based on user feedback, we can later add support for some error reporting +schemes, such as returning an integer error code depending on whether an +exception was raised, or setting ``errno``. + + +Deferred topics +=============== + +Ahead-of-Time compilation +------------------------- + +This proposal doesn't make any provision for AOT compilation of C callbacks. +It would probably necessitate a separate API (a new method on the +``numba.pycc.CC`` object), and the implementation would require exposing +a subset of the C function object's functionality from the compiled C +extension module. + +Opaque data pointers +-------------------- + +Some libraries allow passing an opaque data pointer (``void *``) to a +user-provided callback, to provide any required context for execution +of the callback. Taking advantage of this functionality would require +adding specific support in Numba, for example the ability to do generic +conversion from ``types.voidptr`` and to take the address of a +Python-facing ``jitclass`` instance. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/extension-points.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/extension-points.rst new file mode 100644 index 0000000000000000000000000000000000000000..89197dd17db9a357d91bca388d2d9634612c8cd0 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/extension-points.rst @@ -0,0 +1,414 @@ +======================== +NBEP 2: Extension points +======================== + +:Author: Antoine Pitrou +:Date: July 2015 +:Status: Draft + + +Implementing new types or functions in Numba requires hooking into +various mechanisms along the compilation chain (and potentially +outside of it). 
This document aims, first, at examining the +current ways of doing so and, second, at making proposals to make +extending easier. + +If some of the proposals are implemented, we should first strive +to use and exercise them internally, before exposing the APIs to the +public. + +.. note:: + This document doesn't cover CUDA or any other non-CPU backend. + + +High-level API +============== + +There is currently no high-level API, making some use cases more +complicated than they should be. + +Proposed changes +---------------- + +Dedicated module +'''''''''''''''' + +We propose the addition of a ``numba.extending`` module exposing the main +APIs useful for extending Numba. + +Implementing a function +''''''''''''''''''''''' + +We propose the addition of a ``@overload`` decorator allowing the +implementation of a given function for use in :term:`nopython mode`. +The overloading function has the same formal signature as the implemented +function, and receives the actual argument types. It should return a +Python function implementing the overloaded function for the given types. + +The following example implements :func:`numpy.where` with +this approach. + +.. literalinclude:: np-where-override.py + +It is also possible to implement functions already known to Numba, to +support additional types. The following example implements the +built-in function :func:`len` for tuples with this approach:: + + @overload(len) + def tuple_len(x): + if isinstance(x, types.BaseTuple): + # The tuple length is known at compile-time, so simply reify it + # as a constant. + n = len(x) + def len_impl(x): + return n + return len_impl + + +Implementing an attribute +''''''''''''''''''''''''' + +We propose the addition of a ``@overload_attribute`` decorator allowing +the implementation of an attribute getter for use in :term:`nopython mode`. 
+ +The following example implements the ``.nbytes`` attribute on Numpy arrays:: + + @overload_attribute(types.Array, 'nbytes') + def array_nbytes(arr): + def get(arr): + return arr.size * arr.itemsize + return get + +.. note:: + The overload_attribute() signature allows for expansion to also define + setters and deleters, by letting the decorated function return a + ``getter, setter, deleter`` tuple instead of a single ``getter``. + + +Implementing a method +''''''''''''''''''''' + +We propose the addition of a ``@overload_method`` decorator allowing the +implementation of an instance method for use in :term:`nopython mode`. + +The following example implements the ``.take()`` method on Numpy arrays:: + + @overload_method(types.Array, 'take') + def array_take(arr, indices): + if isinstance(indices, types.Array): + def take_impl(arr, indices): + n = indices.shape[0] + res = np.empty(n, arr.dtype) + for i in range(n): + res[i] = arr[indices[i]] + return res + return take_impl + + +Exposing a structure member +''''''''''''''''''''''''''' + +We propose the addition of a ``make_attribute_wrapper()`` function exposing +an internal field as a visible read-only attribute, for those types backed +by a ``StructModel`` data model. + +For example, assuming ``PdIndexType`` is the Numba type of pandas indices, +here is how to expose the underlying Numpy array as a ``._data`` attribute:: + + @register_model(PdIndexType) + class PdIndexModel(models.StructModel): + def __init__(self, dmm, fe_type): + members = [ + ('values', fe_type.as_array), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + make_attribute_wrapper(PdIndexType, 'values', '_data') + + +Typing +====== + +Numba types +----------- + +Numba's standard types are declared in :mod:`numba.types`. To declare +a new type, one subclasses the base :class:`Type` class or one of its +existing abstract subclasses, and implements the required functionality. 
+ +Proposed changes +'''''''''''''''' + +No change required. + + +Type inference on values +------------------------ + +Values of a new type need to be type-inferred if they can appear as +function arguments or constants. The core machinery is in +:mod:`numba.typing.typeof`. + +In the common case where some Python class or classes map exclusively +to the new type, one can extend a generic function to dispatch on said +classes, e.g.:: + + from numba.typing.typeof import typeof_impl + + @typeof_impl(MyClass) + def _typeof_myclass(val, c): + if "some condition": + return MyType(...) + +The ``typeof_impl`` specialization must return a Numba type instance, +or None if the value failed typing. + +(when one controls the class being type-inferred, an alternative +to ``typeof_impl`` is to define a ``_numba_type_`` property on the class) + +In the rarer case where the new type can denote various Python classes +that are impossible to enumerate, one must insert a manual check in the +fallback implementation of the ``typeof_impl`` generic function. + +Proposed changes +'''''''''''''''' + +Allow people to define a generic hook without monkeypatching the +fallback implementation. + + +Fast path for type inference on function arguments +-------------------------------------------------- + +Optionally, one may want to allow a new type to participate in the +fast type resolution (written in C code) to minimize function call +overhead when a JIT-compiled function is called with the new type. +One must then insert the required checks and implementation in +the ``_typeof.c`` file, presumably inside the ``compute_fingerprint()`` +function. + +Proposed changes +'''''''''''''''' + +None. Adding generic hooks to C code embedded in a C Python extension +is too delicate a change. + + +Type inference on operations +---------------------------- + +Values resulting from various operations (function calls, operators, etc.) +are typed using a set of helpers called "templates". 
One can define a +new template by subclassing one of the existing base classes and implementing +the desired inference mechanism. The template is explicitly registered +with the type inference machinery using a decorator. + +The :class:`ConcreteTemplate` base class allows one to define inference as +a set of supported signatures for a given operation. The following example +types the modulo operator:: + + @builtin + class BinOpMod(ConcreteTemplate): + key = "%" + cases = [signature(op, op, op) + for op in sorted(types.signed_domain)] + cases += [signature(op, op, op) + for op in sorted(types.unsigned_domain)] + cases += [signature(op, op, op) for op in sorted(types.real_domain)] + +(note that type *instances* are used in the signatures, severely +limiting the amount of genericity that can be expressed) + +The :class:`AbstractTemplate` base class allows one to define inference +programmatically, giving it full flexibility. Here is a simplistic +example of how tuple indexing (i.e. the ``__getitem__`` operator) can +be expressed:: + + @builtin + class GetItemUniTuple(AbstractTemplate): + key = "getitem" + + def generic(self, args, kws): + tup, idx = args + if isinstance(tup, types.UniTuple) and isinstance(idx, types.Integer): + return signature(tup.dtype, tup, idx) + + +The :class:`AttributeTemplate` base class allows one to type the attributes +and methods of a given type. Here is an example, typing the ``.real`` +and ``.imag`` attributes of complex numbers:: + + @builtin_attr + class ComplexAttribute(AttributeTemplate): + key = types.Complex + + def resolve_real(self, ty): + return ty.underlying_float + + def resolve_imag(self, ty): + return ty.underlying_float + +.. note:: + :class:`AttributeTemplate` only works for getting attributes. Setting + an attribute's value is hardcoded in :mod:`numba.typeinfer`.
+ +The :class:`CallableTemplate` base class offers an easier way to parse +flexible function signatures, by letting one define a callable that has +the same definition as the function being typed. For example, here is how +one could hypothetically type Python's ``sorted`` function if Numba supported +lists:: + + @builtin + class Sorted(CallableTemplate): + key = sorted + + def generic(self): + def typer(iterable, key=None, reverse=None): + if reverse is not None and not isinstance(reverse, types.Boolean): + return + if key is not None and not isinstance(key, types.Callable): + return + if not isinstance(iterable, types.Iterable): + return + return types.List(iterable.iterator_type.yield_type) + + return typer + +(note you can return just the function's return type instead of the +full signature) + +Proposed changes +'''''''''''''''' + +Naming of the various decorators is quite vague and confusing. We propose +renaming ``@builtin`` to ``@infer``, ``@builtin_attr`` to ``@infer_getattr`` +and ``builtin_global`` to ``infer_global``. + +The two-step declaration for global values is a bit verbose, we propose +simplifying it by allowing the use of ``infer_global`` as a decorator:: + + @infer_global(len) + class Len(AbstractTemplate): + key = len + + def generic(self, args, kws): + assert not kws + (val,) = args + if isinstance(val, (types.Buffer, types.BaseTuple)): + return signature(types.intp, val) + +The class-based API can feel clumsy, we can add a functional API for +some of the template kinds: + +.. code-block:: python + + @type_callable(sorted) + def type_sorted(context): + def typer(iterable, key=None, reverse=None): + # [same function as above] + + return typer + + +Code generation +=============== + +Concrete representation of values of a Numba type +------------------------------------------------- + +Any concrete Numba type must be able to be represented in LLVM form +(for variable storage, argument passing, etc.). 
One defines that +representation by implementing a datamodel class and registering it +with a decorator. Datamodel classes for standard types are defined +in :mod:`numba.datamodel.models`. + +Proposed changes +'''''''''''''''' + +No change required. + +Conversion between types +------------------------ + +Implicit conversion between Numba types is currently implemented as a +monolithic sequence of choices and type checks in the +:meth:`BaseContext.cast` method. To add a new implicit conversion, one +appends a type-specific check in that method. + +Boolean evaluation is a special case of implicit conversion (the +destination type being :class:`types.Boolean`). + +.. note:: + Explicit conversion is seen as a regular operation, e.g. a constructor + call. + +Proposed changes +'''''''''''''''' + +Add a generic function for implicit conversion, with multiple dispatch +based on the source and destination types. Here is an example showing +how to write a float-to-integer conversion:: + + @lower_cast(types.Float, types.Integer) + def float_to_integer(context, builder, fromty, toty, val): + lty = context.get_value_type(toty) + if toty.signed: + return builder.fptosi(val, lty) + else: + return builder.fptoui(val, lty) + + +Implementation of an operation +------------------------------ + +Other operations are implemented and registered using a set of generic +functions and decorators. 
For example, here is how lookup for the ``.ndim`` +attribute on Numpy arrays is implemented:: + + @builtin_attr + @impl_attribute(types.Kind(types.Array), "ndim", types.intp) + def array_ndim(context, builder, typ, value): + return context.get_constant(types.intp, typ.ndim) + +And here is how calling ``len()`` on a tuple value is implemented:: + + @builtin + @implement(types.len_type, types.Kind(types.BaseTuple)) + def tuple_len(context, builder, sig, args): + tupty, = sig.args + retty = sig.return_type + return context.get_constant(retty, len(tupty.types)) + +Proposed changes +'''''''''''''''' + +Review and streamline the API. Drop the requirement to write +``types.Kind(...)`` explicitly. Remove the separate ``@implement`` +decorator and rename ``@builtin`` to ``@lower_builtin``, ``@builtin_attr`` +to ``@lower_getattr``, etc. + +Add decorators to implement ``setattr()`` operations, named +``@lower_setattr`` and ``@lower_setattr_generic``. + + +Conversion from / to Python objects +----------------------------------- + +Some types need to be converted from or to Python objects, if they can +be passed as function arguments or returned from a function. The +corresponding boxing and unboxing operations are implemented using +a generic function. The implementations for standard Numba types +are in :mod:`numba.targets.boxing`. For example, here is the boxing +implementation for a boolean value:: + + @box(types.Boolean) + def box_bool(c, typ, val): + longval = c.builder.zext(val, c.pyapi.long) + return c.pyapi.bool_from_long(longval) + +Proposed changes +'''''''''''''''' + +Change the implementation signature from ``(c, typ, val)`` to +``(typ, val, c)``, to match the one chosen for the ``typeof_impl`` +generic function.
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/external-memory-management.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/external-memory-management.rst new file mode 100644 index 0000000000000000000000000000000000000000..38878d670c50f1601c2dc8526de3a1874ea70187 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/external-memory-management.rst @@ -0,0 +1,922 @@ +.. _nbep-7: + +=============================================== +NBEP 7: CUDA External Memory Management Plugins +=============================================== + +:Author: Graham Markall, NVIDIA +:Contributors: Thomson Comer, Peter Entschev, Leo Fang, John Kirkham, Keith Kraus +:Date: March 2020 +:Status: Final + +Background and goals +-------------------- + +The :ref:`CUDA Array Interface ` enables sharing of data +between different Python libraries that access CUDA devices. However, each +library manages its own memory distinctly from the others. For example: + + +* `Numba `_ internally manages memory for the creation + of device and mapped host arrays. +* `The RAPIDS libraries `_ (cuDF, cuML, etc.) use the `Rapids + Memory Manager `_ for allocating device + memory. +* `CuPy `_ includes a `memory pool + implementation `_ + for both device and pinned memory. + +The goal of this NBEP is to describe a plugin interface that enables Numba's +internal memory management to be replaced with an external memory manager by the +user. When the plugin interface is in use, Numba no longer directly allocates or +frees any memory when creating arrays, but instead requests allocations and +frees through the external manager. + +Requirements +------------ + +Provide an *External Memory Manager (EMM)* interface in Numba. + + +* When the EMM is in use, Numba will make all memory allocations using the EMM. + It will never directly call functions such as ``cuMemAlloc``\ , ``cuMemFree``\ , etc.
+* When not using an *External Memory Manager (EMM)*\ , Numba's present behaviour + is unchanged (at the time of writing, the current version is the 0.48 + release). + +If an EMM is to be used, it will entirely replace Numba's internal memory +management for the duration of program execution. An interface for setting the +memory manager will be provided. + +Device vs. Host memory +^^^^^^^^^^^^^^^^^^^^^^^ + +An EMM will always take responsibility for the management of device memory. +However, not all CUDA memory management libraries also support managing host +memory, so a facility for Numba to continue the management of host memory +whilst ceding control of device memory to the EMM will be provided. + +Deallocation strategies +^^^^^^^^^^^^^^^^^^^^^^^ + +Numba's internal memory management uses a :ref:`deallocation strategy +` designed to increase efficiency by deferring +deallocations until a significant quantity are pending. It also provides a +mechanism for preventing deallocations entirely during critical sections, using +the :func:`~numba.cuda.defer_cleanup` context manager. + + +* When the EMM is not in use, the deallocation strategy and operation of + ``defer_cleanup`` remain unchanged. +* When the EMM is in use, the deallocation strategy is implemented by the EMM, + and Numba's internal deallocation mechanism is not used. For example: + + * A similar strategy to Numba's could be implemented by the EMM, or + * Deallocated memory might immediately be returned to a memory pool. + +* The ``defer_cleanup`` context manager may behave differently with an EMM - an + EMM should be accompanied by documentation of the behaviour of the + ``defer_cleanup`` context manager when it is in use. + + * For example, a pool allocator could always immediately return memory to a + pool even when the context manager is in use, but could choose + not to free empty pools until ``defer_cleanup`` is not in use. 
+ +Management of other objects +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In addition to memory, Numba manages the allocation and deallocation of +:ref:`events `, :ref:`streams `, and modules (a module is a +compiled object, which is generated from ``@cuda.jit``\ -ted functions). The +management of streams, events, and modules should be unchanged by the presence +or absence of an EMM. + +Asynchronous allocation / deallocation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +An asynchronous memory manager might provide the facility for an allocation or +free to take a CUDA stream and execute asynchronously. For freeing, this is +unlikely to cause issues since it operates at a layer beneath Python, but for +allocations this could be problematic if the user tries to then launch a kernel +on the default stream from this asynchronous memory allocation. + +The interface described in this proposal will not be required to support +asynchronous allocation and deallocation, and as such these use cases will not +be considered further. However, nothing in this proposal should preclude the +straightforward addition of asynchronous operations in future versions of the +interface. + +Non-requirements +^^^^^^^^^^^^^^^^ + +In order to minimise complexity and constrain this proposal to a reasonable +scope, the following will not be supported: + + +* Using different memory manager implementations for different contexts. All + contexts will use the same memory manager implementation - either the Numba + internal implementation or an external implementation. +* Changing the memory manager once execution has begun. It is not practical to + change the memory manager and retain all allocations. Cleaning up the entire + state and then changing to a different memory allocator (rather than starting + a new process) appears to be a rather niche use case. +* Any changes to the ``__cuda_array_interface__`` to further define its semantics, + e.g. 
for acquiring / releasing memory as discussed in `Numba Issue + #4886 `_ - these are independent, + and can be addressed as part of separate proposals. +* Managed memory / UVM is not supported. At present Numba does not support UVM - + see `Numba Issue #4362 `_ for + discussion of support. + +Interface for Plugin developers +------------------------------- + +New classes and functions will be added to ``numba.cuda.cudadrv.driver``: + +* ``BaseCUDAMemoryManager`` and ``HostOnlyCUDAMemoryManager``\ : base classes for + EMM plugin implementations. +* ``set_memory_manager``: a method for registering an external memory manager with + Numba. + +These will be exposed through the public API, in the ``numba.cuda`` module. +Additionally, some classes that are already part of the `driver` module will be +exposed as part of the public API: + +* ``MemoryPointer``: used to encapsulate information about a pointer to device + memory. +* ``MappedMemory``: used to hold information about host memory that is mapped into + the device address space (a subclass of ``MemoryPointer``\ ). +* ``PinnedMemory``: used to hold information about host memory that is pinned (a + subclass of ``mviewbuf.MemAlloc``\ , a class internal to Numba). + +As an alternative to calling the ``set_memory_manager`` function, an environment +variable can be used to set the memory manager. The value of the environment +variable should be the name of the module containing the memory manager in its +global scope, named ``_numba_memory_manager``\ : + +.. code-block:: + + export NUMBA_CUDA_MEMORY_MANAGER="" + +When this variable is set, Numba will automatically use the memory manager from +the specified module. Calls to ``set_memory_manager`` will issue a warning, but +otherwise be ignored. + +Plugin Base Classes +^^^^^^^^^^^^^^^^^^^ + +An EMM plugin is implemented by inheriting from the ``BaseCUDAMemoryManager`` +class, which is defined as: + +.. 
code-block:: python + + class BaseCUDAMemoryManager(object, metaclass=ABCMeta): + @abstractmethod + def memalloc(self, size): + """ + Allocate on-device memory in the current context. Arguments: + + - `size`: Size of allocation in bytes + + Returns: a `MemoryPointer` to the allocated memory. + """ + + @abstractmethod + def memhostalloc(self, size, mapped, portable, wc): + """ + Allocate pinned host memory. Arguments: + + - `size`: Size of the allocation in bytes + - `mapped`: Whether the allocated memory should be mapped into the CUDA + address space. + - `portable`: Whether the memory will be considered pinned by all + contexts, and not just the calling context. + - `wc`: Whether to allocate the memory as write-combined. + + Returns a `MappedMemory` or `PinnedMemory` instance that owns the + allocated memory, depending on whether the region was mapped into + device memory. + """ + + @abstractmethod + def mempin(self, owner, pointer, size, mapped): + """ + Pin a region of host memory that is already allocated. Arguments: + + - `owner`: An object owning the memory - e.g. a `DeviceNDArray`. + - `pointer`: The pointer to the beginning of the region to pin. + - `size`: The size of the region to pin. + - `mapped`: Whether the region should also be mapped into device memory. + + Returns a `MappedMemory` or `PinnedMemory` instance that refers to the + allocated memory, depending on whether the region was mapped into device + memory. + """ + + @abstractmethod + def initialize(self): + """ + Perform any initialization required for the EMM plugin to be ready to + use. + """ + + @abstractmethod + def get_memory_info(self): + """ + Returns (free, total) memory in bytes in the context + """ + + @abstractmethod + def get_ipc_handle(self, memory): + """ + Return an `IpcHandle` from a GPU allocation. Arguments: + + - `memory`: A `MemoryPointer` for which the IPC handle should be created. 
+ """ + + @abstractmethod + def reset(self): + """ + Clear up all memory allocated in this context. + """ + + @abstractmethod + def defer_cleanup(self): + """ + Returns a context manager that ensures the implementation of deferred + cleanup whilst it is active. + """ + + @property + @abstractmethod + def interface_version(self): + """ + Returns an integer specifying the version of the EMM Plugin interface + supported by the plugin implementation. Should always return 1 for + implementations described in this proposal. + """ + +All of the methods of an EMM plugin are called from within Numba - they never +need to be invoked directly by a Numba user. + +The ``initialize`` method is called by Numba prior to any memory allocations +being requested. This gives the EMM an opportunity to initialize any data +structures, etc., that it needs for its normal operations. The method may be +called multiple times during the lifetime of the program - subsequent calls +should not invalidate or reset the state of the EMM. + +The ``memalloc``\ , ``memhostalloc``\ , and ``mempin`` methods are called when Numba +requires an allocation of device or host memory, or pinning of host memory. +Device memory should always be allocated in the current context. + +``get_ipc_handle`` is called when an IPC handle for an array is required. Note +that there is no method for closing an IPC handle - this is because the +``IpcHandle`` object constructed by ``get_ipc_handle`` contains a ``close()`` method +as part of its definition in Numba, which closes the handle by calling +``cuIpcCloseMemHandle``. It is expected that this is sufficient for general use +cases, so no facility for customising the closing of IPC handles is provided by +the EMM Plugin interface. + +``get_memory_info`` may be called at any time after ``initialize``. + +``reset`` is called as part of resetting a context. Numba does not normally call +reset spontaneously, but it may be called at the behest of the user. 
Calls to +``reset`` may even occur before ``initialize`` is called, so the plugin should be +robust against this occurrence. + +``defer_cleanup`` is called when the ``numba.cuda.defer_cleanup`` context manager +is used from user code. + +``interface_version`` is called by Numba when the memory manager is set, to +ensure that the version of the interface implemented by the plugin is +compatible with the version of Numba in use. + +Representing pointers +^^^^^^^^^^^^^^^^^^^^^ + +Device Memory +~~~~~~~~~~~~~ + +The ``MemoryPointer`` class is used to represent a pointer to memory. Whilst there +are various details of its implementation, the only aspect relevant to EMM +plugin development is its initialization. The ``__init__`` method has the +following interface: + +.. code-block:: python + + class MemoryPointer: + def __init__(self, context, pointer, size, owner=None, finalizer=None): + + +* ``context``\ : The context in which the pointer was allocated. +* ``pointer``\ : A ``ctypes`` pointer (e.g. ``ctypes.c_uint64``\ ) holding the address of + the memory. +* ``size``\ : The size of the allocation in bytes. +* ``owner``\ : The owner is sometimes set by the internals of the class, or used for + Numba's internal memory management, but need not be provided by the writer of + an EMM plugin - the default of ``None`` should always suffice. +* ``finalizer``\ : A method that is called when the last reference to the + ``MemoryPointer`` object is released. Usually this will make a call to the + external memory management library to inform it that the memory is no longer + required, and that it could potentially be freed (though the EMM is not + required to free it immediately). + +Host Memory +~~~~~~~~~~~ + +Memory mapped into the CUDA address space (which is created when the +``memhostalloc`` or ``mempin`` methods are called with ``mapped=True``\ ) is managed +using the ``MappedMemory`` class: + +.. 
code-block:: python + + class MappedMemory(AutoFreePointer): + def __init__(self, context, pointer, size, owner, finalizer=None): + + +* ``context``\ : The context in which the pointer was allocated. +* ``pointer``\ : A ``ctypes`` pointer (e.g. ``ctypes.c_void_p``\ ) holding the address of + the allocated memory. +* ``size``\ : The size of the allocated memory in bytes. +* ``owner``\ : A Python object that owns the memory, e.g. a ``DeviceNDArray`` + instance. +* ``finalizer``\ : A method that is called when the last reference to the + ``MappedMemory`` object is released. For example, this method could call + ``cuMemFreeHost`` on the pointer to deallocate the memory immediately. + +Note that the inheritance from ``AutoFreePointer`` is an implementation detail and +need not concern the developer of an EMM plugin - ``MemoryPointer`` is higher in +the MRO of ``MappedMemory``. + +Memory that is only in the host address space and has been pinned is represented +with the ``PinnedMemory`` class: + +.. code-block:: python + + class PinnedMemory(mviewbuf.MemAlloc): + def __init__(self, context, pointer, size, owner, finalizer=None): + + +* ``context``\ : The context in which the pointer was allocated. +* ``pointer``\ : A ``ctypes`` pointer (e.g. ``ctypes.c_void_p``\ ) holding the address of + the pinned memory. +* ``size``\ : The size of the pinned region in bytes. +* ``owner``\ : A Python object that owns the memory, e.g. a ``DeviceNDArray`` + instance. +* ``finalizer``\ : A method that is called when the last reference to the + ``PinnedMemory`` object is released. This method could e.g. call + ``cuMemHostUnregister`` on the pointer to unpin the memory immediately. + +Providing device memory management only +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Some external memory managers will support management of on-device memory but +not host memory. 
To make it easy to implement an EMM plugin using one of these +managers, Numba will provide a memory manager class with implementations of the +``memhostalloc`` and ``mempin`` methods. An abridged definition of this class +follows: + +.. code-block:: python + + class HostOnlyCUDAMemoryManager(BaseCUDAMemoryManager): + # Unimplemented methods: + # + # - memalloc + # - get_memory_info + + def memhostalloc(self, size, mapped, portable, wc): + # Implemented. + + def mempin(self, owner, pointer, size, mapped): + # Implemented. + + def initialize(self): + # Implemented. + # + # Must be called by any subclass when its initialize() method is + # called. + + def reset(self): + # Implemented. + # + # Must be called by any subclass when its reset() method is + # called. + + def defer_cleanup(self): + # Implemented. + # + # Must be called by any subclass when its defer_cleanup() method is + # called. + +A class can subclass the ``HostOnlyCUDAMemoryManager`` and then it only needs to +add implementations of methods for on-device memory. Any subclass must observe +the following rules: + + +* If the subclass implements ``__init__``\ , then it must also call + ``HostOnlyCUDAMemoryManager.__init__``\ , as this is used to initialize some of + its data structures (\ ``self.allocations`` and ``self.deallocations``\ ). +* The subclass must implement ``memalloc`` and ``get_memory_info``. +* The ``initialize`` and ``reset`` methods perform initialisation of structures + used by the ``HostOnlyCUDAMemoryManager``. + + * If the subclass has nothing to do on initialisation (possibly) or reset + (unlikely) then it need not implement these methods. + * However, if it does implement these methods then it must also call the + methods from ``HostOnlyCUDAMemoryManager`` in its own implementations. 
+ +* Similarly if ``defer_cleanup`` is implemented, it should enter the context + provided by ``HostOnlyCUDAMemoryManager.defer_cleanup()`` prior to ``yield``\ ing (or in + the ``__enter__`` method) and release it prior to exiting (or in the ``__exit__`` + method). + +Import order +^^^^^^^^^^^^ + +The order in which Numba and the library implementing an EMM Plugin are +imported should not matter. For example, if ``rmm`` were to implement and register an EMM Plugin, +then: + +.. code-block:: python + + from numba import cuda + import rmm + +and + +.. code-block:: python + + import rmm + from numba import cuda + +are equivalent - this is because Numba does not initialize CUDA or allocate any +memory until the first call to a CUDA function - neither instantiating and +registering an EMM plugin, nor importing ``numba.cuda`` causes a call to a CUDA +function. + +Numba as a Dependency +^^^^^^^^^^^^^^^^^^^^^ + +Adding the implementation of an EMM Plugin to a library naturally makes Numba a +dependency of the library where it may not have been previously. In order to +make the dependency optional, if this is desired, one might conditionally +instantiate and register the EMM Plugin like: + +.. code-block:: python + + try: + import numba + from mylib.numba_utils import MyNumbaMemoryManager + numba.cuda.cudadrv.driver.set_memory_manager(MyNumbaMemoryManager) + except: + print("Numba not importable - not registering EMM Plugin") + +so that ``mylib.numba_utils``\ , which contains the implementation of the EMM +Plugin, is only imported if Numba is already present. If Numba is not available, +then ``mylib.numba_utils`` (which necessarily imports ``numba``\ ), will never be +imported. + +It is recommended that any library with an EMM Plugin includes at least some +environments with Numba for testing with the EMM Plugin in use, as well as some +environments without Numba, to avoid introducing an accidental Numba dependency. 
+ +Example implementation - A RAPIDS Memory Manager (RMM) Plugin +------------------------------------------------------------- + +An implementation of an EMM plugin within the `Rapids Memory Manager +(RMM) `_ is sketched out in this section. This is +intended to show an overview of the implementation in order to support the +descriptions above and to illustrate how the plugin interface can be used - +different choices may be made for a production-ready implementation. + +The plugin implementation consists of additions to `python/rmm/rmm.py +`_: + +.. code-block:: python + + # New imports: + from contextlib import contextmanager + # RMM already has Numba as a dependency, so these imports need not be guarded + # by a check for the presence of numba. + from numba.cuda import (HostOnlyCUDAMemoryManager, MemoryPointer, IpcHandle, + set_memory_manager) + + + # New class implementing the EMM Plugin: + class RMMNumbaManager(HostOnlyCUDAMemoryManager): + def memalloc(self, size): + # Allocates device memory using RMM functions. The finalizer for the + # allocated memory calls back to RMM to free the memory. + addr = librmm.rmm_alloc(bytesize, 0) + ctx = cuda.current_context() + ptr = ctypes.c_uint64(int(addr)) + finalizer = _make_finalizer(addr, stream) + return MemoryPointer(ctx, ptr, size, finalizer=finalizer) + + def get_ipc_handle(self, memory): + """ + Get an IPC handle for the memory with offset modified by the RMM memory + pool. + """ + # This implementation provides a functional implementation and illustrates + # what get_ipc_handle needs to do, but it is not a very "clean" + # implementation, and it relies on borrowing bits of Numba internals to + # initialise ipchandle. + # + # A more polished implementation might make use of additional functions in + # the RMM C++ layer for initialising IPC handles, and not use any Numba + # internals. 
+ ipchandle = (ctypes.c_byte * 64)() # IPC handle is 64 bytes + cuda.cudadrv.memory.driver_funcs.cuIpcGetMemHandle( + ctypes.byref(ipchandle), + memory.owner.handle, + ) + source_info = cuda.current_context().device.get_device_identity() + ptr = memory.device_ctypes_pointer.value + offset = librmm.rmm_getallocationoffset(ptr, 0) + return IpcHandle(memory, ipchandle, memory.size, source_info, + offset=offset) + + def get_memory_info(self): + # Returns a tuple of (free, total) using RMM functionality. + return get_info() # Function defined in rmm.py + + def initialize(self): + # Nothing required to initialize RMM here, but this method is added + # to illustrate that the super() method should also be called. + super().initialize() + + @contextmanager + def defer_cleanup(self): + # Does nothing to defer cleanup - a full implementation may choose to + # implement a different policy. + with super().defer_cleanup(): + yield + + @property + def interface_version(self): + # As required by the specification + return 1 + + # The existing _make_finalizer function is used by RMMNumbaManager: + def _make_finalizer(handle, stream): + """ + Factory to make the finalizer function. + We need to bind *handle* and *stream* into the actual finalizer, which + takes no args. + """ + + def finalizer(): + """ + Invoked when the MemoryPointer is freed + """ + librmm.rmm_free(handle, stream) + + return finalizer + + # Utility function register `RMMNumbaManager` as an EMM: + def use_rmm_for_numba(): + set_memory_manager(RMMNumbaManager) + + # To support `NUMBA_CUDA_MEMORY_MANAGER=rmm`: + _numba_memory_manager = RMMNumbaManager + +Example usage +^^^^^^^^^^^^^ + +A simple example that configures Numba to use RMM for memory management and +creates a device array is as follows: + +.. 
code-block:: python + + # example.py + import rmm + import numpy as np + + from numba import cuda + + rmm.use_rmm_for_numba() + + a = np.zeros(10) + d_a = cuda.to_device(a) + del(d_a) + print(rmm.csv_log()) + +Running this should result in output similar to the following: + +.. code-block:: + + Event Type,Device ID,Address,Stream,Size (bytes),Free Memory,Total Memory,Current Allocs,Start,End,Elapsed,Location + Alloc,0,0x7fae06600000,0,80,0,0,1,1.10549,1.1074,0.00191666,/numba/numba/cuda/cudadrv/driver.py:683 + Free,0,0x7fae06600000,0,0,0,0,0,1.10798,1.10921,0.00122238,/numba/numba/utils.py:678 + +Note that there is some scope for improvement in RMM for detecting the line +number at which the allocation / free occurred, but this is outside the scope of +the example in this proposal. + +Setting the memory manager through the environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Rather than calling ``rmm.use_rmm_for_numba()`` in the example above, the memory +manager could also be set to use RMM globally with an environment variable, so +the Python interpreter is invoked to run the example as: + +.. code-block:: + + NUMBA_CUDA_MEMORY_MANAGER="rmm.RMMNumbaManager" python example.py + +Numba internal changes +---------------------- + +This section is intended primarily for Numba developers - those with an interest +in the external interface for implementing EMM plugins may choose to skip over +this section. + +Current model / implementation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +At present, memory management is implemented in the +:class:`~numba.cuda.cudadrv.driver.Context` class. It maintains lists of +allocations and deallocations: + +* ``allocations`` is a ``numba.core.utils.UniqueDict``, created at context + creation time. +* ``deallocations`` is an instance of the ``_PendingDeallocs`` class, and is created + when ``Context.prepare_for_use()`` is called. 
+ +These are used to track allocations and deallocations of: + +* Device memory +* Pinned memory +* Mapped memory +* Streams +* Events +* Modules + +The ``_PendingDeallocs`` class implements the deferred deallocation strategy - +cleanup functions (such as ``cuMemFree``\ ) for the items above are added to its +list of pending deallocations by the finalizers of objects representing +allocations. These finalizers are run when the objects owning them are +garbage-collected by the Python interpreter. When the addition of a new +cleanup function to the deallocation list causes the number or size of pending +deallocations to exceed a configured ratio, the ``_PendingDeallocs`` object runs +deallocators for all items it knows about and then clears its internal pending +list. + +See :ref:`deallocation-behavior` for more details of this implementation. + +Proposed changes +^^^^^^^^^^^^^^^^ + +This section outlines the major changes that will be made to support the EMM +plugin interface - there will be various small changes to other parts of Numba +that will be required in order to adapt to these changes; an exhaustive list of +these is not provided. + +Context changes +~~~~~~~~~~~~~~~ + +The ``numba.cuda.cudadrv.driver.Context`` class will no longer directly allocate +and free memory. Instead, the context will hold a reference to a memory manager +instance, and its memory allocation methods will call into the memory manager, +e.g.: + +.. 
code-block:: python + + def memalloc(self, size): + return self.memory_manager.memalloc(size) + + def memhostalloc(self, size, mapped=False, portable=False, wc=False): + return self.memory_manager.memhostalloc(size, mapped, portable, wc) + + def mempin(self, owner, pointer, size, mapped=False): + if mapped and not self.device.CAN_MAP_HOST_MEMORY: + raise CudaDriverError("%s cannot map host memory" % self.device) + return self.memory_manager.mempin(owner, pointer, size, mapped) + + def prepare_for_use(self): + self.memory_manager.initialize() + + def get_memory_info(self): + return self.memory_manager.get_memory_info() + + def get_ipc_handle(self, memory): + return self.memory_manager.get_ipc_handle(memory) + + def reset(self): + # ... Already-extant reset logic, plus: + self.memory_manager.reset() + +The ``memory_manager`` member is initialised when the context is created. + +The ``memunpin`` method (not shown above but currently exists in the ``Context`` +class) has never been implemented - it presently raises a ``NotImplementedError``. +This method is arguably un-needed - pinned memory is immediately unpinned by its +finalizer, and unpinning before a finalizer runs would invalidate the state of +``PinnedMemory`` objects for which references are still held. It is proposed that +this is removed when making the other changes to the ``Context`` class. + +The ``Context`` class will still instantiate ``self.allocations`` and +``self.deallocations`` as before - these will still be used by the context to +manage the allocations and deallocations of events, streams, and modules, which +are not handled by the EMM plugin. + +New components of the ``driver`` module +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +* ``BaseCUDAMemoryManager``\ : An abstract class, as defined in the plugin interface + above. +* ``HostOnlyCUDAMemoryManager``\ : A subclass of ``BaseCUDAMemoryManager``\ , with the + logic from ``Context.memhostalloc`` and ``Context.mempin`` moved into it. 
This + class will also create its own ``allocations`` and ``deallocations`` members, + similarly to how the ``Context`` class creates them. These are used to manage + the allocations and deallocations of pinned and mapped host memory. +* ``NumbaCUDAMemoryManager``\ : A subclass of ``HostOnlyCUDAMemoryManager``\ , which + also contains an implementation of ``memalloc`` based on that presently existing + in the ``Context`` class. This is the default memory manager, and its use + preserves the behaviour of Numba prior to the addition of the EMM plugin + interface - that is, all memory allocation and deallocation for Numba arrays + is handled within Numba. + + * This class shares the ``allocations`` and ``deallocations`` members with its + parent class ``HostOnlyCUDAMemoryManager``\ , and it uses these for the + management of device memory that it allocates. + +* The ``set_memory_manager`` function, which sets a global pointing to the memory + manager class. This global initially holds ``NumbaCUDAMemoryManager`` (the + default). + +Staged IPC +~~~~~~~~~~ + +Staged IPC should not take ownership of the memory that it allocates. When the +default internal memory manager is in use, the memory allocated for the staging +array is already owned. When an EMM plugin is in use, it is not legitimate to +take ownership of the memory. + +This change can be made by applying the following small patch, which has been +tested to have no effect on the CUDA test suite: + +.. 
code-block:: diff + + diff --git a/numba/cuda/cudadrv/driver.py b/numba/cuda/cudadrv/driver.py + index 7832955..f2c1352 100644 + --- a/numba/cuda/cudadrv/driver.py + +++ b/numba/cuda/cudadrv/driver.py + @@ -922,7 +922,11 @@ class _StagedIpcImpl(object): + with cuda.gpus[srcdev.id]: + impl.close() + + - return newmem.own() + + return newmem + +Testing +~~~~~~~ + +Alongside the addition of appropriate tests for new functionality, there will be +some refactoring of existing tests required, but these changes are not +substantial. Tests of the deallocation strategy (e.g. ``TestDeallocation``\ , +``TestDeferCleanup``\ ) will need to be modified to ensure that they are +examining the correct set of deallocations. When an EMM plugin is in use, they +will need to be skipped. + +Prototyping / experimental implementation +----------------------------------------- + +Some prototype / experimental implementations have been produced to guide the +designs presented in this document. The current implementations can be found in: + + +* Numba branch: https://github.com/gmarkall/numba/tree/grm-numba-nbep-7. +* RMM branch: https://github.com/gmarkall/rmm/tree/grm-numba-nbep-7. +* CuPy implementation: + https://github.com/gmarkall/nbep-7/blob/master/nbep7/cupy_mempool.py - uses + an unmodified CuPy. + + * See `CuPy memory management + docs `_. + +Current implementation status +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +RMM Plugin +~~~~~~~~~~ + +For a minimal example, a simple allocation and free using RMM works as expected. +For the example code (similar to the RMM example above): + +.. code-block:: python + + import rmm + import numpy as np + + from numba import cuda + + rmm.use_rmm_for_numba() + + a = np.zeros(10) + d_a = cuda.to_device(a) + del(d_a) + print(rmm.csv_log()) + +We see the following output: + +.. 
code-block:: + + Event Type,Device ID,Address,Stream,Size (bytes),Free Memory,Total Memory,Current Allocs,Start,End,Elapsed,Location + Alloc,0,0x7f96c7400000,0,80,0,0,1,1.13396,1.13576,0.00180059,/numba/numba/cuda/cudadrv/driver.py:686 + Free,0,0x7f96c7400000,0,0,0,0,0,1.13628,1.13723,0.000956004,/numba/numba/utils.py:678 + +This output is similar to the expected output from the example usage presented +above (though note that the pointer addresses and timestamps vary compared to +the example), and provides some validation of the example use case. + +CuPy Plugin +~~~~~~~~~~~ + +.. code-block:: python + + from nbep7.cupy_mempool import use_cupy_mm_for_numba + import numpy as np + + from numba import cuda + + use_cupy_mm_for_numba() + + a = np.zeros(10) + d_a = cuda.to_device(a) + del(d_a) + +The prototype CuPy plugin has somewhat primitive logging, so we see the output: + +.. code-block:: + + Allocated 80 bytes at 7f004d400000 + Freeing 80 bytes at 7f004d400000 + +Numba CUDA Unit tests +^^^^^^^^^^^^^^^^^^^^^ + +As well as providing correct execution of a simple example, all relevant Numba +CUDA unit tests also pass with the prototype branch, for both the internal memory +manager and the RMM EMM Plugin. + +RMM +~~~ + +The unit test suite can be run with the RMM EMM Plugin with: + +.. code-block:: + + NUMBA_CUDA_MEMORY_MANAGER=rmm python -m numba.runtests numba.cuda.tests + +A summary of the unit test suite output is: + +.. code-block:: + + Ran 564 tests in 142.211s + + OK (skipped=11) + +When running with the built-in Numba memory management, the output is: + +.. code-block:: + + Ran 564 tests in 133.396s + + OK (skipped=5) + +i.e. the changes for using an external memory manager do not break the built-in +Numba memory management. There are an additional 6 skipped tests, from: + + +* ``TestDeallocation``\ : skipped as it specifically tests Numba's internal + deallocation strategy. 
+* ``TestDeferCleanup``\ : skipped as it specifically tests Numba's implementation of + deferred cleanup. +* ``TestCudaArrayInterface.test_ownership``\ : skipped as Numba does not own memory + when an EMM Plugin is used, but ownership is assumed by this test case. + +CuPy +~~~~ + +The test suite can be run with the CuPy plugin using: + +.. code-block:: + + NUMBA_CUDA_MEMORY_MANAGER=nbep7.cupy_mempool python -m numba.runtests numba.cuda.tests + +This plugin implementation is presently more primitive than the RMM +implementation, and results in some errors with the unit test suite: + +.. code-block:: + + Ran 564 tests in 111.699s + + FAILED (errors=8, skipped=11) + +The 8 errors are due to a lack of implementation of ``get_ipc_handle`` in the +CuPy EMM Plugin implementation. It is expected that this implementation will be +re-visited and completed so that CuPy can be used stably as an allocator for +Numba in the future. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/index.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..33efe8d53a7774b6bda3630532c56cb465e0c9cf --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/index.rst @@ -0,0 +1,35 @@ +=========================== +Numba Enhancement Proposals +=========================== + +Numba Enhancement Proposals (not really abbreviated "NEPs", since "NEP" +is already taken by the Numpy project) describe proposed changes to Numba. +They are modeled on Python Enhancement Proposals (PEPs) and Numpy Enhancement +Proposals, and are typically written up when important changes +(behavioural changes, feature additions...) to Numba are proposed. + +This page provides an overview of all proposals, making only a distinction +between the ones that have been implemented and those that have not been +implemented. + +Implemented proposals +--------------------- + +.. 
toctree:: + :maxdepth: 1 + + integer-typing.rst + external-memory-management.rst + +Other proposals +--------------- + +.. toctree:: + :maxdepth: 1 + + extension-points.rst + jit-classes.rst + cfunc.rst + type-inference.rst + typing_recursion.rst + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/integer-typing.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/integer-typing.rst new file mode 100644 index 0000000000000000000000000000000000000000..9093d7e6590e51131d37088bf2c3e8a77a6374b3 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/integer-typing.rst @@ -0,0 +1,186 @@ +.. _nbep-1: + +================================= +NBEP 1: Changes in integer typing +================================= + +:Author: Antoine Pitrou +:Date: July 2015 +:Status: Final + + +Current semantics +================= + +Type inference of integers in Numba currently has some subtleties +and some corner cases. The simple case is when some variable has an obvious +Numba type (for example because it is the result of a constructor call to a +Numpy scalar type such as ``np.int64``). That case suffers no ambiguity. + +The less simple case is when a variable doesn't bear such explicit +information. This can happen because it is inferred from a built-in Python +``int`` value, or from an arithmetic operation between two integers, or +other cases yet. Then Numba has a number of rules to infer the resulting +Numba type, especially its signedness and bitwidth. + +Currently, the generic case could be summarized as: *start small, +grow bigger as required*. Concretely: + +1. Each constant or pseudo-constant is inferred using the *smallest signed + integer type* that can correctly represent it (or, possibly, ``uint64`` + for positive integers between ``2**63`` and ``2**64 - 1``). +2. 
The result of an operation is typed so as to ensure safe representation + in the face of overflow and other magnitude increases (for example, + ``int32 + int32`` would be typed ``int64``). +3. As an exception, a Python ``int`` used as function argument is always + typed ``intp``, a pointer-size integer. This is to avoid the proliferation + of compiled specializations, as otherwise various integer bitwidths + in input arguments may produce multiple signatures. + +.. note:: + The second rule above (the "respect magnitude increases" rule) + reproduces Numpy's behaviour with arithmetic on scalar values. + Numba, however, has different implementation and performance constraints + than Numpy scalars. + + It is worth noting, by the way, that Numpy arrays do not implement + said rule (i.e. ``array(int32) + array(int32)`` is typed ``array(int32)``, + not ``array(int64)``). This is probably because it makes performance more + controllable. + +This has several non-obvious side-effects: + +1. It is difficult to predict the precise type of a value inside a function, + after several operations. The basic operands in an expression tree + may for example be ``int8`` but the end result may be ``int64``. Whether + this is desirable or not is an open question; it is good for correctness, + but potentially bad for performance. + +2. In trying to follow the correctness over predictability rule, some values + can actually leave the integer realm. For example, ``int64 + uint64`` + is typed ``float64`` in order to avoid magnitude losses (but incidentally + will lose precision on large integer values...), again following Numpy's + semantics for scalars. This is usually not intended by the user. + +3. More complicated scenarios can produce unexpected errors at the type unification + stage. 
An example is at `Github issue 1299 `_, + the gist of which is reproduced here:: + + @jit(nopython=True) + def f(): + variable = 0 + for i in range(1): + variable = variable + 1 + return np.arange(variable) + + At the time of this writing, this fails compiling, on a 64-bit system, + with the error:: + + numba.errors.TypingError: Failed at nopython (nopython frontend) + Can't unify types of variable '$48.4': $48.4 := {array(int32, 1d, C), array(int64, 1d, C)} + + People expert with Numba's type unification system can understand why. + But the user is caught in mystery. + + +Proposal: predictable width-conserving typing +============================================= + +We propose to turn the current typing philosophy on its head. Instead +of "*start small and grow as required*", we propose "*start big and keep +the width unchanged*". + +Concretely: + +1. The typing of Python ``int`` values used as function arguments doesn't + change, as it works satisfyingly and doesn't surprise the user. + +2. The typing of integer *constants* (and pseudo-constants) changes to match + the typing of integer arguments. That is, every non-explicitly typed + integer constant is typed ``intp``, the pointer-sized integer; except for + the rare cases where ``int64`` (on 32-bit systems) or ``uint64`` is + required. + +3. Operations on integers promote bitwidth to ``intp``, if smaller, otherwise + they don't promote. For example, on a 32-bit machine, ``int8 + int8`` + is typed ``int32``, as is ``int32 + int32``. However, ``int64 + int64`` + is typed ``int64``. + +4. Furthermore, mixed operations between signed and unsigned fall back to + signed, while following the same bitwidth rule. For example, on a + 32-bit machine, ``int8 + uint16`` is typed ``int32``, as is + ``uint32 + int32``. + + +Proposal impact +=============== + +Semantics +--------- + +With this proposal, the semantics become clearer. 
Regardless of whether +the arguments and constants of a function were explicitly typed or not, +the results of various expressions at any point in the function have +easily predictable types. + +When using built-in Python ``int``, the user gets acceptable magnitude +(32 or 64 bits depending on the system's bitness), and the type remains +the same across all computations. + +When explicitly using smaller bitwidths, intermediate results don't +suffer from magnitude loss, since their bitwidth is promoted to ``intp``. + +There is also less potential for annoyances with the type unification +system as demonstrated above. The user would have to force several +different types to be faced with such an error. + +One potential cause for concern is the discrepancy with Numpy's scalar +semantics; but at the same time this brings Numba scalar semantics closer +to array semantics (both Numba's and Numpy's), which seems a desirable +outcome as well. + +It is worth pointing out that some sources of integer numbers, such +as the ``range()`` built-in, always yield 32-bit integers or larger. +This proposal could be an opportunity to standardize them on ``intp``. + +Performance +----------- + +Except in trivial cases, it seems unlikely that the current "best fit" +behaviour for integer constants really brings a performance benefit. After +all, most integers in Numba code would either be stored in arrays (with +well-known types, chosen by the user) or be used as indices, where a ``int8`` +is highly unlikely to fare better than a ``intp`` (actually, it may be worse, +if LLVM isn't able to optimize away the required sign-extension). + +As a side note, the default use of ``intp`` rather than ``int64`` +ensures that 32-bit systems won't suffer from poor arithmetic performance. + +Implementation +-------------- + +Optimistically, this proposal may simplify some Numba internals a bit. +Or, at least, it doesn't threaten to make them significantly more complicated. 
+ +Limitations +----------- + +This proposal doesn't really solve the combination of signed and unsigned +integers. It is geared mostly at solving the bitwidth issues, which are +a somewhat common cause of pain for users. Unsigned integers are in +practice very uncommon in Numba-compiled code, except when explicitly +asked for, and therefore much less of a pain point. + +On the bitwidth front, 32-bit systems could still show discrepancies based +on the values of constants: if a constant is too large to fit in 32 bits, +it is typed ``int64``, which propagates through other computations. +This would be a reminiscence of the current behaviour, but rarer and much +more controlled still. + +Long-term horizon +----------------- + +While we believe this proposal makes Numba's behaviour more regular and more +predictable, it also pulls it further from general compatibility with pure +Python semantics, where users can assume arbitrary-precision integers without +any truncation issues. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/jit-classes.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/jit-classes.rst new file mode 100644 index 0000000000000000000000000000000000000000..2cd33d42ca8638601c6e718925c886469b6c654b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/jit-classes.rst @@ -0,0 +1,231 @@ +=================== +NBEP 3: JIT Classes +=================== + +:Author: Siu Kwan Lam +:Date: Dec 2015 +:Status: Draft + +Introduction +============ + +Numba does not yet support user-defined classes. +Classes provide useful abstraction and promote modularity when used +right. In the simplest sense, a class specifies the set of data and +operations as attributes and methods, respectively. +A class instance is an instantiation of that class. +This proposal will focus on supporting this simple usecase of classes--with +just attributes and methods. 
Other features, such as class methods, static +methods, and inheritance are deferred to another proposal, but we believe +these features can be easily implemented given the foundation described here. + + +Proposal: jit-classes +===================== + +A JIT-class is more restricted than a Python class. +We will focus on the following operations on a class and its instance: + +* Instantiation: create an instance of a class using the class object as the + constructor: ``cls(*args, **kwargs)`` +* Destruction: remove resources allocated during instantiation and release + all references to other objects. +* Attribute access: loading and storing attributes using ``instance.attr`` + syntax. +* Method access: loading methods using ``instance.method`` syntax. + +With these operations, a class object (not the instance) does not need to be +materialized. Using the class object as a constructor is fully resolved (a +runtime implementation is picked) during the typing phase in the compiler. +This means **a class object will not be first class**. On the other hand, +implementing a first-class class object will require an "interface" type, +or the type of class. + +The instantiation of a class will allocate resources for storing the data +attributes. This is described in the "Storage model" section. Methods are +never stored in the instance. They are information attached to the class. +Since a class object only exists in the type domain, the methods will also be +fully resolved at the typing phase. Again, numba does not have first-class +function values and each function type maps uniquely to each function +implementation (this needs to be changed to support function values as arguments). + +A class instance can contain other NRT reference-counted objects as attributes. +To properly clean up an instance, a destructor is called when the reference +count of the instance is dropped to zero. This is described in the +"Reference count and destructor" section. 
+ +Storage model +~~~~~~~~~~~~~ + +For compatibility with C, attributes are stored in a simple plain-old-data +structure. Each attribute is stored in a user-defined order in a padded +(for proper alignment), contiguous memory region. An instance that contains +three fields of int32, float32, complex64 will be compatible with the following +C structure:: + + struct { + int32 field0; + float32 field1; + complex64 field2; + }; + +This will also be compatible with an aligned NumPy structured dtype. + + +Methods +~~~~~~~ + +Methods are regular functions that can be bound to an instance. +They can be compiled as regular functions by numba. +The operation ``getattr(instance, name)`` (getting an attribute ``name`` from +``instance``) binds the instance to the requested method at runtime. + + +The special ``__init__`` method is also handled like regular functions. + + +``__del__`` is not supported at this time. + + +Reference count and destructor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +An instance of jit-class is reference-counted by NRT. Since it may contain +other NRT tracked objects, it must call a destructor when its reference count +drops to zero. The destructor will decrement the reference count of all +attributes by one. + +At this time, there is no support for user defined ``__del__`` method. + +Proper cleanup for cyclic references is not handled at this time. +Cycles will cause memory leaks. + +Type inference +~~~~~~~~~~~~~~ + +So far we have not described the type of the attributes or the methods. +Type information is necessary to materialize the instance (e.g. allocate the +storage). The simplest way is to let user provide the type of each attribute +as well as the ordering; for instance:: + + dct = OrderedDict() + dct['x'] = int32 + dct['y'] = float32 + +Allowing user to supply an ordered dictionary will provide the name, ordering +and types of the attributes. 
However, this statically typed semantic is not as +flexible as the Python semantic which behaves like a generic class. + +Inferring the type of attributes is difficult. In a previous attempt to +implement JIT classes, the ``__init__`` method is specialized to capture +the type stored into the attributes. Since the method can contain arbitrary +logic, the problem can become a dependent typing problem if types are assigned +conditionally depending on the value. (Very few languages implement dependent +typing and those that does are mostly theorem provers.) + +Example: typing function using an OrderedDict +--------------------------------------------- + +.. code-block:: python + + spec = OrderedDict() + spec['x'] = numba.int32 + spec['y'] = numba.float32 + + @jitclass(spec) + class Vec(object): + def __init__(self, x, y): + self.x = x + self.y = y + + def add(self, dx, dy): + self.x += dx + self.y += dy + +Example: typing function using a list of 2-tuples +------------------------------------------------- + +.. code-block:: python + + spec = [('x', numba.int32), + ('y', numba.float32)] + + @jitclass(spec) + class Vec(object): + ... + +Creating multiple jitclasses from a single class object +------------------------------------------------------- + +The `jitclass(spec)` decorator creates a new jitclass type even when applied to +the same class object and the same type specification. + +.. code-block:: python + + class Vec(object): + ... + + Vec1 = jitclass(spec)(Vec) + Vec2 = jitclass(spec)(Vec) + # Vec1 and Vec2 are two different jitclass types + +Usage from the Interpreter +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When constructing a new instance of a jitclass, a "box" is created that wraps +the underlying jitclass instance from numba. Attributes and methods are +accessible from the interpreter. The actual implementation will be in numba +compiled code. Any Python object is converted to its native +representation for consumption in numba. 
Similarly, the returned value is +converted to its Python representation. As a result, there may be overhead in +manipulating jitclass instances in the interpreter. This overhead is minimal +and should be easily amortized by more efficient computation in the compiled +methods. + +Support for property, staticmethod and classmethod +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The use of ``property`` is accepted for getter and setter only. Deleter is not +supported. + +The use of ``staticmethod`` is not supported. + +The use of ``classmethod`` is not supported. + +Inheritance +~~~~~~~~~~~ + +Class inheritance is not considered in this proposal. The only accepted base +class for a jitclass is `object`. + +Supported targets +~~~~~~~~~~~~~~~~~~ + +Only the CPU target (including the parallel target) is supported. +GPU (e.g. CUDA and HSA) targets are supported via an immutable version of the +jitclass instance, which will be described in a separate NBEP. + + +Other properties +~~~~~~~~~~~~~~~~ + +Given: + +.. code-block:: python + + spec = [('x', numba.int32), + ('y', numba.float32)] + + @jitclass(spec) + class Vec(object): + ... + +* ``isinstance(Vec(1, 2), Vec)`` is True. +* ``type(Vec(1, 2))`` may not be ``Vec``. + +Future enhancements +~~~~~~~~~~~~~~~~~~~ + +This proposal has only described the basic semantic and functionality of a +jitclass. Additional features will be described in future enhancement +proposals. 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/np-where-override.py b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/np-where-override.py new file mode 100644 index 0000000000000000000000000000000000000000..109ba6a43658288de2594170db4de1ef331a0ed7 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/np-where-override.py @@ -0,0 +1,51 @@ +import numpy as np + +from numba.core import types +from numba.extending import overload + +@overload(np.where) +def where(cond, x, y): + """ + Implement np.where(). + """ + # Choose implementation based on argument types. + if isinstance(cond, types.Array): + # Array where() => return an array of the same shape + if all(ty.layout == 'C' for ty in (cond, x, y)): + def where_impl(cond, x, y): + """ + Fast implementation for C-contiguous arrays + """ + shape = cond.shape + if x.shape != shape or y.shape != shape: + raise ValueError("all inputs should have the same shape") + res = np.empty_like(x) + cf = cond.flat + xf = x.flat + yf = y.flat + rf = res.flat + for i in range(cond.size): + rf[i] = xf[i] if cf[i] else yf[i] + return res + else: + def where_impl(cond, x, y): + """ + Generic implementation for other arrays + """ + shape = cond.shape + if x.shape != shape or y.shape != shape: + raise ValueError("all inputs should have the same shape") + res = np.empty_like(x) + for idx, c in np.ndenumerate(cond): + res[idx] = x[idx] if c else y[idx] + return res + + else: + def where_impl(cond, x, y): + """ + Scalar where() => return a 0-dim array + """ + scal = x if cond else y + return np.full_like(scal, scal) + + return where_impl diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/recursion_callstack.svg b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/recursion_callstack.svg new file mode 100644 index 0000000000000000000000000000000000000000..7e23f30d5a84f123f90d68a780625315e494fdd2 --- /dev/null +++ 
b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/recursion_callstack.svg @@ -0,0 +1,4 @@ + + + + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/type-inference.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/type-inference.rst new file mode 100644 index 0000000000000000000000000000000000000000..24f3b262be6d786f133d8eaff21ac5a65100517d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/type-inference.rst @@ -0,0 +1,124 @@ +====================== +NBEP 5: Type Inference +====================== + +:Author: Siu Kwan Lam +:Date: Sept 2016 +:Status: Draft + + +This document describes the current type inference implementation in numba. + + +Introduction +============ + +Numba uses type information to ensure that every variable in the user code can +be correctly lowered (translated into a low-level representation). The type of +a variable describes the set of valid operations and available attributes. +Resolving this information during compilation avoids the overhead of type +checking and dispatching at runtime. However, Python is dynamically typed and +the user does not declare variable types. Since type information is absent, +we use type inference to reconstruct the missing information. + + +Numba Type Semantic +=================== + +Type inference operates on :term:`Numba IR`, a mostly static-single-assignment (SSA) +encoding of the Python bytecode. Conceptually, all intermediate values in the +Python code are explicitly assigned to a variable in the IR. Numba enforces +that each IR variable to have one type only. A user variable (from the Python +source code) can be mapped to multiple variables in the IR. They are *versions* +of a variable. Each time a user variable is assigned to, a new version is +created. From that point, all subsequent references will use the new version. +The user variable *evolves* as the function logic updates its type. Merge +points (e.g. 
subsequent block to an if-else, the loop body, etc..) in the control +flow need extra care. At each merge point, a new version is implicitly created +to merge the different variable versions from the incoming paths. +The merging of the variable versions may translate into an implicit cast. + +Numba uses function overloading to emulate Python duck-typing. The type of a +function can contain multiple call signatures that accept different argument +types and yield different return types. The process to decide the best +signature for an overloaded function is called *overload resolution*. +Numba partially implements the C++ overload resolution scheme +(`ISOCPP`_ 13.3 Overload Resolution). The scheme uses a "best fit" algorithm by +ranking each argument symmetrically. The five possible rankings in increasing +order of penalty are: + +* *Exact*: the expected type is the same as the actual type. +* *Promotion*: the actual type can be upcast to the expected type by extending + the precision without changing the behavior. +* *Safe conversion*: the actual type can be cast to the expected type by changing + the type without losing information. +* *Unsafe conversion*: the actual type can be cast to the expected type by + changing the type or downcasting the type even if it is imprecise. +* *No match*: no valid operation can convert the actual type to the expected type. + +It is possible to have an ambiguous resolution. For example, a function with +signatures ``(int16, int32)`` and ``(int32, int16)`` can become ambiguous if +presented with the argument types ``(int32, int32)``, because demoting either +argument to ``int16`` is equally "fit". Fortunately, numba can usually resolve +such ambiguity by compiling a new version with the exact signature +``(int32, int32)``. When compilation is disabled and there are multiple +signatures with equal fit, an exception is raised. 
+ +Type Inference +============== + +The type inference in numba has three important components---type +variable, constraint network, and typing context. + +* The *typing context* provides all the type information and typing related + operations, including the logic for type unification, and the logic for typing + of global and constant values. It defines the semantic of the language that + can be compiled by numba. + +* A *type variable* holds the type of each variable (in the Numba IR). + Conceptually, it is initialized to the universal type and, as it is re-assigned, + it stores a common type by unifying the new type with the existing type. The + common type must be able to represent values of the new type and the existing + type. Type conversion is applied as necessary and precision loss is + accepted for usability reason. + +* The *constraint network* is a dependency graph built from the IR. Each + node represents an operation in the Numba IR and updates at least one type + variable. There may be cycles due to loops in user code. + +The type inference process starts by seeding the argument types. These initial +types are propagated in the constraint network, which eventually fills all the +type variables. Due to cycles in the network, the process repeats until all +type variables converge or it fails with undecidable types. + +Type unification always returns a more "general" (quoted because unsafe conversion +is allowed) type. Types will converge to the least "general" type that +can represent all possible values that the variable can hold. Since unification +will never move down the type hierarchy and there is a single top type, the +universal type---``object``, the type inference is guaranteed to converge. + +A failure in type inference can be caused by two reasons. The first reason is user +error due to incorrect use of a type. This type of error will also trigger an +exception in regular python execution. 
The second reason is due to the use of an +unsupported feature, but the code is otherwise valid in regular python +execution. Upon an error, the type inference will set all types to the object +type. As a result, numba will fall back to *object-mode*. + +Since functions can be overloaded, the type inference needs to decide the +type signature used at each call site. The overload resolution is applied to +all known overload versions of the callee function described in *call-templates*. +A call-template can either be concrete or abstract. A concrete call-template +defines a fixed list of all possible signatures. An abstract call-template +defines the logic to compute the accepted signature and it is used to implement +generic functions. + +Numba-compiled functions are generic functions due to their ability to compile +new versions. When it sees a new set of argument types, it triggers type +inference to validate and determine the return type. When there are nested calls +for numba-compiled functions, each call-site triggers type inference. +This poses a problem to recursive functions because the type inference will also +be triggered recursively. Currently, simple single recursion is supported if +the signature is annotated by the user, which avoids unbounded recursion in +type inference that will never terminate. + +.. 
_ISOCPP: http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4296.pdf \ No newline at end of file diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/typing_recursion.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/typing_recursion.rst new file mode 100644 index 0000000000000000000000000000000000000000..a33a3a4b5a60767fd83042634c90e9cd0b4af7b9 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/proposals/typing_recursion.rst @@ -0,0 +1,129 @@ +======================== +NBEP 6: Typing Recursion +======================== + +:Author: Siu Kwan Lam +:Date: Sept 2016 +:Status: Draft + +Introduction +============ + +This document proposes an enhancement to the type inference algorithm to +support recursion without explicitly annotating the function signature. +As a result, the proposal enables numba to type-infer both self-recursive and +mutual-recursive functions under some limitations. In practice, these +limitations can be easily overcome by specifying a compilation order. + + +The Current State +================= + +Recursion support in numba is currently limited to self-recursion with explicit +type annotation for the function. This limitation comes from the inability to +determine the return type of a recursive call. This is because the callee is +either the current function (for self-recursion) or a parent function +(mutual-recursion) and its type inference process has been suspended while waiting for +the function-type of its callee. This results in the formation of a cyclic +dependency. For example, given a function ``foo()`` that calls ``bar()``, +which in turn calls ``foo()``:: + + def foo(x): + if x > 0: + return bar(x) + else: + return 1 + + def bar(x): + return foo(x - 1) + + +The type inference process of ``foo()`` depends on that of ``bar()``, +which depends on ``foo()``. Therefore ``foo()`` depends on itself and the type +inference algorithm cannot terminate. 
+ + +The Solution +============ + +The proposed solution has two components: + +1. The introduction of a compile-time *callstack* that tracks the compiling functions. +2. The allowance of a partial type inference on functions by leveraging the return type + on non-recursive control-flow paths. + +The compile-time callstack stores typing information of the functions being +compiled. Like an ordinary callstack, it pushes a new record every time a +function is "called". Since this occurs at compile-time, a "call" triggers +a compilation of the callee. + +To detect recursion, the compile-time callstack is searched bottom-up +(stack grows downward) for a record that matches the callee. +As the record contains a reference to the type inference state, +the type inference process can be resumed to determine the return type. + +Recall that the type inference process cannot be resumed normally because of the cyclic +dependency of the return type. In practice, we can assume that a useful +program must have a terminating condition, a path that does not recurse. So, +the type inference process can make an initial guess for the return-type at the recursive +call by using the return-type determined by the non-recursive paths. This +allows type information to propagate on the recursive paths to generate the +final return type, which is used to refine the type information by the +subsequent iteration in the type inference process. + +The following figure illustrates the compile-time callstack when the compiler +reaches the recursive call to ``foo()`` from ``bar()``: + +.. image:: recursion_callstack.svg + :width: 400px + +At this time, the type inference process of ``foo()`` is suspended and that of ``bar()`` +is active. The compiler can see that the callee is already compiling by +searching the callstack. Knowing that it is a recursive call, the compiler +can resume the type-inference on ``foo()`` by ignoring the paths that contain +recursive calls. 
This means only the ``else`` branch is considered and we can +easily tell that ``foo()`` returns an ``int`` in this case. The compiler will +then set the initial return type of ``foo()`` and ``bar()`` to ``int``. The +subsequent type propagation can use this information to complete the type +inference of both functions, unifying the return-type of all returning paths. + + +Limitations +=========== + +For the proposed type inference algorithm to terminate, it assumes that +at least one control path leads to a return-statement without undertaking +a recursive call. Should this not be the case, the algorithm will raise an +exception indicating a potential runaway recursion. + +For example:: + + @jit + def first(x): + # The recursing call must have a path that is non-recursing. + if x > 0: + return second(x) + else: + return 1 + + @jit + def second(x): + return third(x) + + @jit + def third(x): + return first(x - 1) + + +The ``first()`` function must be compiled first for the type inference algorithm to +complete successfully. Compiling any other function first will lead to a failure +in type inference. The type inference algorithm will treat it as a runaway +recursion due to the lack of a non-recursive exit in the recursive callee. + +For example, compiling ``second()`` first will move the recursive call to +``first()``. When the compiler tries to resume the type inference process of +``second()``, it will fail to find a non-recursive path. + +This is a small limitation and can be overcome easily by code restructuring or +precompiling in a specific order. 
+ diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/aot-compilation.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/aot-compilation.rst new file mode 100644 index 0000000000000000000000000000000000000000..a6d56ee3433b19540242027c01ea6d5cace215df --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/aot-compilation.rst @@ -0,0 +1,76 @@ +.. _aot-compilation: + +Ahead-of-Time compilation +========================= + +.. currentmodule:: numba.pycc + +.. class:: CC(extension_name, source_module=None) + + An object used to generate compiled extensions from Numba-compiled + Python functions. *extension_name* is the name of the extension + to be generated. *source_module* is the Python module + containing the functions; if ``None``, it is inferred by examining + the call stack. + + :class:`CC` instances have the following attributes and methods: + + .. attribute:: name + + (read-only attribute) The name of the extension module to be generated. + + .. attribute:: output_dir + + (read-write attribute) The directory the extension module will be + written into. By default it is the directory the *source_module* is + located in. + + .. attribute:: output_file + + (read-write attribute) The name of the file the extension module will + be written to. By default this follows the Python naming convention + for the current platform. + + .. attribute:: target_cpu + + (read-write attribute) The name of the CPU model to generate code for. + This will select the appropriate instruction set extensions. By + default, a generic CPU is selected in order to produce portable code. + + Recognized names for this attribute depend on the current architecture + and LLVM version. If you have LLVM installed, ``llc -mcpu=help`` + will give you a list. Examples on x86-64 are ``"ivybridge"``, + ``"haswell"``, ``"skylake"`` or ``"broadwell"``. You can also give + the value ``"host"`` which will select the current host CPU. 
+ + .. attribute:: verbose + + (read-write attribute) If true, print out information while + compiling the extension. False by default. + + .. decorator:: export(exported_name, sig) + + Mark the decorated function for compilation with the signature *sig*. + The compiled function will be exposed as *exported_name* in the + generated extension module. + + All exported names within a given :class:`CC` instance must be + distinct, otherwise an exception is raised. + + .. method:: compile() + + Compile all exported functions and generate the extension module + as specified by :attr:`output_dir` and :attr:`output_file`. + + .. method:: distutils_extension(**kwargs) + + Return a :py:class:`distutils.core.Extension` instance allowing + to integrate generation of the extension module in a conventional + ``setup.py``-driven build process. The optional *kwargs* let you + pass optional parameters to the :py:class:`~distutils.core.Extension` + constructor. + + In this mode of operation, it is not necessary to call :meth:`compile` + yourself. Also, :attr:`output_dir` and :attr:`output_file` will be + ignored. + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/deprecation.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/deprecation.rst new file mode 100644 index 0000000000000000000000000000000000000000..4271f0e70b6f86c66bdd12af9a90ab0864e7fc8d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/deprecation.rst @@ -0,0 +1,312 @@ +.. _deprecation: + +=================== +Deprecation Notices +=================== + +This section contains information about deprecation of behaviours, features and +APIs that have become undesirable/obsolete. Any information about the schedule +for their deprecation and reasoning behind the changes, along with examples, is +provided. 
However, first is a small section on how to suppress deprecation +warnings that may be raised from Numba so as to prevent warnings propagating +into code that is consuming Numba. + +Suppressing Deprecation warnings +================================ +All Numba deprecations are issued via ``NumbaDeprecationWarning`` or +``NumbaPendingDeprecationWarning`` s, to suppress the reporting of +these the following code snippet can be used:: + + from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning + import warnings + + warnings.simplefilter('ignore', category=NumbaDeprecationWarning) + warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning) + +The ``action`` used above is ``'ignore'``, other actions are available, see +`The Warnings Filter <https://docs.python.org/3/library/warnings.html#the-warnings-filter>`_ +documentation for more information. + +.. note:: It is **strongly recommended** that applications and libraries which + choose to suppress these warnings should pin their Numba dependency + to a suitable version because their users will no longer be aware of + the coming incompatibility. + +Deprecation of reflection for List and Set types +================================================ +Reflection (:term:`reflection`) is the jargon used in Numba to describe the +process of ensuring that changes made by compiled code to arguments that are +mutable Python container data types are visible in the Python interpreter when +the compiled function returns. Numba has for some time supported reflection of +``list`` and ``set`` data types and it is support for this reflection that +is scheduled for deprecation with a view to replacing it with a better implementation. + +Reason for deprecation +---------------------- +First recall that for Numba to be able to compile a function in ``nopython`` +mode all the variables must have a concrete type ascertained through type +inference. 
In simple cases, it is clear how to reflect changes to containers +inside ``nopython`` mode back to the original Python containers. However, +reflecting changes to complex data structures with nested container types (for +example, lists of lists of integers) quickly becomes impossible to do +efficiently and consistently. After a number of years of experience with this +problem, it is clear that providing this behaviour is both fraught with +difficulty and often leads to code which does not have good performance (all +reflected data has to go through special APIs to convert the data to native +formats at call time and then back to CPython formats at return time). As a +result of this, the sheer number of reported problems in the issue tracker, and +how well a new approach that was taken with ``typed.Dict`` (typed dictionaries) +has gone, the core developers have decided to deprecate the noted ``reflection`` +behaviour. + + +Example(s) of the impact +------------------------ + +At present only a warning of the upcoming change is issued. In future code such +as:: + + from numba import njit + + @njit + def foo(x): + x.append(10) + + a = [1, 2, 3] + foo(a) + +will require adjustment to use a ``typed.List`` instance, this typed container +is synonymous to the :ref:`feature-typed-dict`. An example of translating the +above is:: + + from numba import njit + from numba.typed import List + + @njit + def foo(x): + x.append(10) + + a = [1, 2, 3] + typed_a = List() + [typed_a.append(x) for x in a] + foo(typed_a) + +For more information about ``typed.List`` see :ref:`feature-typed-list`. Further +usability enhancements for this feature were made in the 0.47.0 release +cycle. + +Schedule +-------- +This feature will be removed with respect to this schedule: + +* Pending-deprecation warnings will be issued in version 0.44.0 +* Prominent notice will be given for a minimum of two releases prior to full + removal. 
+ +Recommendations +--------------- +Projects that need/rely on the deprecated behaviour should pin their dependency +on Numba to a version prior to removal of this behaviour, or consider following +replacement instructions that will be issued outlining how to adjust to the +change. + +Expected Replacement +-------------------- +As noted above ``typed.List`` will be used to permit similar functionality to +reflection in the case of ``list`` s, a ``typed.Set`` will provide the +equivalent for ``set`` (not implemented yet!). The advantages to this approach +are: + +* That the containers are typed means type inference has to work less hard. +* Nested containers (containers of containers of ...) are more easily + supported. +* Performance penalties currently incurred translating data to/from native + formats are largely avoided. +* Numba's ``typed.Dict`` will be able to use these containers as values. + + +Deprecation of :term:`object mode` `fall-back` behaviour when using ``@jit`` +============================================================================ +The ``numba.jit`` decorator has for a long time followed the behaviour of first +attempting to compile the decorated function in :term:`nopython mode` and should +this compilation fail it will `fall-back` and try again to compile but this time +in :term:`object mode`. It is this `fall-back` behaviour which is being +deprecated, the result of which will be that ``numba.jit`` will by default +compile in :term:`nopython mode` and :term:`object mode` compilation will +become `opt-in` only. 
+ + +Reason for deprecation +---------------------- +The `fall-back` has repeatedly caused confusion for users as seemingly innocuous +changes in user code can lead to drastic performance changes as code which may +have once compiled in :term:`nopython mode` mode may silently switch to +compiling in :term:`object mode` e.g:: + + from numba import jit + + @jit + def foo(): + l = [] + for x in range(10): + l.append(x) + return l + + foo() + + assert foo.nopython_signatures # this was compiled in nopython mode + + @jit + def bar(): + l = [] + for x in range(10): + l.append(x) + return reversed(l) # innocuous change, but no reversed support in nopython mode + + bar() + + assert not bar.nopython_signatures # this was not compiled in nopython mode + +Another reason to remove the `fall-back` is that it is confusing for the +compiler engineers developing Numba as it causes internal state problems that +are really hard to debug and it makes manipulating the compiler pipelines +incredibly challenging. + +Further, it has long been considered best practice that the +:term:`nopython mode` keyword argument in the ``numba.jit`` decorator is set to +``True`` and that any user effort spent should go into making code work in this +mode as there's very little gain if it does not. The result is that, as Numba +has evolved, the amount of use :term:`object mode` gets in practice and its +general utility has decreased. It can be noted that there are some minor +improvements available through the notion of :term:`loop-lifting`, the cases of +this being used in practice are, however, rare and often a legacy from use of +less-recent Numba whereby such behaviour was better accommodated/the use of +``@jit`` with `fall-back` was recommended. + + +Example(s) of the impact +------------------------ +At present a warning of the upcoming change is issued if ``@jit`` decorated code +uses the `fall-back` compilation path. 
In future code such as:: + + @jit + def bar(): + l = [] + for x in range(10): + l.append(x) + return reversed(l) + + bar() + +will simply not compile, a ``TypingError`` would be raised. + +Schedule +-------- +This feature will be removed with respect to this schedule: + +* Deprecation warnings will be issued in version 0.44.0 +* Prominent notice will be given for a minimum of two releases prior to full + removal. + +Recommendations +--------------- +Projects that need/rely on the deprecated behaviour should pin their dependency +on Numba to a version prior to removal of this behaviour. Alternatively, to +accommodate the scheduled deprecations, users with code compiled at present with +``@jit`` can supply the ``nopython=True`` keyword argument, if the code +continues to compile then the code is already ready for this change. If the code +does not compile, continue using the ``@jit`` decorator without +``nopython=True`` and profile the performance of the function. Then remove the +decorator and again check the performance of the function. If there is no +benefit to having the ``@jit`` decorator present consider removing it! If there +is benefit to having the ``@jit`` decorator present, then to be future proof +supply the keyword argument ``forceobj=True`` to ensure the function is always +compiled in :term:`object mode`. + + +.. _deprecation-strict-strides: + + +Deprecation of eager compilation of CUDA device functions +========================================================= + +In future versions of Numba, the ``device`` kwarg to the ``@cuda.jit`` decorator +will be obviated, and whether a device function or global kernel is compiled will +be inferred from the context. With respect to kernel / device functions and lazy +/ eager compilation, four cases were handled: + +1. ``device=True``, eager compilation with a signature provided +2. ``device=False``, eager compilation with a signature provided +3. ``device=True``, lazy compilation with no signature +4. 
``device=False``, lazy compilation with no signature + +The latter two cases can be differentiated without the ``device`` kwarg, because +it can be inferred from the calling context - if the call is from the host, then +a global kernel should be compiled, and if the call is from a kernel or another +device function, then a device function should be compiled. + +The first two cases cannot be differentiated in the absence of the ``device`` +kwarg - without it, it will not be clear from a signature alone whether a device +function or global kernel should be compiled. In order to resolve this, device +functions will no longer be eagerly compiled. When a signature is provided to a +device function, it will only be used to enforce the types of arguments that +the function accepts. + +.. note:: + + In previous releases this notice stated that support for providing + signatures to device functions would be removed completely - however, this + precludes the common use case of enforcing the types that can be passed to a + device function (and the automatic insertion of casts that it implies) so + this notice has been updated to retain support for passing signatures. + + +Schedule +-------- + +- In Numba 0.54: Eager compilation of device functions will be deprecated. +- In Numba 0.55: Eager compilation of device functions will be unsupported and + the provision of signatures for device functions will only enforce casting. + + +Deprecation and removal of ``numba.core.base.BaseContext.add_user_function()`` +============================================================================== + +``add_user_function()`` offered the same functionality as +``insert_user_function()``, only with a check that the function has already +been inserted at least once. It is now removed as it was no longer used +internally and it was expected that it was not used externally. + +Recommendations +--------------- + +Replace any uses of ``add_user_function()`` with ``insert_user_function()``. 
+ +Schedule +-------- + +- In Numba 0.55: ``add_user_function()`` was deprecated. +- In Numba 0.56: ``add_user_function()`` was removed. + + +Deprecation and removal of CUDA Toolkits < 10.2 and devices with CC < 5.3 +========================================================================= + +- Support for CUDA toolkits less than 10.2 was deprecated and removed. +- Support for devices with Compute Capability < 5.3 is deprecated and will be + removed in the future. + + +Recommendations +--------------- + +- For devices of Compute Capability 3.0 and 3.2, Numba 0.55.1 or earlier will + be required. +- CUDA toolkit 10.2 or later (ideally 11.2 or later) should be installed. + +Schedule +-------- + +- In Numba 0.55.1: support for CC < 5.3 and CUDA toolkits < 10.2 was deprecated. +- In Numba 0.56: support for CC < 3.5 and CUDA toolkits < 10.2 was removed. +- In Numba 0.57: support for CC < 5.3 will be removed. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/envvars.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/envvars.rst new file mode 100644 index 0000000000000000000000000000000000000000..ec28ed816e8a67c24c5f31d5fb909ca0843d024f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/envvars.rst @@ -0,0 +1,589 @@ +.. _numba-envvars: + +Environment variables +===================== + +.. note:: This section relates to environment variables that impact Numba's + runtime, for compile time environment variables see + :ref:`numba-source-install-env_vars`. + +Numba allows its behaviour to be changed through the use of environment +variables. Unless otherwise mentioned, those variables have integer values and +default to zero. + +For convenience, Numba also supports the use of a configuration file to persist +configuration settings. Note: To use this feature ``pyyaml`` must be installed. 
+ +The configuration file must be named ``.numba_config.yaml`` and be present in +the directory from which the Python interpreter is invoked. The configuration +file, if present, is read for configuration settings before the environment +variables are searched. This means that the environment variable settings will +override the settings obtained from a configuration file (the configuration file +is for setting permanent preferences whereas the environment variables are for +ephemeral preferences). + +The format of the configuration file is a dictionary in ``YAML`` format that +maps the environment variables below (without the ``NUMBA_`` prefix) to a +desired value. For example, to permanently switch on developer mode +(``NUMBA_DEVELOPER_MODE`` environment variable) and control flow graph printing +(``NUMBA_DUMP_CFG`` environment variable), create a configuration file with the +contents:: + + developer_mode: 1 + dump_cfg: 1 + +This can be especially useful in the case of wanting to use a set color scheme +based on terminal background color. For example, if the terminal background +color is black, the ``dark_bg`` color scheme would be well suited and can be set +for permanent use by adding:: + + color_scheme: dark_bg + +Jit flags +--------- + +These variables globally override flags to the :func:`~numba.jit` decorator. + +.. envvar:: NUMBA_BOUNDSCHECK + + If set to 0 or 1, globally disable or enable bounds checking, respectively. + The default if the variable is not set or set to an empty string is to use + the ``boundscheck`` flag passed to the :func:`~numba.jit` decorator for a + given function. See the documentation of :ref:`@jit + ` for more information. + + Note, due to limitations in numba, the bounds checking currently produces + exception messages that do not match those from NumPy. If you set + ``NUMBA_FULL_TRACEBACKS=1``, the full exception message with the axis, + index, and shape information will be printed to the terminal. 
+ +Debugging +--------- + +These variables influence what is printed out during compilation of +:term:`JIT functions `. + +.. envvar:: NUMBA_DEVELOPER_MODE + + If set to non-zero, developer mode produces full tracebacks and disables + help instructions. Default is zero. + +.. envvar:: NUMBA_FULL_TRACEBACKS + + If set to non-zero, enable full tracebacks when an exception occurs. + Defaults to the value set by `NUMBA_DEVELOPER_MODE`. + +.. envvar:: NUMBA_SHOW_HELP + + If set to non-zero, show resources for getting help. Default is zero. + +.. envvar:: NUMBA_CAPTURED_ERRORS + + Alters the way in which Numba captures and handles exceptions that do not + inherit from ``numba.core.errors.NumbaError`` during compilation (e.g. + standard Python exceptions). This does not impact runtime exception + handling. Valid values are: + + - ``"old_style"`` (default): this is the exception handling behaviour that + is present in Numba versions <= 0.54.x. Numba will capture and wrap all + errors occurring in compilation and depending on the compilation phase they + will likely materialize as part of the message in a ``TypingError`` or a + ``LoweringError``. + - ``"new_style"`` this will treat any exception that does not inherit from + ``numba.core.errors.NumbaError`` **and** is raised during compilation as a + "hard error", i.e. the exception will propagate and compilation will halt. + The purpose of this new style is to differentiate between intentionally + raised exceptions and those which occur due to mistakes. For example, if + an ``AttributeError`` occurs in the typing of an ``@overload`` function, + under this new behaviour it is assumed that this is a mistake in the + implementation and compilation will halt due to this exception. This + behaviour will eventually become the default. + +.. envvar:: NUMBA_DISABLE_ERROR_MESSAGE_HIGHLIGHTING + + If set to non-zero error message highlighting is disabled. This is useful + for running the test suite on CI systems. + +..
envvar:: NUMBA_COLOR_SCHEME + + Alters the color scheme used in error reporting (requires the ``colorama`` + package to be installed to work). Valid values are: + + - ``no_color`` No color added, just bold font weighting. + - ``dark_bg`` Suitable for terminals with a dark background. + - ``light_bg`` Suitable for terminals with a light background. + - ``blue_bg`` Suitable for terminals with a blue background. + - ``jupyter_nb`` Suitable for use in Jupyter Notebooks. + + *Default value:* ``no_color``. The type of the value is ``string``. + +.. envvar:: NUMBA_HIGHLIGHT_DUMPS + + If set to non-zero and ``pygments`` is installed, syntax highlighting is + applied to Numba IR, LLVM IR and assembly dumps. Default is zero. + +.. envvar:: NUMBA_DISABLE_PERFORMANCE_WARNINGS + + If set to non-zero the issuing of performance warnings is disabled. Default + is zero. + +.. envvar:: NUMBA_DEBUG + + If set to non-zero, print out all possible debugging information during + function compilation. Finer-grained control can be obtained using other + variables below. + +.. envvar:: NUMBA_DEBUG_FRONTEND + + If set to non-zero, print out debugging information during operation + of the compiler frontend, up to and including generation of the Numba + Intermediate Representation. + +.. envvar:: NUMBA_DEBUGINFO + + If set to non-zero, enable debug for the full application by setting + the default value of the ``debug`` option in ``jit``. Beware that + enabling debug info significantly increases the memory consumption + for each compiled function. + Default value equals to the value of `NUMBA_ENABLE_PROFILING`. + +.. envvar:: NUMBA_EXTEND_VARIABLE_LIFETIMES + + If set to non-zero, extend the lifetime of variables to the end of the block + in which their lifetime ends. This is particularly useful in conjunction + with :envvar:`NUMBA_DEBUGINFO` as it helps with introspection of values. + Default is zero. + +.. 
envvar:: NUMBA_GDB_BINARY + + Set the ``gdb`` binary for use in Numba's ``gdb`` support. This takes one of + two forms: 1) a path and full name of the binary to explicitly express + which binary to use 2) just the name of the binary and the current path will + be searched using the standard path resolution rules. For example: + ``/path/from/root/to/binary/name_of_gdb_binary`` or + ``custom_gdb_binary_name``. This is to permit the use of a ``gdb`` from a + non-default location with a non-default name. The default value is ``gdb``. + +.. envvar:: NUMBA_DEBUG_TYPEINFER + + If set to non-zero, print out debugging information about type inference. + +.. envvar:: NUMBA_ENABLE_PROFILING + + Enables JIT events of LLVM in order to support profiling of jitted functions. + This option is automatically enabled under certain profilers. + +.. envvar:: NUMBA_TRACE + + If set to non-zero, trace certain function calls (function entry and exit + events, including arguments and return values). + +.. envvar:: NUMBA_CHROME_TRACE + + If defined, chrome tracing is enabled and this variable specifies the filepath + of the chrome tracing json file output. The emitted file can be opened by + a Chromium-based browser using the profile viewer at `chrome://tracing/`. + + .. warning:: This feature is not supported in multi-process applications. + +.. envvar:: NUMBA_DUMP_BYTECODE + + If set to non-zero, print out the Python :py:term:`bytecode` of + compiled functions. + +.. envvar:: NUMBA_DUMP_CFG + + If set to non-zero, print out information about the Control Flow Graph + of compiled functions. + +.. envvar:: NUMBA_DUMP_IR + + If set to non-zero, print out the Numba Intermediate Representation + of compiled functions. + + +.. envvar:: NUMBA_DUMP_SSA + + If set to non-zero, print out the Numba Intermediate Representation of + compiled functions after conversion to Static Single Assignment (SSA) form. + +.. envvar:: NUMBA_DEBUG_PRINT_AFTER + + Dump the Numba IR after declared pass(es). 
This is useful for debugging IR + changes made by given passes. Accepted values are: + + * Any pass name (as given by the ``.name()`` method on the class) + * Multiple pass names as a comma separated list, i.e. ``"foo_pass,bar_pass"`` + * The token ``"all"``, which will print after all passes. + + The default value is ``"none"`` so as to prevent output. + +.. envvar:: NUMBA_DUMP_ANNOTATION + + If set to non-zero, print out types annotations for compiled functions. + +.. envvar:: NUMBA_DUMP_LLVM + + Dump the unoptimized LLVM assembly source of compiled functions. + Unoptimized code is usually very verbose; therefore, + :envvar:`NUMBA_DUMP_OPTIMIZED` is recommended instead. + +.. envvar:: NUMBA_DUMP_FUNC_OPT + + Dump the LLVM assembly source after the LLVM "function optimization" + pass, but before the "module optimization" pass. This is useful mostly + when developing Numba itself, otherwise use :envvar:`NUMBA_DUMP_OPTIMIZED`. + +.. envvar:: NUMBA_DUMP_OPTIMIZED + + Dump the LLVM assembly source of compiled functions after all + optimization passes. The output includes the raw function as well as + its CPython-compatible wrapper (whose name begins with ``wrapper.``). + Note that the function is often inlined inside the wrapper, as well. + +.. envvar:: NUMBA_DEBUG_ARRAY_OPT + + Dump debugging information related to the processing associated with + the ``parallel=True`` jit decorator option. + +.. envvar:: NUMBA_DEBUG_ARRAY_OPT_RUNTIME + + Dump debugging information related to the runtime scheduler associated + with the ``parallel=True`` jit decorator option. + +.. envvar:: NUMBA_DEBUG_ARRAY_OPT_STATS + + Dump statistics about how many operators/calls are converted to + parallel for-loops and how many are fused together, which are associated + with the ``parallel=True`` jit decorator option. + +.. 
envvar:: NUMBA_PARALLEL_DIAGNOSTICS + + If set to an integer value between 1 and 4 (inclusive) diagnostic information + about parallel transforms undertaken by Numba will be written to STDOUT. The + higher the value set the more detailed the information produced. + +.. envvar:: NUMBA_DUMP_ASSEMBLY + + Dump the native assembly code of compiled functions. + +.. envvar:: NUMBA_LLVM_PASS_TIMINGS + + Set to ``1`` to enable recording of pass timings in LLVM; + e.g. ``NUMBA_LLVM_PASS_TIMINGS=1``. + See :ref:`developer-llvm-timings`. + + *Default value*: ``0`` (Off) + +.. seealso:: + :ref:`numba-troubleshooting` and :ref:`architecture`. + + +Compilation options +------------------- + +.. envvar:: NUMBA_OPT + + The optimization level; this option is passed straight to LLVM. + + *Default value:* 3 + +.. envvar:: NUMBA_LOOP_VECTORIZE + + If set to non-zero, enable LLVM loop vectorization. + + *Default value:* 1 (except on 32-bit Windows) + +.. envvar:: NUMBA_SLP_VECTORIZE + + If set to non-zero, enable LLVM superword-level parallelism vectorization. + + *Default value:* 1 + +.. envvar:: NUMBA_ENABLE_AVX + + If set to non-zero, enable AVX optimizations in LLVM. This is disabled + by default on Sandy Bridge and Ivy Bridge architectures as it can sometimes + result in slower code on those platforms. + +.. envvar:: NUMBA_DISABLE_INTEL_SVML + + If set to non-zero and Intel SVML is available, the use of SVML will be + disabled. + +.. envvar:: NUMBA_DISABLE_JIT + + Disable JIT compilation entirely. The :func:`~numba.jit` decorator acts + as if it performs no operation, and the invocation of decorated functions + calls the original Python function instead of a compiled version. This + can be useful if you want to run the Python debugger over your code. + +.. envvar:: NUMBA_CPU_NAME +.. envvar:: NUMBA_CPU_FEATURES + + Override CPU and CPU features detection. 
+ By setting ``NUMBA_CPU_NAME=generic``, a generic CPU model is picked + for the CPU architecture and the feature list (``NUMBA_CPU_FEATURES``) + defaults to empty. CPU features must be listed with the format + ``+feature1,-feature2`` where ``+`` indicates enable and ``-`` indicates + disable. For example, ``+sse,+sse2,-avx,-avx2`` enables SSE and SSE2, and + disables AVX and AVX2. + + These settings are passed to LLVM for configuring the compilation target. + To get a list of available options, use the ``llc`` commandline tool + from LLVM, for example:: + + llc -march=x86 -mattr=help + + + .. tip:: To force all caching functions (``@jit(cache=True)``) to emit + portable code (portable within the same architecture and OS), + simply set ``NUMBA_CPU_NAME=generic``. + +.. envvar:: NUMBA_FUNCTION_CACHE_SIZE + + Override the size of the function cache for retaining recently + deserialized functions in memory. In systems like + `Dask `_, it is common for functions to be deserialized + multiple times. Numba will cache functions as long as there is a + reference somewhere in the interpreter. This cache size variable controls + how many functions that are no longer referenced will also be retained, + just in case they show up in the future. The implementation of this is + not a true LRU, but the large size of the cache should be sufficient for + most situations. + + Note: this is unrelated to the compilation cache. + + *Default value:* 128 + +.. envvar:: NUMBA_LLVM_REFPRUNE_PASS + + Turns on the LLVM pass level reference-count pruning pass and disables the + regex based implementation in Numba. + + *Default value:* 1 (On) + +.. envvar:: NUMBA_LLVM_REFPRUNE_FLAGS + + When ``NUMBA_LLVM_REFPRUNE_PASS`` is on, this allows configuration + of subpasses in the reference-count pruning LLVM pass. + + Valid values are any combinations of the below separated by `,` + (case-insensitive): + + - ``all``: enable all subpasses. 
+ - ``per_bb``: enable per-basic-block level pruning, which is the same as the + old regex based implementation. + - ``diamond``: enable inter-basic-block pruning that is a diamond shape + pattern, i.e. a single-entry single-exit CFG subgraph that has an incref + in the entry and a corresponding decref in the exit. + - ``fanout``: enable inter-basic-block pruning that has a fanout pattern, + i.e. a single-entry multiple-exit CFG subgraph where the entry has an + incref and every exit has a corresponding decref. + - ``fanout_raise``: same as ``fanout`` but allow subgraph exit nodes to be + raising an exception and not have a corresponding decref. + + For example, ``all`` is the same as + ``per_bb, diamond, fanout, fanout_raise`` + + *Default value:* "all" + + +.. _numba-envvars-caching: + +Caching options +--------------- + +Options for the compilation cache. + +.. envvar:: NUMBA_DEBUG_CACHE + + If set to non-zero, print out information about operation of the + :ref:`JIT compilation cache `. + +.. envvar:: NUMBA_CACHE_DIR + + Override the location of the cache directory. If defined, this should be + a valid directory path. + + If not defined, Numba picks the cache directory in the following order: + + 1. In-tree cache. Put the cache next to the corresponding source file under + a ``__pycache__`` directory following how ``.pyc`` files are stored. + 2. User-wide cache. Put the cache in the user's application directory using + ``appdirs.user_cache_dir`` from the + `Appdirs package `_. + 3. IPython cache. Put the cache in an IPython specific application + directory. + Stores are made under the ``numba_cache`` in the directory returned by + ``IPython.paths.get_ipython_cache_dir()``. + + Also see :ref:`docs on cache sharing ` and + :ref:`docs on cache clearing ` + + +.. _numba-envvars-gpu-support: + +GPU support +----------- + +.. envvar:: NUMBA_DISABLE_CUDA + + If set to non-zero, disable CUDA support. + +..
envvar:: NUMBA_FORCE_CUDA_CC + + If set, force the CUDA compute capability to the given version (a + string of the type ``major.minor``), regardless of attached devices. + +.. envvar:: NUMBA_CUDA_DEFAULT_PTX_CC + + The default compute capability (a string of the type ``major.minor``) to + target when compiling to PTX using ``cuda.compile_ptx``. The default is + 5.2, which is the lowest non-deprecated compute capability in the most + recent version of the CUDA toolkit supported (10.2 at present). + +.. envvar:: NUMBA_ENABLE_CUDASIM + + If set, don't compile and execute code for the GPU, but use the CUDA + Simulator instead. For debugging purposes. + + +.. envvar:: NUMBA_CUDA_ARRAY_INTERFACE_SYNC + + Whether to synchronize on streams provided by objects imported using the CUDA + Array Interface. This defaults to 1. If set to 0, then no synchronization + takes place, and the user of Numba (and other CUDA libraries) is responsible + for ensuring correctness with respect to synchronization on streams. + +.. envvar:: NUMBA_CUDA_LOG_LEVEL + + For debugging purposes. If no other logging is configured, the value of this + variable is the logging level for CUDA API calls. The default value is + ``CRITICAL`` - to trace all API calls on standard error, set this to + ``DEBUG``. + +.. envvar:: NUMBA_CUDA_LOG_API_ARGS + + By default the CUDA API call logs only give the names of functions called. + Setting this variable to 1 also includes the values of arguments to Driver + API calls in the logs. + +.. envvar:: NUMBA_CUDA_DRIVER + + Path of the directory in which the CUDA driver libraries are to be found. + Normally this should not need to be set as Numba can locate the driver in + standard locations. However, this variable can be used if the driver is in a + non-standard location. + +.. envvar:: NUMBA_CUDA_LOG_SIZE + + Buffer size for logs produced by CUDA driver API operations. 
This defaults + to 1024 and should not normally need to be modified - however, if an error + in an API call produces a large amount of output that appears to be + truncated (perhaps due to multiple long function names, for example) then + this variable can be used to increase the buffer size and view the full + error message. + +.. envvar:: NUMBA_CUDA_VERBOSE_JIT_LOG + + Whether the CUDA driver should produce verbose log messages. Defaults to 1, + indicating that verbose messaging is enabled. This should not need to be + modified under normal circumstances. + +.. envvar:: NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM + + When set to 1, the default stream is the per-thread default stream. When set + to 0, the default stream is the legacy default stream. This defaults to 0, + for the legacy default stream. See `Stream Synchronization Behavior + `_ + for an explanation of the legacy and per-thread default streams. + + This variable only takes effect when using Numba's internal CUDA bindings; + when using the NVIDIA bindings, use the environment variable + ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` instead. + + .. seealso:: + + The `Default Stream section + `_ + in the NVIDIA Bindings documentation. + +.. envvar:: NUMBA_CUDA_LOW_OCCUPANCY_WARNINGS + + Enable warnings if the grid size is too small relative to the number of + streaming multiprocessors (SM). This option is on by default (default value is 1). + + The heuristic checked is whether ``gridsize < 2 * (number of SMs)``. NOTE: The absence of + a warning does not imply a good gridsize relative to the number of SMs. Disabling + this warning will reduce the number of CUDA API calls (during JIT compilation), as the + heuristic needs to check the number of SMs available on the device in the + current context. + +.. envvar:: NUMBA_CUDA_WARN_ON_IMPLICIT_COPY + + Enable warnings if a kernel is launched with host memory which forces a copy to and + from the device. This option is on by default (default value is 1). + +.. 
envvar:: NUMBA_CUDA_USE_NVIDIA_BINDING + + When set to 1, Numba will attempt to use the `NVIDIA CUDA Python binding + `_ to make calls to the driver API + instead of using its own ctypes binding. This defaults to 0 (off), as the + NVIDIA binding is currently missing support for Per-Thread Default + Streams and the profiler APIs. + +.. envvar:: NUMBA_CUDA_INCLUDE_PATH + + The location of the CUDA include files. This is used when linking CUDA C/C++ + sources to Python kernels, and needs to be correctly set for CUDA includes to + be available to linked C/C++ sources. On Linux, it defaults to + ``/usr/local/cuda/include``. On Windows, the default is + ``$env:CUDA_PATH\include``. + + +Threading Control +----------------- + +.. envvar:: NUMBA_NUM_THREADS + + If set, the number of threads in the thread pool for the parallel CPU target + will take this value. Must be greater than zero. This value is independent + of ``OMP_NUM_THREADS`` and ``MKL_NUM_THREADS``. + + *Default value:* The number of CPU cores on the system as determined at run + time. This can be accessed via :obj:`numba.config.NUMBA_DEFAULT_NUM_THREADS`. + + See also the section on :ref:`setting_the_number_of_threads` for + information on how to set the number of threads at runtime. + +.. envvar:: NUMBA_THREADING_LAYER + + This environment variable controls the library used for concurrent execution + for the CPU parallel targets (``@vectorize(target='parallel')``, + ``@guvectorize(target='parallel')`` and ``@njit(parallel=True)``). The + variable type is string and by default is ``default`` which will select a + threading layer based on what is available in the runtime. The valid values + are (for more information about these see + :ref:`the threading layer documentation `): + + * ``default`` - select a threading layer based on what is available in the + current runtime. + * ``safe`` - select a threading layer that is both fork and thread safe + (requires the TBB package). 
+ * ``forksafe`` - select a threading layer that is fork safe. + * ``threadsafe`` - select a threading layer that is thread safe. + * ``tbb`` - A threading layer backed by Intel TBB. + * ``omp`` - A threading layer backed by OpenMP. + * ``workqueue`` - A simple built-in work-sharing task scheduler. + +.. envvar:: NUMBA_THREADING_LAYER_PRIORITY + + This environment variable controls the order in which the libraries used for + concurrent execution, for the CPU parallel targets + (``@vectorize(target='parallel')``, ``@guvectorize(target='parallel')`` + and ``@njit(parallel=True)``), are prioritized for use. The variable type is + string and by default is ``tbb omp workqueue``, with the priority taken based + on position from the left of the string, left most being the highest. Valid + values are any permutation of the three choices (for more information about + these see :ref:`the threading layer documentation `.) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/fpsemantics.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/fpsemantics.rst new file mode 100644 index 0000000000000000000000000000000000000000..7973d4e500b7a3ae9c8282d00ce8834d97a76b58 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/fpsemantics.rst @@ -0,0 +1,81 @@ + +Floating-point pitfalls +======================= + +Precision and accuracy +---------------------- + +For some operations, Numba may use a different algorithm than Python or +Numpy. The results may not be bit-by-bit compatible. The difference +should generally be small and within reasonable expectations. However, +small accumulated differences might produce large differences at the end, +especially if a divergent function is involved. + +Math library implementations +'''''''''''''''''''''''''''' + +Numba supports a variety of platforms and operating systems, each of which +has its own math library implementation (referred to as ``libm`` from here +in). 
The majority of math functions included in ``libm`` have specific +requirements as set out by the IEEE 754 standard (like ``sin()``, ``exp()`` +etc.), but each implementation may have bugs. Thus, on some platforms +Numba has to exercise special care in order to workaround known ``libm`` +issues. + +Another typical problem is when an operating system's ``libm`` function +set is incomplete and needs to be supplemented by additional functions. +These are provided with reference to the IEEE 754 and C99 standards +and are often implemented in Numba in a manner similar to equivalent +CPython functions. + +Linear algebra +'''''''''''''' + +Numpy forces some linear algebra operations to run in double-precision mode +even when a ``float32`` input is given. Numba will always observe +the input's precision, and invoke single-precision linear algebra routines +when all inputs are ``float32`` or ``complex64``. + +The implementations of the ``numpy.linalg`` routines in Numba only support the +floating point types that are used in the LAPACK functions that provide +the underlying core functionality. As a result only ``float32``, ``float64``, +``complex64`` and ``complex128`` types are supported. If a user has e.g. an +``int32`` type, an appropriate type conversion must be performed to a +floating point type prior to its use in these routines. The reason for this +decision is to essentially avoid having to replicate type conversion choices +made in Numpy and to also encourage the user to choose the optimal floating +point type for the operation they are undertaking. + + +Mixed-types operations +'''''''''''''''''''''' + +Numpy will most often return a ``float64`` as a result of a computation +with mixed integer and floating-point operands (a typical example is the +power operator ``**``). Numba by contrast will select the highest precision +amongst the floating-point operands, so for example ``float32 ** int32`` +will return a ``float32``, regardless of the input values. 
This makes +performance characteristics easier to predict, but you should explicitly +cast the input to ``float64`` if you need the extra precision. + + +.. _ufunc-fpu-errors: + +Warnings and errors +------------------- + +When calling a :term:`ufunc` created with :func:`~numba.vectorize`, +Numpy will determine whether an error occurred by examining the FPU +error word. It may then print out a warning or raise an exception +(such as ``RuntimeWarning: divide by zero encountered``), +depending on the current error handling settings. + +Depending on how LLVM optimized the ufunc's code, however, some spurious +warnings or errors may appear. If you get caught by this issue, we +recommend you call :func:`numpy.seterr` to change Numpy's error handling +settings, or the :class:`numpy.errstate` context manager to switch them +temporarily:: + + with np.errstate(all='ignore'): + x = my_ufunc(y) + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/index.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..e099d2a312341c3d5a16ead11a34e52ef3f5f510 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/index.rst @@ -0,0 +1,16 @@ + +Reference Manual +================ + +.. 
toctree:: + + types.rst + jit-compilation.rst + aot-compilation.rst + utils.rst + envvars.rst + pysupported.rst + numpysupported.rst + pysemantics.rst + fpsemantics.rst + deprecation.rst diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/jit-compilation.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/jit-compilation.rst new file mode 100644 index 0000000000000000000000000000000000000000..ac67a593b145d9753182d2f1934a851166e36503 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/jit-compilation.rst @@ -0,0 +1,572 @@ +Just-in-Time compilation +======================== + + +JIT functions +------------- + +.. _jit-decorator: + +.. decorator:: numba.jit(signature=None, nopython=False, nogil=False, cache=False, forceobj=False, parallel=False, error_model='python', fastmath=False, locals={}, boundscheck=False) + + Compile the decorated function on-the-fly to produce efficient machine + code. All parameters are optional. + + If present, the *signature* is either a single signature or a list of + signatures representing the expected :ref:`numba-types` of function + arguments and return values. Each signature can be given in several + forms: + + * A tuple of :ref:`numba-types` arguments (for example + ``(numba.int32, numba.double)``) representing the types of the + function's arguments; Numba will then infer an appropriate return + type from the arguments. + * A call signature using :ref:`numba-types`, specifying both return + type and argument types. This can be given in intuitive form + (for example ``numba.void(numba.int32, numba.double)``). + * A string representation of one of the above, for example + ``"void(int32, double)"``. All type names used in the string are assumed + to be defined in the ``numba.types`` module. + + *nopython* and *nogil* are boolean flags. *locals* is a mapping of + local variable names to :ref:`numba-types`. 
+ + This decorator has several modes of operation: + + * If one or more signatures are given in *signature*, a specialization is + compiled for each of them. Calling the decorated function will then try + to choose the best matching signature, and raise a :class:`TypeError` if + no appropriate conversion is available for the function arguments. If + converting succeeds, the compiled machine code is executed with the + converted arguments and the return value is converted back according to + the signature. + + * If no *signature* is given, the decorated function implements + lazy compilation. Each call to the decorated function will try to + re-use an existing specialization if it exists (for example, a call + with two integer arguments may re-use a specialization for argument + types ``(numba.int64, numba.int64)``). If no suitable specialization + exists, a new specialization is compiled on-the-fly, stored for later + use, and executed with the converted arguments. + + If true, *nopython* forces the function to be compiled in :term:`nopython + mode`. If not possible, compilation will raise an error. + + If true, *forceobj* forces the function to be compiled in :term:`object + mode`. Since object mode is slower than nopython mode, this is mostly + useful for testing purposes. + + If true, *nogil* tries to release the :py:term:`global interpreter lock` + inside the compiled function. The GIL will only be released if Numba can + compile the function in :term:`nopython mode`, otherwise a compilation + warning will be printed. + + .. _jit-decorator-cache: + + If true, *cache* enables a file-based cache to shorten compilation times + when the function was already compiled in a previous invocation. 
+ The cache is maintained in the ``__pycache__`` subdirectory of + the directory containing the source file; if the current user is not + allowed to write to it, though, it falls back to a platform-specific + user-wide cache directory (such as ``$HOME/.cache/numba`` on Unix + platforms). + + .. _jit-decorator-parallel: + + If true, *parallel* enables the automatic parallelization of a number of + common NumPy constructs as well as the fusion of adjacent parallel + operations to maximize cache locality. + + The *error_model* option controls the divide-by-zero behavior. + Setting it to 'python' causes divide-by-zero to raise exception like CPython. + Setting it to 'numpy' causes divide-by-zero to set the result to *+/-inf* or + *nan*. + + Not all functions can be cached, since some functionality cannot be + always persisted to disk. When a function cannot be cached, a + warning is emitted. + + .. _jit-decorator-fastmath: + + If true, *fastmath* enables the use of otherwise unsafe floating point + transforms as described in the + `LLVM documentation `_. + Further, if :ref:`Intel SVML ` is installed faster but less + accurate versions of some math intrinsics are used (answers to within + ``4 ULP``). + + .. _jit-decorator-boundscheck: + + If true, *boundscheck* enables bounds checking for array indices. Out of + bounds accesses will raise IndexError. The default is to not do bounds + checking. If bounds checking is disabled, out of bounds accesses can + produce garbage results or segfaults. However, enabling bounds checking + will slow down typical functions, so it is recommended to only use this + flag for debugging. You can also set the `NUMBA_BOUNDSCHECK` environment + variable to 0 or 1 to globally override this flag. + + The *locals* dictionary may be used to force the :ref:`numba-types` + of particular local variables, for example if you want to force the + use of single precision floats at some point. 
In general, we recommend + you let Numba's compiler infer the types of local variables by itself. + + Here is an example with two signatures:: + + @jit(["int32(int32)", "float32(float32)"], nopython=True) + def f(x): ... + + Not putting any parentheses after the decorator is equivalent to calling + the decorator without any arguments, i.e.:: + + @jit + def f(x): ... + + is equivalent to:: + + @jit() + def f(x): ... + + The decorator returns a :class:`Dispatcher` object. + + .. note:: + If no *signature* is given, compilation errors will be raised when + the actual compilation occurs, i.e. when the function is first called + with some given argument types. + + .. note:: + Compilation can be influenced by some dedicated :ref:`numba-envvars`. + + +Generated JIT functions +----------------------- + +.. decorator:: numba.generated_jit(nopython=False, nogil=False, cache=False, forceobj=False, locals={}) + + Like the :func:`~numba.jit` decorator, but calls the decorated function at + compile-time, passing the *types* of the function's arguments. + The decorated function must return a callable which will be compiled as + the function's implementation for those types, allowing flexible kinds of + specialization. + + The :func:`~numba.generated_jit` decorator returns a :class:`Dispatcher` object. + + +Dispatcher objects +------------------ + +.. class:: Dispatcher + + The class of objects created by calling :func:`~numba.jit` or + :func:`~numba.generated_jit`. You shouldn't try to create such an object + in any other way. Calling a Dispatcher object calls the compiled + specialization for the arguments with which it is called, letting it + act as an accelerated replacement for the Python function which was compiled. + + In addition, Dispatcher objects have the following methods and attributes: + + .. attribute:: py_func + + The pure Python function which was compiled. + + .. 
method:: inspect_types(file=None, pretty=False) + + Print out a listing of the function source code annotated line-by-line + with the corresponding Numba IR, and the inferred types of the various + variables. If *file* is specified, printing is done to that file + object, otherwise to sys.stdout. If *pretty* is set to True then colored + ANSI will be produced in a terminal and HTML in a notebook. + + .. seealso:: :ref:`architecture` + + .. method:: inspect_llvm(signature=None) + + Return a dictionary keying compiled function signatures to the human + readable LLVM IR generated for the function. If the signature + keyword is specified a string corresponding to that individual + signature is returned. + + .. method:: inspect_asm(signature=None) + + Return a dictionary keying compiled function signatures to the + human-readable native assembly code for the function. If the + signature keyword is specified a string corresponding to that + individual signature is returned. + + .. method:: inspect_cfg(signature=None, show_wrapped) + + Return a dictionary keying compiled function signatures to the + control-flow graph objects for the function. If the signature keyword is + specified a string corresponding to that individual signature is returned. + + The control-flow graph objects can be stringified (``str`` or ``repr``) + to get the textual representation of the graph in DOT format. Or, use + its ``.display(filename=None, view=False)`` method to plot the graph. + The *filename* option can be set to a specific path for the rendered + output to write to. If *view* option is True, the plot is opened by + the system default application for the image format (PDF). In IPython + notebook, the returned object can be plot inlined. + + Usage:: + + @jit + def foo(): + ... + + # opens the CFG in system default application + foo.inspect_cfg(foo.signatures[0]).display(view=True) + + + .. 
method:: inspect_disasm_cfg(signature=None) + + Return a dictionary keying compiled function signatures to the + control-flow graph of the disassembly of the underlying compiled ``ELF`` + object. If the signature keyword is specified a control-flow graph + corresponding to that individual signature is returned. This function is + execution environment aware and will produce SVG output in Jupyter + notebooks and ASCII in terminals. + + Example:: + + @njit + def foo(x): + if x < 3: + return x + 1 + return x + 2 + + foo(10) + + print(foo.inspect_disasm_cfg(signature=foo.signatures[0])) + + Gives:: + + [0x08000040]> # method.__main__.foo_241_long_long (int64_t arg1, int64_t arg3); + ─────────────────────────────────────────────────────────────────────┐ + │ 0x8000040 │ + │ ; arg3 ; [02] -r-x section size 279 named .text │ + │ ;-- section..text: │ + │ ;-- .text: │ + │ ;-- __main__::foo$241(long long): │ + │ ;-- rip: │ + │ 25: method.__main__.foo_241_long_long (int64_t arg1, int64_t arg3); │ + │ ; arg int64_t arg1 @ rdi │ + │ ; arg int64_t arg3 @ rdx │ + │ ; 2 │ + │ cmp rdx, 2 │ + │ jg 0x800004f │ + └─────────────────────────────────────────────────────────────────────┘ + f t + │ │ + │ └──────────────────────────────┐ + └──┐ │ + │ │ + ┌─────────────────────────┐ ┌─────────────────────────┐ + │ 0x8000046 │ │ 0x800004f │ + │ ; arg3 │ │ ; arg3 │ + │ inc rdx │ │ add rdx, 2 │ + │ ; arg3 │ │ ; arg3 │ + │ mov qword [rdi], rdx │ │ mov qword [rdi], rdx │ + │ xor eax, eax │ │ xor eax, eax │ + │ ret │ │ ret │ + └─────────────────────────┘ └─────────────────────────┘ + + .. method:: recompile() + + Recompile all existing signatures. This can be useful for example if + a global or closure variable was frozen by your function and its value + in Python has changed. Since compiling isn't cheap, this is mainly + for testing and interactive use. + + .. method:: parallel_diagnostics(signature=None, level=1) + + Print parallel diagnostic information for the given signature. 
If no + signature is present it is printed for all known signatures. ``level`` is + used to adjust the verbosity, ``level=1`` (default) is minimum verbosity, + levels 2, 3, and 4 provide increasing levels of verbosity. + + .. method:: get_metadata(signature=None) + + Obtain the compilation metadata for a given signature. This is useful for + developers of Numba and Numba extensions. + + +Vectorized functions (ufuncs and DUFuncs) +----------------------------------------- + +.. decorator:: numba.vectorize(*, signatures=[], identity=None, nopython=True, target='cpu', forceobj=False, cache=False, locals={}) + + Compile the decorated function and wrap it either as a `NumPy + ufunc`_ or a Numba :class:`~numba.DUFunc`. The optional + *nopython*, *forceobj* and *locals* arguments have the same meaning + as in :func:`numba.jit`. + + *signatures* is an optional list of signatures expressed in the + same form as in the :func:`numba.jit` *signature* argument. If + *signatures* is non-empty, then the decorator will compile the user + Python function into a NumPy ufunc. If no *signatures* are given, + then the decorator will wrap the user Python function in a + :class:`~numba.DUFunc` instance, which will compile the user + function at call time whenever NumPy can not find a matching loop + for the input arguments. *signatures* is required if *target* is + ``"parallel"``. + + *identity* is the identity (or unit) value of the function being + implemented. Possible values are 0, 1, None, and the string + ``"reorderable"``. The default is None. Both None and + ``"reorderable"`` mean the function has no identity value; + ``"reorderable"`` additionally specifies that reductions along multiple + axes can be reordered. + + If there are several *signatures*, they must be ordered from the more + specific to the least specific. Otherwise, NumPy's type-based + dispatching may not work as expected. 
For example, the following is + wrong:: + + @vectorize(["float64(float64)", "float32(float32)"]) + def f(x): ... + + as running it over a single-precision array will choose the ``float64`` + version of the compiled function, leading to much less efficient + execution. The correct invocation is:: + + @vectorize(["float32(float32)", "float64(float64)"]) + def f(x): ... + + *target* is a string for backend target; Available values are "cpu", + "parallel", and "cuda". To use a multithreaded version, change the + target to "parallel" (which requires signatures to be specified):: + + @vectorize(["float64(float64)", "float32(float32)"], target='parallel') + def f(x): ... + + For the CUDA target, use "cuda":: + + @vectorize(["float64(float64)", "float32(float32)"], target='cuda') + def f(x): ... + + The compiled function can be cached to reduce future compilation time. + It is enabled by setting *cache* to True. Only the "cpu" and "parallel" + targets support caching. + + +.. decorator:: numba.guvectorize(signatures, layout, *, identity=None, nopython=True, target='cpu', forceobj=False, cache=False, locals={}) + + Generalized version of :func:`numba.vectorize`. While + :func:`numba.vectorize` will produce a simple ufunc whose core + functionality (the function you are decorating) operates on scalar + operands and returns a scalar value, :func:`numba.guvectorize` + allows you to create a `NumPy ufunc`_ whose core function takes array + arguments of various dimensions. + + The additional argument *layout* is a string specifying, in symbolic + form, the dimensionality and size relationship of the argument types + and return types. For example, a matrix multiplication will have + a layout string of ``"(m,n),(n,p)->(m,p)"``. Its definition might + be (function body omitted):: + + @guvectorize(["void(float64[:,:], float64[:,:], float64[:,:])"], + "(m,n),(n,p)->(m,p)") + def f(a, b, result): + """Fill-in *result* matrix such as result := a * b""" + ... 
+ + If one of the arguments should be a scalar, the corresponding layout + specification is ``()`` and the argument will really be given to + you as a zero-dimension array (you have to dereference it to get the + scalar value). For example, a :ref:`one-dimension moving average ` + with a parameterable window width may have a layout string of ``"(n),()->(n)"``. + + Note that any output will be given to you preallocated as an additional + function argument: your code has to fill it with the appropriate values + for the function you are implementing. + + If your function doesn't take an output array, you should omit the "arrow" + in the layout string (e.g. ``"(n),(n)"``). When doing this, it is important + to be aware that changes to the input arrays cannot always be relied on to be + visible outside the execution of the ufunc, as NumPy may pass in temporary + arrays as inputs (for example, if a cast is required). + + .. seealso:: + Specification of the `layout string `_ + as supported by NumPy. Note that NumPy uses the term "signature", + which we unfortunately use for something else. + + The compiled function can be cached to reduce future compilation time. + It is enabled by setting *cache* to True. Only the "cpu" and "parallel" + targets support caching. + +.. _NumPy ufunc: http://docs.scipy.org/doc/numpy/reference/ufuncs.html + +.. class:: numba.DUFunc + + The class of objects created by calling :func:`numba.vectorize` + with no signatures. + + DUFunc instances should behave similarly to NumPy + :class:`~numpy.ufunc` objects with one important difference: + call-time loop generation. When calling a ufunc, NumPy looks at + the existing loops registered for that ufunc, and will raise a + :class:`~python.TypeError` if it cannot find a loop that it cannot + safely cast the inputs to suit. When calling a DUFunc, Numba + delegates the call to NumPy. 
If the NumPy ufunc call fails, then + Numba attempts to build a new loop for the given input types, and + calls the ufunc again. If this second call attempt fails or a + compilation error occurs, then DUFunc passes along the exception to + the caller. + + .. seealso:: + + The ":ref:`dynamic-universal-functions`" section in the user's + guide demonstrates the call-time behavior of + :class:`~numba.DUFunc`, and discusses the impact of call order + on how Numba generates the underlying :class:`~numpy.ufunc`. + + .. attribute:: ufunc + + The actual NumPy :class:`~numpy.ufunc` object being built by the + :class:`~numba.DUFunc` instance. Note that the + :class:`~numba.DUFunc` object maintains several important data + structures required for proper ufunc functionality (specifically + the dynamically compiled loops). Users should not pass the + :class:`~numpy.ufunc` value around without ensuring the + underlying :class:`~numba.DUFunc` will not be garbage collected. + + .. attribute:: nin + + The number of DUFunc (ufunc) inputs. See `ufunc.nin`_. + + .. attribute:: nout + + The number of DUFunc outputs. See `ufunc.nout`_. + + .. attribute:: nargs + + The total number of possible DUFunc arguments (should be + :attr:`~numba.DUFunc.nin` + :attr:`~numba.DUFunc.nout`). + See `ufunc.nargs`_. + + .. attribute:: ntypes + + The number of input types supported by the DUFunc. See + `ufunc.ntypes`_. + + .. attribute:: types + + A list of the supported types given as strings. See + `ufunc.types`_. + + .. attribute:: identity + + The identity value when using the ufunc as a reduction. See + `ufunc.identity`_. + + .. method:: reduce(A, *, axis, dtype, out, keepdims) + + Reduces *A*\'s dimension by one by applying the DUFunc along one + axis. See `ufunc.reduce`_. + + .. method:: accumulate(A, *, axis, dtype, out) + + Accumulate the result of applying the operator to all elements. + See `ufunc.accumulate`_. + + .. 
method:: reduceat(A, indices, *, axis, dtype, out) + + Performs a (local) reduce with specified slices over a single + axis. See `ufunc.reduceat`_. + + .. method:: outer(A, B) + + Apply the ufunc to all pairs (*a*, *b*) with *a* in *A*, and *b* + in *B*. See `ufunc.outer`_. + + .. method:: at(A, indices, *, B) + + Performs unbuffered in place operation on operand *A* for + elements specified by *indices*. If you are using NumPy 1.7 or + earlier, this method will not be present. See `ufunc.at`_. + + +.. note:: + Vectorized functions can, in rare circumstances, show + :ref:`unexpected warnings or errors `. + + +.. _`ufunc.nin`: http://docs.scipy.org/doc/numpy/reference/generated/numpy.ufunc.nin.html#numpy.ufunc.nin + +.. _`ufunc.nout`: http://docs.scipy.org/doc/numpy/reference/generated/numpy.ufunc.nout.html#numpy.ufunc.nout + +.. _`ufunc.nargs`: http://docs.scipy.org/doc/numpy/reference/generated/numpy.ufunc.nargs.html#numpy.ufunc.nargs + +.. _`ufunc.ntypes`: http://docs.scipy.org/doc/numpy/reference/generated/numpy.ufunc.ntypes.html#numpy.ufunc.ntypes + +.. _`ufunc.types`: http://docs.scipy.org/doc/numpy/reference/generated/numpy.ufunc.types.html#numpy.ufunc.types + +.. _`ufunc.identity`: http://docs.scipy.org/doc/numpy/reference/generated/numpy.ufunc.identity.html#numpy.ufunc.identity + +.. _`ufunc.reduce`: http://docs.scipy.org/doc/numpy/reference/generated/numpy.ufunc.reduce.html#numpy.ufunc.reduce + +.. _`ufunc.accumulate`: http://docs.scipy.org/doc/numpy/reference/generated/numpy.ufunc.accumulate.html#numpy.ufunc.accumulate + +.. _`ufunc.reduceat`: http://docs.scipy.org/doc/numpy/reference/generated/numpy.ufunc.reduceat.html#numpy.ufunc.reduceat + +.. _`ufunc.outer`: http://docs.scipy.org/doc/numpy/reference/generated/numpy.ufunc.outer.html#numpy.ufunc.outer + +.. _`ufunc.at`: http://docs.scipy.org/doc/numpy/reference/generated/numpy.ufunc.at.html#numpy.ufunc.at + + +C callbacks +----------- + +.. 
decorator:: numba.cfunc(signature, nopython=False, cache=False, locals={}) + + Compile the decorated function on-the-fly to produce efficient machine + code. The compiled code is wrapped in a thin C callback that makes it + callable using the natural C ABI. + + The *signature* is a single signature representing the signature of the + C callback. It must have the same form as in :func:`~numba.jit`. + The decorator does not check that the types in the signature have + a well-defined representation in C. + + *nopython* and *cache* are boolean flags. *locals* is a mapping of + local variable names to :ref:`numba-types`. They all have the same + meaning as in :func:`~numba.jit`. + + The decorator returns a :class:`CFunc` object. + + .. note:: + C callbacks currently do not support :term:`object mode`. + + +.. class:: CFunc + + The class of objects created by :func:`~numba.cfunc`. :class:`CFunc` + objects expose the following attributes and methods: + + .. attribute:: address + + The address of the compiled C callback, as an integer. + + .. attribute:: cffi + + A `cffi`_ function pointer instance, to be passed as an argument to + `cffi`_-wrapped functions. The pointer's type is ``void *``, so + only minimal type checking will happen when passing it to `cffi`_. + + .. attribute:: ctypes + + A :mod:`ctypes` callback instance, as if it were created using + :func:`ctypes.CFUNCTYPE`. + + .. attribute:: native_name + + The name of the compiled C callback. + + .. method:: inspect_llvm() + + Return the human-readable LLVM IR generated for the C callback. + :attr:`native_name` is the name under which this callback is defined + in the IR. + + +.. 
_cffi: https://cffi.readthedocs.org/ diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/numpysupported.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/numpysupported.rst new file mode 100644 index 0000000000000000000000000000000000000000..54060fd906aeb2da21be91514ab526265b53730b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/numpysupported.rst @@ -0,0 +1,925 @@ + +.. _numpy-support: + +======================== +Supported NumPy features +======================== + +One objective of Numba is having a seamless integration with `NumPy`_. +NumPy arrays provide an efficient storage method for homogeneous sets of +data. NumPy dtypes provide type information useful when compiling, and +the regular, structured storage of potentially large amounts of data +in memory provides an ideal memory layout for code generation. Numba +excels at generating code that executes on top of NumPy arrays. + +NumPy support in Numba comes in many forms: + +* Numba understands calls to NumPy `ufuncs`_ and is able to generate + equivalent native code for many of them. + +* NumPy arrays are directly supported in Numba. Access to NumPy arrays + is very efficient, as indexing is lowered to direct memory accesses + when possible. + +* Numba is able to generate `ufuncs`_ and `gufuncs`_. This means that it + is possible to implement ufuncs and gufuncs within Python, getting + speeds comparable to that of ufuncs/gufuncs implemented in C extension + modules using the NumPy C API. + +.. _NumPy: http://www.numpy.org/ +.. _ufuncs: http://docs.scipy.org/doc/numpy/reference/ufuncs.html +.. _gufuncs: http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html + +The following sections focus on the NumPy features supported in +:term:`nopython mode`, unless otherwise stated. 
+ + +Scalar types +============ + +Numba supports the following NumPy scalar types: + +* **Integers**: all integers of either signedness, and any width up to 64 bits +* **Booleans** +* **Real numbers:** single-precision (32-bit) and double-precision (64-bit) reals +* **Complex numbers:** single-precision (2x32-bit) and double-precision (2x64-bit) complex numbers +* **Datetimes and timestamps:** of any unit +* **Character sequences** (but no operations are available on them) +* **Structured scalars:** structured scalars made of any of the types above and arrays of the types above + +The following scalar types and features are not supported: + +* **Arbitrary Python objects** +* **Half-precision and extended-precision** real and complex numbers +* **Nested structured scalars** the fields of structured scalars may not contain other structured scalars + +The operations supported on NumPy scalars are almost the same as on the +equivalent built-in types such as ``int`` or ``float``. You can use a type's +constructor to convert from a different type or width. In addition you can use +the ``view(np.)`` method to bitcast all ``int`` and ``float`` types +within the same width. However, you must define the scalar using a NumPy +constructor within a jitted function. For example, the following will work: + +.. code:: pycon + + >>> import numpy as np + >>> from numba import njit + >>> @njit + ... def bitcast(): + ... i = np.int64(-1) + ... print(i.view(np.uint64)) + ... + >>> bitcast() + 18446744073709551615 + + +Whereas the following will not work: + + +.. code:: pycon + + >>> import numpy as np + >>> from numba import njit + >>> @njit + ... def bitcast(i): + ... print(i.view(np.uint64)) + ... + >>> bitcast(np.int64(-1)) + --------------------------------------------------------------------------- + TypingError Traceback (most recent call last) + ... 
+ TypingError: Failed in nopython mode pipeline (step: ensure IR is legal prior to lowering) + 'view' can only be called on NumPy dtypes, try wrapping the variable with 'np.()' + + File "", line 3: + def bitcast(i): + print(i.view(np.uint64)) + +Structured scalars support attribute getting and setting, as well as +member lookup using constant strings. Strings stored in a local or global tuple +are considered constant strings and can be used for member lookup. + + + +.. literalinclude:: ../../../numba/tests/doc_examples/test_rec_array.py + :language: python + :start-after: magictoken.ex_rec_arr_const_index.begin + :end-before: magictoken.ex_rec_arr_const_index.end + :dedent: 8 + +It is also possible to use local or global tuples together with ``literal_unroll``: + +.. literalinclude:: ../../../numba/tests/doc_examples/test_rec_array.py + :language: python + :start-after: magictoken.ex_rec_arr_lit_unroll_index.begin + :end-before: magictoken.ex_rec_arr_lit_unroll_index.end + :dedent: 8 + + +Record subtyping +---------------- +.. warning:: + This is an experimental feature. + +Numba allows `width subtyping `_ of structured scalars. +For example, ``dtype([('a', 'f8'), ('b', 'i8')])`` will be considered a subtype of ``dtype([('a', 'f8')]``, because +the second is a strict subset of the first, i.e. field ``a`` is of the same type and is in the same position in both +types. The subtyping relationship will matter in cases where compilation for a certain input is not allowed, but the +input is a subtype of another, allowed type. + +.. code-block:: python + + import numpy as np + from numba import njit, typeof + from numba.core import types + record1 = np.array([1], dtype=[('a', 'f8')])[0] + record2 = np.array([(2,3)], dtype=[('a', 'f8'), ('b', 'f8')])[0] + + @njit(types.float64(typeof(record1))) + def foo(rec): + return rec['a'] + + foo(record1) + foo(record2) + +Without subtyping the last line would fail. 
With subtyping, no new compilation will be triggered, but the +compiled function for ``record1`` will be used for ``record2``. + +.. seealso:: + `NumPy scalars `_ + reference. + + +Array types +=========== + +`NumPy arrays `_ +of any of the scalar types above are supported, regardless of the shape +or layout. + +Array access +------------ + +Arrays support normal iteration. Full basic indexing and slicing is +supported. A subset of advanced indexing is also supported: only one +advanced index is allowed, and it has to be a one-dimensional array +(it can be combined with an arbitrary number of basic indices as well). + +.. seealso:: + `NumPy indexing `_ + reference. + + +.. _structured-array-access: + +Structured array access +----------------------- + +Numba presently supports accessing fields of individual elements in structured +arrays by attribute as well as by getting and setting. This goes slightly +beyond the NumPy API, which only allows accessing fields by getting and +setting. For example: + +.. code:: python + + from numba import njit + import numpy as np + + record_type = np.dtype([("ival", np.int32), ("fval", np.float64)], align=True) + + def f(rec): + value = 2.5 + rec[0].ival = int(value) + rec[0].fval = value + return rec + + arr = np.ones(1, dtype=record_type) + + cfunc = njit(f) + + # Works + print(cfunc(arr)) + + # Does not work + print(f(arr)) + +The above code results in the output: + +.. code:: none + + [(2, 2.5)] + Traceback (most recent call last): + File "repro.py", line 22, in + print(f(arr)) + File "repro.py", line 9, in f + rec[0].ival = int(value) + AttributeError: 'numpy.void' object has no attribute 'ival' + +The Numba-compiled version of the function executes, but the pure Python +version raises an error because of the unsupported use of attribute access. + +.. note:: + This behavior will eventually be deprecated and removed. 
+ +Attributes +---------- + +The following attributes of NumPy arrays are supported: + +* :attr:`~numpy.ndarray.dtype` +* :attr:`~numpy.ndarray.flags` +* :attr:`~numpy.ndarray.flat` +* :attr:`~numpy.ndarray.itemsize` +* :attr:`~numpy.ndarray.ndim` +* :attr:`~numpy.ndarray.shape` +* :attr:`~numpy.ndarray.size` +* :attr:`~numpy.ndarray.strides` +* :attr:`~numpy.ndarray.T` +* :attr:`~numpy.ndarray.real` +* :attr:`~numpy.ndarray.imag` + +The ``flags`` object +'''''''''''''''''''' + +The object returned by the :attr:`~numpy.ndarray.flags` attribute supports +the ``contiguous``, ``c_contiguous`` and ``f_contiguous`` attributes. + +The ``flat`` object +''''''''''''''''''' + +The object returned by the :attr:`~numpy.ndarray.flat` attribute supports +iteration and indexing, but be careful: indexing is very slow on +non-C-contiguous arrays. + +The ``real`` and ``imag`` attributes +'''''''''''''''''''''''''''''''''''' + +NumPy supports these attributes regardless of the dtype but Numba chooses to +limit their support to avoid potential user error. For numeric dtypes, +Numba follows NumPy's behavior. The :attr:`~numpy.ndarray.real` attribute +returns a view of the real part of the complex array and it behaves as an identity +function for other numeric dtypes. The :attr:`~numpy.ndarray.imag` attribute +returns a view of the imaginary part of the complex array and it returns a zero +array with the same shape and dtype for other numeric dtypes. For non-numeric +dtypes, including all structured/record dtypes, using these attributes will +result in a compile-time (`TypingError`) error. This behavior differs from +NumPy's but it is chosen to avoid the potential confusion with field names that +overlap these attributes. 
+ +Calculation +----------- + +The following methods of NumPy arrays are supported in their basic form +(without any optional arguments): + +* :meth:`~numpy.ndarray.all` +* :meth:`~numpy.ndarray.any` +* :meth:`~numpy.ndarray.clip` +* :meth:`~numpy.ndarray.conj` +* :meth:`~numpy.ndarray.conjugate` +* :meth:`~numpy.ndarray.cumprod` +* :meth:`~numpy.ndarray.cumsum` +* :meth:`~numpy.ndarray.max` +* :meth:`~numpy.ndarray.mean` +* :meth:`~numpy.ndarray.min` +* :meth:`~numpy.ndarray.nonzero` +* :meth:`~numpy.ndarray.prod` +* :meth:`~numpy.ndarray.std` +* :meth:`~numpy.ndarray.take` +* :meth:`~numpy.ndarray.var` + +The corresponding top-level NumPy functions (such as :func:`numpy.prod`) +are similarly supported. + +Other methods +------------- + +The following methods of NumPy arrays are supported: + +* :meth:`~numpy.ndarray.argmax` (``axis`` keyword argument supported). +* :meth:`~numpy.ndarray.argmin` (``axis`` keyword argument supported). +* :meth:`~numpy.ndarray.argsort` (``kind`` key word argument supported for + values ``'quicksort'`` and ``'mergesort'``) +* :meth:`~numpy.ndarray.astype` (only the 1-argument form) +* :meth:`~numpy.ndarray.copy` (without arguments) +* :meth:`~numpy.ndarray.dot` (only the 1-argument form) +* :meth:`~numpy.ndarray.flatten` (no order argument; 'C' order only) +* :meth:`~numpy.ndarray.item` (without arguments) +* :meth:`~numpy.ndarray.itemset` (only the 1-argument form) +* :meth:`~numpy.ndarray.ptp` (without arguments) +* :meth:`~numpy.ndarray.ravel` (no order argument; 'C' order only) +* :meth:`~numpy.ndarray.repeat` (no axis argument) +* :meth:`~numpy.ndarray.reshape` (only the 1-argument form) +* :meth:`~numpy.ndarray.sort` (without arguments) +* :meth:`~numpy.ndarray.sum` (with or without the ``axis`` and/or ``dtype`` + arguments.) + + * ``axis`` only supports ``integer`` values. + * If the ``axis`` argument is a compile-time constant, all valid values + are supported. 
+ An out-of-range value will result in a ``LoweringError`` at compile-time. + * If the ``axis`` argument is not a compile-time constant, only values + from 0 to 3 are supported. + An out-of-range value will result in a runtime exception. + * All numeric ``dtypes`` are supported in the ``dtype`` parameter. + ``timedelta`` arrays can be used as input arrays but ``timedelta`` is not + supported as ``dtype`` parameter. + * When a ``dtype`` is given, it determines the type of the internal + accumulator. When it is not, the selection is made automatically based on + the input array's ``dtype``, mostly following the same rules as NumPy. + However, on 64-bit Windows, Numba uses a 64-bit accumulator for integer + inputs (``int64`` for ``int32`` inputs and ``uint64`` for ``uint32`` + inputs), while NumPy would use a 32-bit accumulator in those cases. + + +* :meth:`~numpy.ndarray.transpose` +* :meth:`~numpy.ndarray.view` (only the 1-argument form) +* :meth:`~numpy.ndarray.__contains__` + +Where applicable, the corresponding top-level NumPy functions (such as +:func:`numpy.argmax`) are similarly supported. + +.. warning:: + Sorting may be slightly slower than NumPy's implementation. + + +Functions +========= + +Linear algebra +-------------- + +Basic linear algebra is supported on 1-D and 2-D contiguous arrays of +floating-point and complex numbers: + +* :func:`numpy.dot` +* :func:`numpy.kron` ('C' and 'F' order only) +* :func:`numpy.outer` +* :func:`numpy.trace` (only the first argument). +* :func:`numpy.vdot` +* On Python 3.5 and above, the matrix multiplication operator from + :pep:`465` (i.e. ``a @ b`` where ``a`` and ``b`` are 1-D or 2-D arrays). +* :func:`numpy.linalg.cholesky` +* :func:`numpy.linalg.cond` (only non string values in ``p``). +* :func:`numpy.linalg.det` +* :func:`numpy.linalg.eig` (only running with data that does not cause a domain + change is supported e.g. real input -> real + output, complex input -> complex output). 
+* :func:`numpy.linalg.eigh` (only the first argument). +* :func:`numpy.linalg.eigvals` (only running with data that does not cause a + domain change is supported e.g. real input -> real output, + complex input -> complex output). +* :func:`numpy.linalg.eigvalsh` (only the first argument). +* :func:`numpy.linalg.inv` +* :func:`numpy.linalg.lstsq` +* :func:`numpy.linalg.matrix_power` +* :func:`numpy.linalg.matrix_rank` +* :func:`numpy.linalg.norm` (only the 2 first arguments and only non string + values in ``ord``). +* :func:`numpy.linalg.pinv` +* :func:`numpy.linalg.qr` (only the first argument). +* :func:`numpy.linalg.slogdet` +* :func:`numpy.linalg.solve` +* :func:`numpy.linalg.svd` (only the 2 first arguments). + +.. note:: + The implementation of these functions needs SciPy to be installed. + +Reductions +---------- + +The following reduction functions are supported: + +* :func:`numpy.diff` (only the 2 first arguments) +* :func:`numpy.median` (only the first argument) +* :func:`numpy.nancumprod` (only the first argument) +* :func:`numpy.nancumsum` (only the first argument) +* :func:`numpy.nanmax` (only the first argument) +* :func:`numpy.nanmean` (only the first argument) +* :func:`numpy.nanmedian` (only the first argument) +* :func:`numpy.nanmin` (only the first argument) +* :func:`numpy.nanpercentile` (only the 2 first arguments, complex dtypes + unsupported) +* :func:`numpy.nanquantile` (only the 2 first arguments, complex dtypes + unsupported) +* :func:`numpy.nanprod` (only the first argument) +* :func:`numpy.nanstd` (only the first argument) +* :func:`numpy.nansum` (only the first argument) +* :func:`numpy.nanvar` (only the first argument) +* :func:`numpy.percentile` (only the 2 first arguments, complex dtypes + unsupported) +* :func:`numpy.quantile` (only the 2 first arguments, complex dtypes + unsupported) + +Other functions +--------------- + +The following top-level functions are supported: + +* :func:`numpy.allclose` +* :func:`numpy.append` +* 
:func:`numpy.arange` +* :func:`numpy.argsort` (``kind`` key word argument supported for values + ``'quicksort'`` and ``'mergesort'``) +* :func:`numpy.argwhere` +* :func:`numpy.array` (only the 2 first arguments) +* :func:`numpy.array_equal` +* :func:`numpy.array_split` +* :func:`numpy.asarray` (only the 2 first arguments) +* :func:`numpy.asarray_chkfinite` (only the 2 first arguments) +* :func:`numpy.asfarray` +* :func:`numpy.asfortranarray` (only the first argument) +* :func:`numpy.atleast_1d` +* :func:`numpy.atleast_2d` +* :func:`numpy.atleast_3d` +* :func:`numpy.bartlett` +* :func:`numpy.bincount` +* :func:`numpy.blackman` +* :func:`numpy.broadcast_to` (only the 2 first arguments) +* :func:`numpy.broadcast_arrays` (only the first argument) +* :func:`numpy.broadcast_shapes` +* :func:`numpy.column_stack` +* :func:`numpy.concatenate` +* :func:`numpy.convolve` (only the 2 first arguments) +* :func:`numpy.copy` (only the first argument) +* :func:`numpy.corrcoef` (only the 3 first arguments, requires SciPy) +* :func:`numpy.correlate` (only the 2 first arguments) +* :func:`numpy.count_nonzero` (axis only supports scalar values) +* :func:`numpy.cov` (only the 5 first arguments) +* :func:`numpy.cross` (only the 2 first arguments; at least one of the input + arrays should have ``shape[-1] == 3``) + + * If ``shape[-1] == 2`` for both inputs, please replace your + :func:`numpy.cross` call with :func:`numba.np.extensions.cross2d`. 
+ +* :func:`numpy.delete` (only the 2 first arguments) +* :func:`numpy.diag` +* :func:`numpy.digitize` +* :func:`numpy.dstack` +* :func:`numpy.dtype` (only the first argument) +* :func:`numpy.ediff1d` +* :func:`numpy.empty` (only the 2 first arguments) +* :func:`numpy.empty_like` (only the 2 first arguments) +* :func:`numpy.expand_dims` +* :func:`numpy.extract` +* :func:`numpy.eye` +* :func:`numpy.fill_diagonal` +* :func:`numpy.flatten` (no order argument; 'C' order only) +* :func:`numpy.flatnonzero` +* :func:`numpy.flip` (no axis argument) +* :func:`numpy.fliplr` +* :func:`numpy.flipud` +* :func:`numpy.frombuffer` (only the 2 first arguments) +* :func:`numpy.full` (only the 3 first arguments) +* :func:`numpy.full_like` (only the 3 first arguments) +* :func:`numpy.hamming` +* :func:`numpy.hanning` +* :func:`numpy.histogram` (only the 3 first arguments) +* :func:`numpy.hstack` +* :func:`numpy.identity` +* :func:`numpy.kaiser` +* :func:`numpy.iscomplex` +* :func:`numpy.iscomplexobj` +* :func:`numpy.isneginf` +* :func:`numpy.isposinf` +* :func:`numpy.isreal` +* :func:`numpy.isrealobj` +* :func:`numpy.isscalar` +* :func:`numpy.interp` (only the 3 first arguments) +* :func:`numpy.intersect1d` (only first 2 arguments, ar1 and ar2) +* :func:`numpy.linspace` (only the 3-argument form) +* :func:`numpy.logspace` (only the 3 first arguments) +* :class:`numpy.ndenumerate` +* :class:`numpy.ndindex` +* :class:`numpy.nditer` (only the first argument) +* :func:`numpy.ones` (only the 2 first arguments) +* :func:`numpy.ones_like` (only the 2 first arguments) +* :func:`numpy.partition` (only the 2 first arguments) +* :func:`numpy.ptp` (only the first argument) +* :func:`numpy.ravel` (no order argument; 'C' order only) +* :func:`numpy.repeat` (no axis argument) +* :func:`numpy.reshape` (no order argument; 'C' order only) +* :func:`numpy.roll` (only the 2 first arguments; second argument ``shift`` + must be an integer) +* :func:`numpy.roots` +* :func:`numpy.rot90` (only the 2 first 
arguments) +* :func:`numpy.round_` +* :func:`numpy.searchsorted` (only the 3 first arguments) +* :func:`numpy.select` (only using homogeneous lists or tuples for the first + two arguments, condlist and choicelist). Additionally, these two arguments + can only contain arrays (unlike NumPy that also accepts tuples). +* :func:`numpy.shape` +* :func:`numpy.sinc` +* :func:`numpy.sort` (no optional arguments, quicksort accepts + multi-dimensional array and sorts its last axis). +* :func:`numpy.split` +* :func:`numpy.stack` +* :func:`numpy.swapaxes` +* :func:`numpy.take` (only the 2 first arguments) +* :func:`numpy.take_along_axis` (the axis argument must be a literal value) +* :func:`numpy.transpose` +* :func:`numpy.trapz` (only the 3 first arguments) +* :func:`numpy.tri` (only the 3 first arguments; third argument ``k`` must be an integer) +* :func:`numpy.tril` (second argument ``k`` must be an integer) +* :func:`numpy.tril_indices` (all arguments must be integer) +* :func:`numpy.tril_indices_from` (second argument ``k`` must be an integer) +* :func:`numpy.triu` (second argument ``k`` must be an integer) +* :func:`numpy.triu_indices` (all arguments must be integer) +* :func:`numpy.triu_indices_from` (second argument ``k`` must be an integer) +* :func:`numpy.unique` (only the first argument) +* :func:`numpy.vander` +* :func:`numpy.vstack` +* :func:`numpy.where` +* :func:`numpy.zeros` (only the 2 first arguments) +* :func:`numpy.zeros_like` (only the 2 first arguments) + +The following constructors are supported, both with a numeric input (to +construct a scalar) or a sequence (to construct an array): + +* :class:`numpy.bool_` +* :class:`numpy.complex64` +* :class:`numpy.complex128` +* :class:`numpy.float32` +* :class:`numpy.float64` +* :class:`numpy.int8` +* :class:`numpy.int16` +* :class:`numpy.int32` +* :class:`numpy.int64` +* :class:`numpy.intc` +* :class:`numpy.intp` +* :class:`numpy.uint8` +* :class:`numpy.uint16` +* :class:`numpy.uint32` +* :class:`numpy.uint64` +* 
:class:`numpy.uintc` +* :class:`numpy.uintp` + +The following machine parameter classes are supported, with all purely numerical +attributes: + +* :class:`numpy.iinfo` +* :class:`numpy.finfo` (``machar`` attribute not supported) +* :class:`numpy.MachAr` (with no arguments to the constructor) + + +Literal arrays +-------------- + +.. XXX should this be part of the user's guide? + +Neither Python nor Numba has actual array literals, but you can construct +arbitrary arrays by calling :func:`numpy.array` on a nested tuple:: + + a = numpy.array(((a, b, c), (d, e, f))) + +(nested lists are not yet supported by Numba) + + +Modules +======= + +.. _numpy-random: + +``random`` +---------- + +Generator Objects +''''''''''''''''' +Numba supports :py:class:`numpy.random.Generator()` objects. As of version 0.56, users can pass +individual NumPy :py:class:`Generator` objects into Numba functions and use their +methods inside the functions. The same algorithms are used as in NumPy for +random number generation, hence maintaining parity between the random +numbers generated using NumPy and Numba under identical arguments +(also the same documentation notes as NumPy :py:class:`Generator` methods apply). +The current Numba support for :py:class:`Generator` is not thread-safe, hence we +do not recommend using :py:class:`Generator` methods in functions with parallel +execution logic. + +.. note:: + NumPy's :py:class:`Generator` objects rely on :py:class:`BitGenerator` to manage state + and generate the random bits, which are then transformed into random + values from useful distributions. Numba will ``unbox`` the :py:class:`Generator` objects + and will maintain a reference to the underlying :py:class:`BitGenerator` objects using NumPy's + ``ctypes`` interface bindings. Hence :py:class:`Generator` objects can cross the JIT boundary + and their methods can be used within Numba-JIT code.
Note that since only references + to :py:class:`BitGenerator` objects are maintained, any change to the state of a particular + :py:class:`Generator` object outside Numba code would affect the state of :py:class:`Generator` + inside the Numba code. + +.. literalinclude:: ../../../numba/tests/doc_examples/test_numpy_generators.py + :language: python + :start-after: magictoken.npgen_usage.begin + :end-before: magictoken.npgen_usage.end + :dedent: 8 + +The following :py:class:`Generator` methods are supported: + +* :func:`numpy.random.Generator().random()` + +RandomState and legacy Random number generation +''''''''''''''''''''''''''''''''''''''''''''''' + +Numba supports top-level functions from the +`numpy.random <https://numpy.org/doc/stable/reference/random/index.html>`_ +module, but does not allow you to create individual RandomState instances. +The same algorithms are used as for :ref:`the standard +random module <pysupported-random>` (and therefore the same notes apply), +but with an independent internal state: seeding or drawing numbers from +one generator won't affect the other. + +The following functions are supported. + +Initialization +'''''''''''''' + +* :func:`numpy.random.seed`: with an integer argument only + +.. warning:: + Calling :func:`numpy.random.seed` from interpreted code (including from :term:`object mode` + code) will seed the NumPy random generator, not the Numba random generator. + To seed the Numba random generator, see the example below. + +..
code-block:: python + + from numba import njit + import numpy as np + + @njit + def seed(a): + np.random.seed(a) + + @njit + def rand(): + return np.random.rand() + + + # Incorrect seeding + np.random.seed(1234) + print(rand()) + + np.random.seed(1234) + print(rand()) + + # Correct seeding + seed(1234) + print(rand()) + + seed(1234) + print(rand()) + + + + +Simple random data +'''''''''''''''''' + +* :func:`numpy.random.rand` +* :func:`numpy.random.randint` (only the first two arguments) +* :func:`numpy.random.randn` +* :func:`numpy.random.random` +* :func:`numpy.random.random_sample` +* :func:`numpy.random.ranf` +* :func:`numpy.random.sample` + +Permutations +'''''''''''' + +* :func:`numpy.random.choice`: the optional *p* argument (probabilities + array) is not supported +* :func:`numpy.random.permutation` +* :func:`numpy.random.shuffle`: the sequence argument must be a one-dimension + NumPy array or buffer-providing object (such as a :class:`bytearray` + or :class:`array.array`) + +Distributions +''''''''''''' + +The following functions support all arguments. 
+ +* :func:`numpy.random.beta` +* :func:`numpy.random.binomial` +* :func:`numpy.random.chisquare` +* :func:`numpy.random.dirichlet` +* :func:`numpy.random.exponential` +* :func:`numpy.random.f` +* :func:`numpy.random.gamma` +* :func:`numpy.random.geometric` +* :func:`numpy.random.gumbel` +* :func:`numpy.random.hypergeometric` +* :func:`numpy.random.laplace` +* :func:`numpy.random.logistic` +* :func:`numpy.random.lognormal` +* :func:`numpy.random.logseries` +* :func:`numpy.random.multinomial` +* :func:`numpy.random.negative_binomial` +* :func:`numpy.random.noncentral_chisquare` +* :func:`numpy.random.normal` +* :func:`numpy.random.pareto` +* :func:`numpy.random.poisson` +* :func:`numpy.random.power` +* :func:`numpy.random.rayleigh` +* :func:`numpy.random.standard_cauchy` +* :func:`numpy.random.standard_exponential` +* :func:`numpy.random.standard_gamma` +* :func:`numpy.random.standard_normal` +* :func:`numpy.random.standard_t` +* :func:`numpy.random.triangular` +* :func:`numpy.random.uniform` +* :func:`numpy.random.vonmises` +* :func:`numpy.random.wald` +* :func:`numpy.random.weibull` +* :func:`numpy.random.zipf` + +.. note:: + Calling :func:`numpy.random.seed` from non-Numba code (or from + :term:`object mode` code) will seed the NumPy random generator, not the + Numba random generator. + +.. note:: + Since version 0.28.0, the generator is thread-safe and fork-safe. Each + thread and each process will produce independent streams of random numbers. + + +``stride_tricks`` +----------------- + +The following function from the :mod:`numpy.lib.stride_tricks` module +is supported: + +* :func:`~numpy.lib.stride_tricks.as_strided` (the *strides* argument + is mandatory, the *subok* argument is not supported) + +.. _supported_ufuncs: + +Standard ufuncs +=============== + +One objective of Numba is having all the +`standard ufuncs in NumPy <https://numpy.org/doc/stable/reference/ufuncs.html#available-ufuncs>`_ +understood by Numba. When a supported ufunc is found when compiling a +function, Numba maps the ufunc to equivalent native code.
This allows the +use of those ufuncs in Numba code that gets compiled in :term:`nopython mode`. + +Limitations +----------- + +Right now, only a selection of the standard ufuncs work in :term:`nopython mode`. +Following is a list of the different standard ufuncs that Numba is aware of, +sorted in the same way as in the NumPy documentation. + + +Math operations +--------------- + +============== ============= =============== + UFUNC MODE +-------------- ------------------------------ + name object mode nopython mode +============== ============= =============== + add Yes Yes + subtract Yes Yes + multiply Yes Yes + divide Yes Yes + logaddexp Yes Yes + logaddexp2 Yes Yes + true_divide Yes Yes + floor_divide Yes Yes + negative Yes Yes + power Yes Yes + float_power Yes Yes + remainder Yes Yes + mod Yes Yes + fmod Yes Yes + divmod (*) Yes Yes + abs Yes Yes + absolute Yes Yes + fabs Yes Yes + rint Yes Yes + sign Yes Yes + conj Yes Yes + exp Yes Yes + exp2 Yes Yes + log Yes Yes + log2 Yes Yes + log10 Yes Yes + expm1 Yes Yes + log1p Yes Yes + sqrt Yes Yes + square Yes Yes + cbrt Yes Yes + reciprocal Yes Yes + conjugate Yes Yes + gcd Yes Yes + lcm Yes Yes +============== ============= =============== + +(\*) not supported on timedelta types + +Trigonometric functions +----------------------- + +============== ============= =============== + UFUNC MODE +-------------- ------------------------------ + name object mode nopython mode +============== ============= =============== + sin Yes Yes + cos Yes Yes + tan Yes Yes + arcsin Yes Yes + arccos Yes Yes + arctan Yes Yes + arctan2 Yes Yes + hypot Yes Yes + sinh Yes Yes + cosh Yes Yes + tanh Yes Yes + arcsinh Yes Yes + arccosh Yes Yes + arctanh Yes Yes + deg2rad Yes Yes + rad2deg Yes Yes + degrees Yes Yes + radians Yes Yes +============== ============= =============== + + +Bit-twiddling functions +----------------------- + +============== ============= =============== + UFUNC MODE +-------------- ------------------------------ + 
name object mode nopython mode +============== ============= =============== + bitwise_and Yes Yes + bitwise_or Yes Yes + bitwise_xor Yes Yes + bitwise_not Yes Yes + invert Yes Yes + left_shift Yes Yes + right_shift Yes Yes +============== ============= =============== + + +Comparison functions +-------------------- + +============== ============= =============== + UFUNC MODE +-------------- ------------------------------ + name object mode nopython mode +============== ============= =============== + greater Yes Yes + greater_equal Yes Yes + less Yes Yes + less_equal Yes Yes + not_equal Yes Yes + equal Yes Yes + logical_and Yes Yes + logical_or Yes Yes + logical_xor Yes Yes + logical_not Yes Yes + maximum Yes Yes + minimum Yes Yes + fmax Yes Yes + fmin Yes Yes +============== ============= =============== + + +Floating functions +------------------ + +============== ============= =============== + UFUNC MODE +-------------- ------------------------------ + name object mode nopython mode +============== ============= =============== + isfinite Yes Yes + isinf Yes Yes + isnan Yes Yes + signbit Yes Yes + copysign Yes Yes + nextafter Yes Yes + modf Yes No + ldexp Yes (*) Yes + frexp Yes No + floor Yes Yes + ceil Yes Yes + trunc Yes Yes + spacing Yes Yes +============== ============= =============== + +(\*) not supported on windows 32 bit + + +Datetime functions +------------------ + +============== ============= =============== + UFUNC MODE +-------------- ------------------------------ + name object mode nopython mode +============== ============= =============== + isnat Yes Yes +============== ============= =============== diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/pysemantics.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/pysemantics.rst new file mode 100644 index 0000000000000000000000000000000000000000..296d76b081e7881d7c564ed1a5b1ec3c0574ba19 --- /dev/null +++ 
b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/pysemantics.rst @@ -0,0 +1,88 @@ +.. _pysemantics: + +Deviations from Python Semantics +================================ + +Bounds Checking +--------------- + +By default, instead of causing an :class:`IndexError`, accessing an +out-of-bound index of an array in a Numba-compiled function will return +invalid values or lead to an access violation error (it's reading from +invalid memory locations). Bounds checking can be enabled on a specific +function via the :ref:`boundscheck <jit-decorator-boundscheck>` +option of the jit decorator. Additionally, the :envvar:`NUMBA_BOUNDSCHECK` +environment variable can be set to 0 or 1 to globally override this flag. + +.. note:: + Bounds checking will slow down typical functions so it is recommended to only + use this flag for debugging purposes. + +Exceptions and Memory Allocation +-------------------------------- + +Due to limitations in the current compiler when handling exceptions, memory +allocated (almost always NumPy arrays) within a function that raises an +exception will **leak**. This is a known issue that will be fixed, but in the +meantime, it is best to do memory allocation outside of functions that can +also raise exceptions. + +Integer width +------------- + +While Python has arbitrary-sized integers, integers in Numba-compiled +functions get a fixed size through :term:`type inference` (usually, +the size of a machine integer). This means that arithmetic +operations can wrap around or produce undefined results or overflow. + +Type inference can be overridden by an explicit type specification, +if fine-grained control of integer width is desired. + +..
seealso:: + :ref:`Enhancement proposal 1: Changes in integer typing <nbep-1>` + + +Boolean inversion +----------------- + +Calling the bitwise complement operator (the ``~`` operator) on a Python +boolean returns an integer, while the same operator on a NumPy boolean +returns another boolean:: + + >>> ~True + -2 + >>> ~np.bool_(True) + False + +Numba follows the NumPy semantics. + + +Global and closure variables +---------------------------- + +In :term:`nopython mode`, global and closure variables are *frozen* by +Numba: a Numba-compiled function sees the value of those variables at the +time the function was compiled. Also, it is not possible to change their +values from the function. + +Numba **may or may not** copy global variables referenced inside a compiled +function. Small global arrays are copied for potential compiler optimization +under an immutability assumption. However, large global arrays are not copied to +conserve memory. The definition of "small" and "large" may change. + + +Zero initialization of variables +-------------------------------- + +Numba does not track variable liveness at runtime. For simplicity of +implementation, all variables are zero-initialized. Example:: + + from numba import njit + + @njit + def foo(): + for i in range(0): + pass + print(i) # will print 0 and not raise UnboundLocalError + + foo() diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/pysupported.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/pysupported.rst new file mode 100644 index 0000000000000000000000000000000000000000..f0a6e45d0be77e1b3bc0a8731ce384a54f4502bc --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/pysupported.rst @@ -0,0 +1,1284 @@ +..
_pysupported: + +========================= +Supported Python features +========================= + +Apart from the :ref:`pysupported-language` part below, which applies to both +:term:`object mode` and :term:`nopython mode`, this page only lists the +features supported in :term:`nopython mode`. + +.. warning:: + Numba behavior differs from Python semantics in some situations. We + strongly advise reviewing :ref:`pysemantics` to become familiar with these + differences. + + +.. _pysupported-language: + +Language +======== + +Constructs +---------- + +Numba strives to support as much of the Python language as possible, but +some language features are not available inside Numba-compiled functions. +Below is a quick reference for the support level of Python constructs. + + +**Supported** constructs: + +- conditional branch: ``if .. elif .. else`` +- loops: ``while``, ``for .. in``, ``break``, ``continue`` +- basic generator: ``yield`` +- assertion: ``assert`` + +**Partially supported** constructs: + +- exceptions: ``try .. except``, ``raise``, ``else`` and ``finally`` + (See details in this :ref:`section <pysupported-exception-handling>`) + +- context manager: + ``with`` (only supports :ref:`numba.objmode() <with_objmode>`) + +- list comprehension (see details in this + :ref:`section <pysupported-comprehension>`) + +**Unsupported** constructs: + +- async features: ``async with``, ``async for`` and ``async def`` +- class definition: ``class`` (except for :ref:`@jitclass <jitclass>`) +- set, dict and generator comprehensions +- generator delegation: ``yield from`` + +Functions +--------- + +Function calls +'''''''''''''' + +Numba supports function calls using positional and named arguments, as well +as arguments with default values and ``*args`` (note the argument for +``*args`` can only be a tuple, not a list). Explicit ``**kwargs`` are +not supported. + +Function calls to locally defined inner functions are supported as long as +they can be fully inlined.
+ +Functions as arguments +'''''''''''''''''''''' + +Functions can be passed as argument into another function. But, they cannot +be returned. For example: + +.. code-block:: python + + from numba import jit + + @jit + def add1(x): + return x + 1 + + @jit + def bar(fn, x): + return fn(x) + + @jit + def foo(x): + return bar(add1, x) + + # Passing add1 within numba compiled code. + print(foo(1)) + # Passing add1 into bar from interpreted code + print(bar(add1, 1)) + +.. note:: Numba does not handle function objects as real objects. Once a + function is assigned to a variable, the variable cannot be + re-assigned to a different function. + + +Inner function and closure +''''''''''''''''''''''''''' + +Numba now supports inner functions as long as they are non-recursive +and only called locally, but not passed as argument or returned as +result. The use of closure variables (variables defined in outer scopes) +within an inner function is also supported. + +Recursive calls +''''''''''''''' + +Most recursive call patterns are supported. The only restriction is that the +recursive callee must have a control-flow path that returns without recursing. +Numba is able to type-infer recursive functions without specifying the function +type signature (which is required in numba 0.28 and earlier). +Recursive calls can even call into a different overload of the function. + +.. XXX add reference to NBEP + +Generators +---------- + +Numba supports generator functions and is able to compile them in +:term:`object mode` and :term:`nopython mode`. The returned generator +can be used both from Numba-compiled code and from regular Python code. + +Coroutine features of generators are not supported (i.e. the +:meth:`generator.send`, :meth:`generator.throw`, :meth:`generator.close` +methods). + +.. 
_pysupported-exception-handling: + +Exception handling +------------------ + +``raise`` statement +''''''''''''''''''' + +The ``raise`` statement is only supported in the following forms: + +* ``raise SomeException`` +* ``raise SomeException()``: in :term:`nopython mode`, constructor + arguments must be :term:`compile-time constants <compile-time constant>` + +It is currently unsupported to re-raise an exception created in compiled code. + +``try .. except`` +''''''''''''''''' + +The ``try .. except`` construct is partially supported. The following forms +are supported: + +* the *bare* except that captures all exceptions: + + .. code-block:: python + + try: + ... + except: + ... + +* using exactly the ``Exception`` class in the ``except`` clause: + + .. code-block:: python + + try: + ... + except Exception: + ... + + This will match any exception that is a subclass of ``Exception`` as + expected. Currently, instances of ``Exception`` and its subclasses are the + only kind of exception that can be raised in compiled code. + +.. warning:: Numba currently masks signals like ``KeyboardInterrupt`` and + ``SystemExit``. These signaling exceptions are ignored during the execution of + Numba-compiled code. The Python interpreter will handle them as soon as + the control is returned to it. + +Currently, exception objects are not materialized inside compiled functions. +As a result, it is not possible to store an exception object into a user +variable or to re-raise an exception. With this limitation, the only realistic +use-case would look like: + +.. code-block:: python + + try: + do_work() + except Exception: + handle_error_case() + return error_code + +``try .. except .. else .. finally`` +'''''''''''''''''''''''''''''''''''' + +The ``else`` block and the ``finally`` block of a ``try .. except`` are +supported: + + .. code-block:: python + + >>> @jit(nopython=True) + ... def foo(): + ... try: + ... print('main block') + ... except Exception: + ... print('handler block') + ... else: + ...
print('else block') + ... finally: + ... print('final block') + ... + >>> foo() + main block + else block + final block + +The ``try .. finally`` construct without the ``except`` clause is also +supported. + +.. _pysupported-builtin-types: + +Built-in types +============== + +int, bool +--------- + +Arithmetic operations as well as truth values are supported. + +The following attributes and methods are supported: + +* ``.conjugate()`` +* ``.real`` +* ``.imag`` + +float, complex +-------------- + +Arithmetic operations as well as truth values are supported. + +The following attributes and methods are supported: + +* ``.conjugate()`` +* ``.real`` +* ``.imag`` + +str +--- + +Numba supports (Unicode) strings in Python 3. Strings can be passed into +:term:`nopython mode` as arguments, as well as constructed and returned from +:term:`nopython mode`. As in Python, slices (even of length 1) return a new, +reference counted string. Optimized code paths for efficiently accessing +single characters may be introduced in the future. + +The in-memory representation is the same as was introduced in Python 3.4, with +each string having a tag to indicate whether the string is using a 1, 2, or 4 +byte character width in memory. When strings of different encodings are +combined (as in concatenation), the resulting string automatically uses the +larger character width of the two input strings. String slices also use the +same character width as the original string, even if the slice could be +represented with a narrower character width. (These details are invisible to +the user, of course.) 
+ +The following constructors, functions, attributes and methods are currently +supported: + +* ``str(int)`` +* ``len()`` +* ``+`` (concatenation of strings) +* ``*`` (repetition of strings) +* ``in``, ``.contains()`` +* ``==``, ``<``, ``<=``, ``>``, ``>=`` (comparison) +* ``.capitalize()`` +* ``.casefold()`` +* ``.center()`` +* ``.count()`` +* ``.endswith()`` +* ``.endswith()`` +* ``.expandtabs()`` +* ``.find()`` +* ``.index()`` +* ``.isalnum()`` +* ``.isalpha()`` +* ``.isdecimal()`` +* ``.isdigit()`` +* ``.isidentifier()`` +* ``.islower()`` +* ``.isnumeric()`` +* ``.isprintable()`` +* ``.isspace()`` +* ``.istitle()`` +* ``.isupper()`` +* ``.join()`` +* ``.ljust()`` +* ``.lower()`` +* ``.lstrip()`` +* ``.partition()`` +* ``.replace()`` +* ``.rfind()`` +* ``.rindex()`` +* ``.rjust()`` +* ``.rpartition()`` +* ``.rsplit()`` +* ``.rstrip()`` +* ``.split()`` +* ``.splitlines()`` +* ``.startswith()`` +* ``.strip()`` +* ``.swapcase()`` +* ``.title()`` +* ``.upper()`` +* ``.zfill()`` + +Regular string literals (e.g. ``"ABC"``) as well as f-strings without format specs +(e.g. ``"ABC_{a+1}"``) +that only use string and integer variables (types with ``str()`` overload) +are supported in :term:`nopython mode`. + +Additional operations as well as support for Python 2 strings / Python 3 bytes +will be added in a future version of Numba. Python 2 Unicode objects will +likely never be supported. + +.. warning:: + The performance of some operations is known to be slower than the CPython + implementation. These include substring search (``in``, ``.contains()`` + and ``find()``) and string creation (like ``.split()``). Improving the + string performance is an ongoing task, but the speed of CPython is + unlikely to be surpassed for basic string operation in isolation. + Numba is most successfully used for larger algorithms that happen to + involve strings, where basic string operations are not the bottleneck. 
+ + +tuple +----- + +Tuple support is categorised into two categories based on the contents of a +tuple. The first category is homogeneous tuples, these are tuples where the type +of all the values in the tuple are the same, the second is heterogeneous tuples, +these are tuples where the types of the values are different. + +.. note:: + + The ``tuple()`` constructor itself is NOT supported. + +homogeneous tuples +------------------ + +An example of a homogeneous tuple: + +.. code-block:: python + + homogeneous_tuple = (1, 2, 3, 4) + +The following operations are supported on homogeneous tuples: + +* Tuple construction. +* Tuple unpacking. +* Comparison between tuples. +* Iteration and indexing. +* Addition (concatenation) between tuples. +* Slicing tuples with a constant slice. +* The index method on tuples. + +heterogeneous tuples +-------------------- + +An example of a heterogeneous tuple: + +.. code-block:: python + + heterogeneous_tuple = (1, 2j, 3.0, "a") + +The following operations are supported on heterogeneous tuples: + +* Comparison between tuples. +* Indexing using an index value that is a compile time constant + e.g. ``mytuple[7]``, where ``7`` is evidently a constant. +* Iteration over a tuple (requires experimental :func:`literal_unroll` feature, + see below). + +.. warning:: + The following feature (:func:`literal_unroll`) is experimental and was added + in version 0.47. + +To permit iteration over a heterogeneous tuple the special function +:func:`numba.literal_unroll` must be used. This function has no effect other +than to act as a token to permit the use of this feature. Example use: + +.. code-block:: python + + from numba import njit, literal_unroll + + @njit + def foo(): + heterogeneous_tuple = (1, 2j, 3.0, "a") + for i in literal_unroll(heterogeneous_tuple): + print(i) + +.. 
warning:: + The following restrictions apply to the use of :func:`literal_unroll`: + + * :func:`literal_unroll` can only be used on tuples and constant lists of + compile time constants, e.g. ``[1, 2j, 3, "a"]`` and the list not being + mutated. + * The only supported use pattern for :func:`literal_unroll` is loop + iteration. + * Only one :func:`literal_unroll` call is permitted per loop nest (i.e. + nested heterogeneous tuple iteration loops are forbidden). + * The usual type inference/stability rules still apply. + +A more involved use of :func:`literal_unroll` might be type specific dispatch, +recall that string and integer literal values are considered their own type, +for example: + +.. code-block:: python + + from numba import njit, types, literal_unroll + from numba.extending import overload + + def dt(x): + # dummy function to overload + pass + + @overload(dt, inline='always') + def ol_dt(li): + if isinstance(li, types.StringLiteral): + value = li.literal_value + if value == "apple": + def impl(li): + return 1 + elif value == "orange": + def impl(li): + return 2 + elif value == "banana": + def impl(li): + return 3 + return impl + elif isinstance(li, types.IntegerLiteral): + value = li.literal_value + if value == 0xca11ab1e: + def impl(li): + # capture the dispatcher literal value + return 0x5ca1ab1e + value + return impl + + @njit + def foo(): + acc = 0 + for t in literal_unroll(('apple', 'orange', 'banana', 3390155550)): + acc += dt(t) + return acc + + print(foo()) + + +list +---- + + +.. warning:: + As of version 0.45.x the internal implementation for the list datatype in + Numba is changing. Until recently, only a single implementation of the list + datatype was available, the so-called *reflected-list* (see below). + However, it was scheduled for deprecation from version 0.44.0 onwards due + to its limitations. As of version 0.45.0 a new implementation, the + so-called *typed-list* (see below), is available as an experimental + feature. 
For more information, please see: :ref:`deprecation`. + +Creating and returning lists from JIT-compiled functions is supported, +as well as all methods and operations. Lists must be strictly homogeneous: +Numba will reject any list containing objects of different types, even if +the types are compatible (for example, ``[1, 2.5]`` is rejected as it +contains an :class:`int` and a :class:`float`). + +For example, to create a list of arrays:: + + In [1]: from numba import njit + + In [2]: import numpy as np + + In [3]: @njit + ...: def foo(x): + ...: lst = [] + ...: for i in range(x): + ...: lst.append(np.arange(i)) + ...: return lst + ...: + + In [4]: foo(4) + Out[4]: [array([], dtype=int64), array([0]), array([0, 1]), array([0, 1, 2])] + + +.. _feature-reflected-list: + +List Reflection +''''''''''''''' + +In nopython mode, Numba does not operate on Python objects. ``list`` objects are +compiled into an internal representation. Any ``list`` arguments must be +converted into this representation on the way in to nopython mode and their +contained elements must be restored into the original Python objects via a +process called :term:`reflection`. Reflection is required to maintain the same +semantics as found in regular Python code. However, the reflection process +can be expensive for large lists and it is not supported for lists that contain +reflected data types. Users cannot use list-of-list as an argument because +of this limitation. + +.. note:: + When passing a list into a JIT-compiled function, any modifications + made to the list will not be visible to the Python interpreter until + the function returns. (A limitation of the reflection process.) + +.. warning:: + List sorting currently uses a quicksort algorithm, which has different + performance characteristics than the algorithm used by Python. + +.. _feature-list-initial-value: + +Initial Values +'''''''''''''' +.. warning:: + This is an experimental feature! 
+ +Lists that: + +* Are constructed using the square bracket syntax +* Have values of a literal type + +will have their initial value stored in the ``.initial_value`` property on the +type so as to permit inspection of these values at compile time. If required, +to force value based dispatch the :ref:`literally <developer-literally>` +function will accept such a list. + +Example: + +.. literalinclude:: ../../../numba/tests/doc_examples/test_literal_container_usage.py + :language: python + :caption: from ``test_ex_initial_value_list_compile_time_consts`` of ``numba/tests/doc_examples/test_literal_container_usage.py`` + :start-after: magictoken.test_ex_initial_value_list_compile_time_consts.begin + :end-before: magictoken.test_ex_initial_value_list_compile_time_consts.end + :dedent: 12 + :linenos: + +.. _feature-typed-list: + +Typed List +'''''''''' + +.. note:: + ``numba.typed.List`` is an experimental feature; if you encounter any bugs in + functionality or suffer from unexpectedly bad performance, please report + this, ideally by opening an issue on the Numba issue tracker. + +As of version 0.45.0 a new implementation of the list data type is available, +the so-called *typed-list*. This is a compiled-library-backed, type-homogeneous +list data type that is an improvement over the *reflected-list* mentioned +above. Additionally, lists can now be arbitrarily nested. Since the +implementation is considered experimental, you will need to import it +explicitly from the ``numba.typed`` module:: + + In [1]: from numba.typed import List + + In [2]: from numba import njit + + In [3]: @njit + ...: def foo(l): + ...: l.append(23) + ...: return l + ...: + + In [4]: mylist = List() + + In [5]: mylist.append(1) + + In [6]: foo(mylist) + Out[6]: ListType[int64]([1, 23]) + + +.. note:: + As the typed-list stabilizes it will fully replace the reflected-list and the + constructors ``[]`` and ``list()`` will create a typed-list instead of a + reflected one. 
+ + +Here's an example using ``List()`` to create ``numba.typed.List`` inside a +jit-compiled function and letting the compiler infer the item type: + +.. literalinclude:: ../../../numba/tests/doc_examples/test_typed_list_usage.py + :language: python + :caption: from ``ex_inferred_list_jit`` of ``numba/tests/doc_examples/test_typed_list_usage.py`` + :start-after: magictoken.ex_inferred_list_jit.begin + :end-before: magictoken.ex_inferred_list_jit.end + :dedent: 12 + :linenos: + +Here's an example of using ``List()`` to create a ``numba.typed.List`` outside of +a jit-compiled function and then using it as an argument to a jit-compiled +function: + +.. literalinclude:: ../../../numba/tests/doc_examples/test_typed_list_usage.py + :language: python + :caption: from ``ex_inferred_list`` of ``numba/tests/doc_examples/test_typed_list_usage.py`` + :start-after: magictoken.ex_inferred_list.begin + :end-before: magictoken.ex_inferred_list.end + :dedent: 12 + :linenos: + +Finally, here's an example of using a nested `List()`: + +.. literalinclude:: ../../../numba/tests/doc_examples/test_typed_list_usage.py + :language: python + :caption: from ``ex_nested_list`` of ``numba/tests/doc_examples/test_typed_list_usage.py`` + :start-after: magictoken.ex_nested_list.begin + :end-before: magictoken.ex_nested_list.end + :dedent: 12 + :linenos: + +.. _feature-literal-list: + +Literal List +'''''''''''' + +.. warning:: + This is an experimental feature! + +Numba supports the use of literal lists containing any values, for example:: + + l = ['a', 1, 2j, np.zeros(5,)] + +the predominant use of these lists is for use as a configuration object. +The lists appear as a ``LiteralList`` type which inherits from ``Literal``, as a +result the literal values of the list items are available at compile time. +For example: + +.. 
literalinclude:: ../../../numba/tests/doc_examples/test_literal_container_usage.py + :language: python + :caption: from ``test_ex_literal_list`` of ``numba/tests/doc_examples/test_literal_container_usage.py`` + :start-after: magictoken.test_ex_literal_list.begin + :end-before: magictoken.test_ex_literal_list.end + :dedent: 12 + :linenos: + +Important things to note about these kinds of lists: + +#. They are immutable, use of mutating methods e.g. ``.pop()`` will result in + compilation failure. Read-only static access and read only methods are + supported e.g. ``len()``. +#. Dynamic access of items is not possible, e.g. ``some_list[x]``, for a + value ``x`` which is not a compile time constant. This is because it's + impossible to statically determine the type of the item being accessed. +#. Inside the compiler, these lists are actually just tuples with some extra + things added to make them look like they are lists. +#. They cannot be returned to the interpreter from a compiled function. + +.. _pysupported-comprehension: + +List comprehension +'''''''''''''''''' + +Numba supports list comprehension. For example:: + + + In [1]: from numba import njit + + In [2]: @njit + ...: def foo(x): + ...: return [[i for i in range(n)] for n in range(x)] + ...: + + In [3]: foo(3) + Out[3]: [[], [0], [0, 1]] + + +.. note:: + Prior to version 0.39.0, Numba did not support the creation of nested lists. + + +Numba also supports "array comprehension" that is a list comprehension +followed immediately by a call to :func:`numpy.array`. The following +is an example that produces a 2D Numpy array:: + + from numba import jit + import numpy as np + + @jit(nopython=True) + def f(n): + return np.array([ [ x * y for x in range(n) ] for y in range(n) ]) + +In this case, Numba is able to optimize the program to allocate and +initialize the result array directly without allocating intermediate +list objects. 
Therefore, the nesting of list comprehension here is +not a problem since a multi-dimensional array is being created here +instead of a nested list. + +Additionally, Numba supports parallel array comprehension when combined +with the :ref:`parallel_jit_option` option on CPUs. + +set +--- + +All methods and operations on sets are supported in JIT-compiled functions. + +Sets must be strictly homogeneous: Numba will reject any set containing +objects of different types, even if the types are compatible (for example, +``{1, 2.5}`` is rejected as it contains a :class:`int` and a :class:`float`). +The use of reference counted types, e.g. strings, in sets is unsupported. + +.. note:: + When passing a set into a JIT-compiled function, any modifications + made to the set will not be visible to the Python interpreter until + the function returns. + +.. _feature-typed-dict: + +Typed Dict +---------- + +.. warning:: + ``numba.typed.Dict`` is an experimental feature. The API may change + in the future releases. + +.. note:: + ``dict()`` was not supported in versions prior to 0.44. Currently, calling + ``dict()`` translates to calling ``numba.typed.Dict()``. + +Numba only supports the use of ``dict()`` without any arguments. Such use is +semantically equivalent to ``{}`` and ``numba.typed.Dict()``. It will create +an instance of ``numba.typed.Dict`` where the key-value types will be later +inferred by usage. + +Numba does not fully support the Python ``dict`` because it is an untyped +container that can have any Python types as members. To generate efficient +machine code, Numba needs the keys and the values of the dictionary to have +fixed types, declared in advance. To achieve this, Numba has a typed dictionary, +``numba.typed.Dict``, for which the type-inference mechanism must be able to +infer the key-value types by use, or the user must explicitly declare the +key-value type using the ``Dict.empty()`` constructor method. 
+This typed dictionary has the same API as the Python ``dict``: it implements +the ``collections.abc.MutableMapping`` interface and is usable in both interpreted +Python code and JIT-compiled Numba functions. +Because the typed dictionary stores keys and values in Numba's native, +unboxed data layout, passing a Numba dictionary into nopython mode has very low +overhead. However, this means that using a typed dictionary from the Python +interpreter is slower than a regular dictionary because Numba has to box and +unbox key and value objects when getting or setting items. + +An important difference of the typed dictionary in comparison to Python's +``dict`` is that **implicit casting** occurs when a key or value is stored. +As a result the *setitem* operation may fail should the type-casting fail. + +It should be noted that the Numba typed dictionary is implemented using the same +algorithm as the CPython 3.7 dictionary. As a consequence, the typed dictionary +is ordered and has the same collision resolution as the CPython implementation. + +Further to the above in relation to type specification, there are limitations +placed on the types that can be used as keys and/or values in the typed +dictionary; most notably the Numba ``Set`` and ``List`` types are currently +unsupported. Acceptable key/value types include but are not limited to: unicode +strings, arrays (value only), scalars, tuples. It is expected that these +limitations will be relaxed as Numba continues to improve. + +Here's an example of using ``dict()`` and ``{}`` to create ``numba.typed.Dict`` +instances and letting the compiler infer the key-value types: + +.. 
literalinclude:: ../../../numba/tests/doc_examples/test_typed_dict_usage.py + :language: python + :caption: from ``test_ex_inferred_dict_njit`` of ``numba/tests/doc_examples/test_typed_dict_usage.py`` + :start-after: magictoken.ex_inferred_dict_njit.begin + :end-before: magictoken.ex_inferred_dict_njit.end + :dedent: 12 + :linenos: + +Here's an example of creating a ``numba.typed.Dict`` instance from interpreted +code and using the dictionary in jit code: + +.. literalinclude:: ../../../numba/tests/doc_examples/test_typed_dict_usage.py + :language: python + :caption: from ``test_ex_typed_dict_from_cpython`` of ``numba/tests/doc_examples/test_typed_dict_usage.py`` + :start-after: magictoken.ex_typed_dict_from_cpython.begin + :end-before: magictoken.ex_typed_dict_from_cpython.end + :dedent: 12 + :linenos: + +Here's an example of creating a ``numba.typed.Dict`` instance from jit code and +using the dictionary in interpreted code: + +.. literalinclude:: ../../../numba/tests/doc_examples/test_typed_dict_usage.py + :language: python + :caption: from ``test_ex_typed_dict_njit`` of ``numba/tests/doc_examples/test_typed_dict_usage.py`` + :start-after: magictoken.ex_typed_dict_njit.begin + :end-before: magictoken.ex_typed_dict_njit.end + :dedent: 12 + :linenos: + +It should be noted that ``numba.typed.Dict`` is not thread-safe. +Specifically, functions which modify a dictionary from multiple +threads will potentially corrupt memory, causing a +range of possible failures. However, the dictionary can be safely read from +multiple threads as long as the contents of the dictionary do not +change during the parallel access. + +Dictionary comprehension +'''''''''''''''''''''''' + +Numba supports dictionary comprehension under the assumption that a +``numba.typed.Dict`` instance can be created from the comprehension. 
For +example:: + + In [1]: from numba import njit + + In [2]: @njit + ...: def foo(n): + ...: return {i: i**2 for i in range(n)} + ...: + + In [3]: foo(3) + Out[3]: DictType[int64,int64]({0: 0, 1: 1, 2: 4}) + +.. _feature-dict-initial-value: + +Initial Values +'''''''''''''' +.. warning:: + This is an experimental feature! + +Typed dictionaries that: + +* Are constructed using the curly braces syntax +* Have literal string keys +* Have values of a literal type + +will have their initial value stored in the ``.initial_value`` property on the +type so as to permit inspection of these values at compile time. If required, +to force value based dispatch the :ref:`literally ` +function will accept a typed dictionary. + +Example: + +.. literalinclude:: ../../../numba/tests/doc_examples/test_literal_container_usage.py + :language: python + :caption: from ``test_ex_initial_value_dict_compile_time_consts`` of ``numba/tests/doc_examples/test_literal_container_usage.py`` + :start-after: magictoken.test_ex_initial_value_dict_compile_time_consts.begin + :end-before: magictoken.test_ex_initial_value_dict_compile_time_consts.end + :dedent: 12 + :linenos: + +.. _feature-literal-str-key-dict: + +Heterogeneous Literal String Key Dictionary +------------------------------------------- + +.. warning:: + This is an experimental feature! + +Numba supports the use of statically declared string key to any value +dictionaries, for example:: + + d = {'a': 1, 'b': 'data', 'c': 2j} + +the predominant use of these dictionaries is to orchestrate advanced compilation +dispatch or as a container for use as a configuration object. The dictionaries +appear as a ``LiteralStrKeyDict`` type which inherits from ``Literal``, as a +result the literal values of the keys and the types of the items are available +at compile time. For example: + +.. 
literalinclude:: ../../../numba/tests/doc_examples/test_literal_container_usage.py + :language: python + :caption: from ``test_ex_literal_dict_compile_time_consts`` of ``numba/tests/doc_examples/test_literal_container_usage.py`` + :start-after: magictoken.test_ex_literal_dict_compile_time_consts.begin + :end-before: magictoken.test_ex_literal_dict_compile_time_consts.end + :dedent: 12 + :linenos: + +Important things to note about these kinds of dictionaries: + +#. They are immutable; use of mutating methods e.g. ``.pop()`` will result in + compilation failure. Read-only static access and read-only methods are + supported e.g. ``len()``. +#. Dynamic access of items is not possible, e.g. ``some_dictionary[x]``, for a + value ``x`` which is not a compile time constant. This is because it's + impossible to statically determine the type of the item being accessed. +#. Inside the compiler, these dictionaries are actually just named tuples with + some extra things added to make them look like they are dictionaries. +#. They cannot be returned to the interpreter from a compiled function. +#. The ``.keys()``, ``.values()`` and ``.items()`` methods all functionally + operate but return tuples as opposed to iterables. + +None +---- + +The None value is supported for identity testing (when using an +:class:`~numba.optional` type). + + +bytes, bytearray, memoryview +---------------------------- + +The :class:`bytearray` type and, on Python 3, the :class:`bytes` type +support indexing, iteration and retrieving the len(). 
+ +The :class:`memoryview` type supports indexing, slicing, iteration, +retrieving the len(), and also the following attributes: + +* :attr:`~memoryview.contiguous` +* :attr:`~memoryview.c_contiguous` +* :attr:`~memoryview.f_contiguous` +* :attr:`~memoryview.itemsize` +* :attr:`~memoryview.nbytes` +* :attr:`~memoryview.ndim` +* :attr:`~memoryview.readonly` +* :attr:`~memoryview.shape` +* :attr:`~memoryview.strides` + + +Built-in functions +================== + +The following built-in functions are supported: + +.. warning:: + Support for ``isinstance`` is an experimental feature. This feature is + automatically enabled by simply using ``isinstance`` in JIT compiled code. + +* :func:`abs` +* :class:`bool` +* :func:`chr` +* :class:`complex` +* :func:`divmod` +* :func:`enumerate` +* :func:`filter` +* :class:`float` +* :func:`hash` (see :ref:`pysupported-hashing` below) +* :class:`int`: only the one-argument form +* :func:`iter`: only the one-argument form +* :func:`isinstance` (experimental support only) +* :func:`len` +* :func:`min` +* :func:`map` +* :func:`max` +* :func:`next`: only the one-argument form +* :func:`ord` +* :func:`print`: only numbers and strings; no ``file`` or ``sep`` argument +* :class:`range`: The only permitted use of range is as a callable function + (cannot pass range as an argument to a jitted function or return a range from + a jitted function). +* :func:`round` +* :func:`sorted`: the ``key`` argument is not supported +* :func:`sum` +* :func:`type`: only the one-argument form, and only on some types + (e.g. numbers and named tuples) +* :func:`zip` + +.. _pysupported-hashing: + +Hashing +------- + +The :func:`hash` built-in is supported and produces hash values for all +supported hashable types with the following Python version specific behavior: + +Under Python 3, hash values computed by Numba will exactly match those computed +in CPython under the condition that the :attr:`sys.hash_info.algorithm` is +``siphash24`` (default). 
+ +The ``PYTHONHASHSEED`` environment variable influences the hashing behavior in +precisely the manner described in the CPython documentation. + + +Standard library modules +======================== + +``array`` +--------- + +Limited support for the :class:`array.array` type is provided through +the buffer protocol. Indexing, iteration and taking the len() is supported. +All type codes are supported except for ``"u"``. + +``cmath`` +--------- + +The following functions from the :mod:`cmath` module are supported: + +* :func:`cmath.acos` +* :func:`cmath.acosh` +* :func:`cmath.asin` +* :func:`cmath.asinh` +* :func:`cmath.atan` +* :func:`cmath.atanh` +* :func:`cmath.cos` +* :func:`cmath.cosh` +* :func:`cmath.exp` +* :func:`cmath.isfinite` +* :func:`cmath.isinf` +* :func:`cmath.isnan` +* :func:`cmath.log` +* :func:`cmath.log10` +* :func:`cmath.phase` +* :func:`cmath.polar` +* :func:`cmath.rect` +* :func:`cmath.sin` +* :func:`cmath.sinh` +* :func:`cmath.sqrt` +* :func:`cmath.tan` +* :func:`cmath.tanh` + +``collections`` +--------------- + +Named tuple classes, as returned by :func:`collections.namedtuple`, are +supported in the same way regular tuples are supported. Attribute access +and named parameters in the constructor are also supported. + +Creating a named tuple class inside Numba code is *not* supported; the class +must be created at the global level. + +.. _ctypes-support: + +``ctypes`` +---------- + +Numba is able to call ctypes-declared functions with the following argument +and return types: + +* :class:`ctypes.c_int8` +* :class:`ctypes.c_int16` +* :class:`ctypes.c_int32` +* :class:`ctypes.c_int64` +* :class:`ctypes.c_uint8` +* :class:`ctypes.c_uint16` +* :class:`ctypes.c_uint32` +* :class:`ctypes.c_uint64` +* :class:`ctypes.c_float` +* :class:`ctypes.c_double` +* :class:`ctypes.c_void_p` + +``enum`` +-------- + +Both :class:`enum.Enum` and :class:`enum.IntEnum` subclasses are supported. 
+ +``math`` +-------- + +The following functions from the :mod:`math` module are supported: + +* :func:`math.acos` +* :func:`math.acosh` +* :func:`math.asin` +* :func:`math.asinh` +* :func:`math.atan` +* :func:`math.atan2` +* :func:`math.atanh` +* :func:`math.ceil` +* :func:`math.copysign` +* :func:`math.cos` +* :func:`math.cosh` +* :func:`math.degrees` +* :func:`math.erf` +* :func:`math.erfc` +* :func:`math.exp` +* :func:`math.expm1` +* :func:`math.fabs` +* :func:`math.floor` +* :func:`math.frexp` +* :func:`math.gamma` +* :func:`math.gcd` +* :func:`math.hypot` +* :func:`math.isfinite` +* :func:`math.isinf` +* :func:`math.isnan` +* :func:`math.ldexp` +* :func:`math.lgamma` +* :func:`math.log` +* :func:`math.log10` +* :func:`math.log1p` +* :func:`math.pow` +* :func:`math.radians` +* :func:`math.sin` +* :func:`math.sinh` +* :func:`math.sqrt` +* :func:`math.tan` +* :func:`math.tanh` +* :func:`math.trunc` + +``operator`` +------------ + +The following functions from the :mod:`operator` module are supported: + +* :func:`operator.add` +* :func:`operator.and_` +* :func:`operator.eq` +* :func:`operator.floordiv` +* :func:`operator.ge` +* :func:`operator.gt` +* :func:`operator.iadd` +* :func:`operator.iand` +* :func:`operator.ifloordiv` +* :func:`operator.ilshift` +* :func:`operator.imatmul` (Python 3.5 and above) +* :func:`operator.imod` +* :func:`operator.imul` +* :func:`operator.invert` +* :func:`operator.ior` +* :func:`operator.ipow` +* :func:`operator.irshift` +* :func:`operator.isub` +* :func:`operator.itruediv` +* :func:`operator.ixor` +* :func:`operator.le` +* :func:`operator.lshift` +* :func:`operator.lt` +* :func:`operator.matmul` (Python 3.5 and above) +* :func:`operator.mod` +* :func:`operator.mul` +* :func:`operator.ne` +* :func:`operator.neg` +* :func:`operator.not_` +* :func:`operator.or_` +* :func:`operator.pos` +* :func:`operator.pow` +* :func:`operator.rshift` +* :func:`operator.sub` +* :func:`operator.truediv` +* :func:`operator.xor` + +``functools`` 
+------------- + +The :func:`functools.reduce` function is supported but the `initializer` +argument is required. + +.. _pysupported-random: + +``random`` +---------- + +Numba supports top-level functions from the :mod:`random` module, but does +not allow you to create individual Random instances. A Mersenne-Twister +generator is used, with a dedicated internal state. It is initialized at +startup with entropy drawn from the operating system. + +* :func:`random.betavariate` +* :func:`random.expovariate` +* :func:`random.gammavariate` +* :func:`random.gauss` +* :func:`random.getrandbits`: number of bits must not be greater than 64 +* :func:`random.lognormvariate` +* :func:`random.normalvariate` +* :func:`random.paretovariate` +* :func:`random.randint` +* :func:`random.random` +* :func:`random.randrange` +* :func:`random.seed`: with an integer argument only +* :func:`random.shuffle`: the sequence argument must be a one-dimension + Numpy array or buffer-providing object (such as a :class:`bytearray` + or :class:`array.array`); the second (optional) argument is not supported +* :func:`random.uniform` +* :func:`random.triangular` +* :func:`random.vonmisesvariate` +* :func:`random.weibullvariate` + +.. warning:: + Calling :func:`random.seed` from non-Numba code (or from :term:`object mode` + code) will seed the Python random generator, not the Numba random generator. + To seed the Numba random generator, see the example below. + +.. code-block:: python + + from numba import njit + import random + + @njit + def seed(a): + random.seed(a) + + @njit + def rand(): + return random.random() + + + # Incorrect seeding + random.seed(1234) + print(rand()) + + random.seed(1234) + print(rand()) + + # Correct seeding + seed(1234) + print(rand()) + + seed(1234) + print(rand()) + + +.. note:: + Since version 0.28.0, the generator is thread-safe and fork-safe. Each + thread and each process will produce independent streams of random numbers. + +.. 
seealso:: + Numba also supports most additional distributions from the :ref:`Numpy + random module `. + +``heapq`` +--------- + +The following functions from the :mod:`heapq` module are supported: + +* :func:`heapq.heapify` +* :func:`heapq.heappop` +* :func:`heapq.heappush` +* :func:`heapq.heappushpop` +* :func:`heapq.heapreplace` +* :func:`heapq.nlargest` : first two arguments only +* :func:`heapq.nsmallest` : first two arguments only + +Note: the heap must be seeded with at least one value to allow its type to be +inferred; heap items are assumed to be homogeneous in type. + + +Third-party modules +=================== + +.. I put this here as there's only one module (apart from Numpy), otherwise + it should be a separate page. + +.. _cffi-support: + +``cffi`` +-------- + +Similarly to ctypes, Numba is able to call into `cffi`_-declared external +functions, using the following C types and any derived pointer types: + +* :c:type:`char` +* :c:type:`short` +* :c:type:`int` +* :c:type:`long` +* :c:type:`long long` +* :c:type:`unsigned char` +* :c:type:`unsigned short` +* :c:type:`unsigned int` +* :c:type:`unsigned long` +* :c:type:`unsigned long long` +* :c:type:`int8_t` +* :c:type:`uint8_t` +* :c:type:`int16_t` +* :c:type:`uint16_t` +* :c:type:`int32_t` +* :c:type:`uint32_t` +* :c:type:`int64_t` +* :c:type:`uint64_t` +* :c:type:`float` +* :c:type:`double` +* :c:type:`ssize_t` +* :c:type:`size_t` +* :c:type:`void` + +The ``from_buffer()`` method of ``cffi.FFI`` and ``CompiledFFI`` objects is +supported for passing Numpy arrays and other buffer-like objects. Only +*contiguous* arguments are accepted. The argument to ``from_buffer()`` +is converted to a raw pointer of the appropriate C type (for example a +``double *`` for a ``float64`` array). + +Additional type mappings for the conversion from a buffer to the appropriate C +type may be registered with Numba. 
This may include struct types, though it is +only permitted to call functions that accept pointers to structs - passing a +struct by value is unsupported. For registering a mapping, use: + +.. function:: numba.core.typing.cffi_utils.register_type(cffi_type, numba_type) + +Out-of-line cffi modules must be registered with Numba prior to the use of any +of their functions from within Numba-compiled functions: + +.. function:: numba.core.typing.cffi_utils.register_module(mod) + + Register the cffi out-of-line module ``mod`` with Numba. + +Inline cffi modules require no registration. + +.. _cffi: https://cffi.readthedocs.org/ diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/types.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/types.rst new file mode 100644 index 0000000000000000000000000000000000000000..75ff343207845e880b095e2564dec1738505c84f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/types.rst @@ -0,0 +1,375 @@ +.. _numba-types: + +==================== +Types and signatures +==================== + +Rationale +========= + +As an optimizing compiler, Numba needs to decide on the type of each +variable to generate efficient machine code. Python's standard types +are not precise enough for that, so we had to develop our own fine-grained +type system. + +You will encounter Numba types mainly when trying to inspect the results +of Numba's type inference, for :ref:`debugging ` or +:ref:`educational ` purposes. However, you need to use +types explicitly if compiling code :ref:`ahead-of-time `. + + +Signatures +========== + +A signature specifies the type of a function. Exactly which kind +of signature is allowed depends on the context (:term:`AOT` or :term:`JIT` +compilation), but signatures always involve some representation of Numba +types to specify the concrete types for the function's arguments and, +if required, the function's return type. 
+ +An example function signature would be the string ``"f8(i4, i4)"`` +(or the equivalent ``"float64(int32, int32)"``) which specifies a +function taking two 32-bit integers and returning a double-precision float. + + +Basic types +=========== + +The most basic types can be expressed through simple expressions. The +symbols below refer to attributes of the main ``numba`` module (so if +you read "boolean", it means that symbol can be accessed as ``numba.boolean``). +Many types are available both as a canonical name and a shorthand alias, +following NumPy's conventions. + +Numbers +------- + +The following table contains the elementary numeric types currently defined +by Numba and their aliases. + +=================== ========= =================================== +Type name(s) Shorthand Comments +=================== ========= =================================== +boolean b1 represented as a byte +uint8, byte u1 8-bit unsigned byte +uint16 u2 16-bit unsigned integer +uint32 u4 32-bit unsigned integer +uint64 u8 64-bit unsigned integer + +int8, char i1 8-bit signed byte +int16 i2 16-bit signed integer +int32 i4 32-bit signed integer +int64 i8 64-bit signed integer + +intc -- C int-sized integer +uintc -- C int-sized unsigned integer +intp -- pointer-sized integer +uintp -- pointer-sized unsigned integer +ssize_t -- C ssize_t +size_t -- C size_t + +float32 f4 single-precision floating-point number +float64, double f8 double-precision floating-point number + +complex64 c8 single-precision complex number +complex128 c16 double-precision complex number +=================== ========= =================================== + +Arrays +------ + +The easy way to declare :class:`~numba.types.Array` types is to subscript an +elementary type according to the number of dimensions. 
For example a +1-dimension single-precision array:: + + >>> numba.float32[:] + array(float32, 1d, A) + +or a 3-dimension array of the same underlying type:: + + >>> numba.float32[:, :, :] + array(float32, 3d, A) + +This syntax defines array types with no particular layout (producing code +that accepts both non-contiguous and contiguous arrays), but you can +specify a particular contiguity by using the ``::1`` index either at +the beginning or the end of the index specification:: + + >>> numba.float32[::1] + array(float32, 1d, C) + >>> numba.float32[:, :, ::1] + array(float32, 3d, C) + >>> numba.float32[::1, :, :] + array(float32, 3d, F) + +Functions +--------- + +.. warning:: + The feature of considering functions as first-class type objects is + under development. + +Functions are often considered as certain transformations of +input arguments to output values. Within Numba :term:`JIT` compiled +functions, the functions can also be considered as objects, that is, +functions can be passed around as arguments or return values, or used +as items in sequences, in addition to being callable. + +First-class function support is enabled for all Numba :term:`JIT` +compiled functions and Numba ``cfunc`` compiled functions except when: + +- using a non-CPU compiler, +- the compiled function is a Python generator, +- the compiled function has Omitted arguments, +- or the compiled function returns Optional value. + +To disable first-class function support, use ``no_cfunc_wrapper=True`` +decorator option. + +For instance, consider an example where the Numba :term:`JIT` compiled +function applies user-specified functions as a composition to an input +argument:: + + >>> @numba.njit + ... def composition(funcs, x): + ... r = x + ... for f in funcs[::-1]: + ... r = f(r) + ... return r + ... + >>> @numba.cfunc("double(double)") + ... def a(x): + ... return x + 1.0 + ... + >>> @numba.njit + ... def b(x): + ... return x * x + ... 
+ >>> composition((a, b), 0.5), 0.5 ** 2 + 1 + (1.25, 1.25) + >>> composition((b, a, b, b, a), 0.5), b(a(b(b(a(0.5))))) + (36.75390625, 36.75390625) + +Here, the ``cfunc`` compiled function ``a`` and the :term:`JIT` compiled +function ``b`` are considered as first-class function objects because these +are passed in to the Numba :term:`JIT` compiled function ``composition`` as +arguments, that is, the ``composition`` is :term:`JIT` compiled independently +from its argument function objects (that are collected in the input argument +``funcs``). + +Currently, first-class function objects can be Numba ``cfunc`` compiled +functions, :term:`JIT` compiled functions, and objects that implement the +Wrapper Address Protocol (WAP, see below) with the following restrictions: + +======================== ============ ============== =========== +Context JIT compiled cfunc compiled WAP objects +======================== ============ ============== =========== +Can be used as arguments yes yes yes +Can be called yes yes yes +Can be used as items yes\* yes yes +Can be returned yes yes yes +Namespace scoping yes yes yes +Automatic overload yes no no +======================== ============ ============== =========== + +\* at least one of the items in a sequence of first-class function objects must +have a precise type. + + +Wrapper Address Protocol - WAP +++++++++++++++++++++++++++++++ + +Wrapper Address Protocol provides an API for making any Python object +a first-class function for Numba :term:`JIT` compiled functions. This assumes +that the Python object represents a compiled function that can be +called via its memory address (function pointer value) from Numba :term:`JIT` +compiled functions. The so-called WAP objects must define the +following two methods: + +.. method:: __wrapper_address__(self) -> int + + Return the memory address of a first-class function. This + method is used when a Numba :term:`JIT` compiled function tries to + call the given WAP instance. + +..
method:: signature(self) -> numba.typing.Signature + + Return the signature of the given first-class + function. This method is used when passing in the given + WAP instance to a Numba :term:`JIT` compiled function. + +In addition, the WAP object may implement the ``__call__`` +method. This is necessary when calling WAP objects from Numba +:term:`JIT` compiled functions in :term:`object mode`. + +As an example, let us call the standard math library function ``cos`` +within a Numba :term:`JIT` compiled function. The memory address of ``cos`` can +be established after loading the math library and using the ``ctypes`` +package:: + + >>> import numba, ctypes, ctypes.util, math + >>> libm = ctypes.cdll.LoadLibrary(ctypes.util.find_library('m')) + >>> class LibMCos(numba.types.WrapperAddressProtocol): + ... def __wrapper_address__(self): + ... return ctypes.cast(libm.cos, ctypes.c_voidp).value + ... def signature(self): + ... return numba.float64(numba.float64) + ... + >>> @numba.njit + ... def foo(f, x): + ... return f(x) + ... + >>> foo(LibMCos(), 0.0) + 1.0 + >>> foo(LibMCos(), 0.5), math.cos(0.5) + (0.8775825618903728, 0.8775825618903728) + +Miscellaneous Types +------------------- + +There are some non-numerical types that do not fit into the other categories. + +=================== ================================================= +Type name(s) Comments +=================== ================================================= +pyobject generic Python object +voidptr raw pointer, no operations can be performed on it +=================== ================================================= + +Advanced types +============== + +For more advanced declarations, you have to explicitly call helper +functions or classes provided by Numba. + +.. warning:: + The APIs documented here are not guaranteed to be stable. Unless + necessary, it is recommended to let Numba infer argument types by using + the :ref:`signature-less variant of @jit <jit-lazy>`. + +..
A word of note: I only documented those types that can be genuinely + useful to users, i.e. types that can be passed as parameters to a JIT + function. Other types such as tuple are only usable in type inference. + + +Inference +--------- + +.. function:: numba.typeof(value) + + Create a Numba type accurately describing the given Python *value*. + ``ValueError`` is raised if the value isn't supported in + :term:`nopython mode`. + + :: + + >>> numba.typeof(np.empty(3)) + array(float64, 1d, C) + >>> numba.typeof((1, 2.0)) + (int64, float64) + >>> numba.typeof([0]) + reflected list(int64) + + +NumPy scalars +------------- + +Instead of using :func:`~numba.typeof`, non-trivial scalars such as +structured types can also be constructed programmatically. + +.. function:: numba.from_dtype(dtype) + + Create a Numba type corresponding to the given NumPy *dtype*:: + + >>> struct_dtype = np.dtype([('row', np.float64), ('col', np.float64)]) + >>> ty = numba.from_dtype(struct_dtype) + >>> ty + Record([('row', '<f8'), ('col', '<f8')]) + >>> ty[:, :] + unaligned array(Record([('row', '<f8'), ('col', '<f8')]), 2d, A) + +.. class:: numba.types.NPDatetime(unit) + + Create a Numba type for NumPy datetimes of the given *unit*. *unit* + should be a string amongst the codes recognized by NumPy (e.g. + ``Y``, ``M``, ``D``, etc.). + +.. class:: numba.types.NPTimedelta(unit) + + Create a Numba type for NumPy timedeltas of the given *unit*. *unit* + should be a string amongst the codes recognized by NumPy (e.g. + ``Y``, ``M``, ``D``, etc.). + + .. seealso:: + NumPy `datetime units <https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units>`_. + + +Arrays +------ + +.. class:: numba.types.Array(dtype, ndim, layout) + + Create an array type. *dtype* should be a Numba type. *ndim* is the + number of dimensions of the array (a positive integer). *layout* + is a string giving the layout of the array: ``A`` means any layout, ``C`` + means C-contiguous and ``F`` means Fortran-contiguous. + + +Optional types +-------------- + +.. class:: numba.optional(typ) + + Create an optional type based on the underlying Numba type *typ*. + The optional type will allow any value of either *typ* or :const:`None`. + + :: + + >>> @jit((optional(intp),)) + ... def f(x): + ... return x is not None + ... + >>> f(0) + True + >>> f(None) + False + + +Type annotations +----------------- + +.. function:: numba.extending.as_numba_type(py_type) + + Create a Numba type corresponding to the given Python *type annotation*.
+ ``TypingError`` is raised if the type annotation can't be mapped to a Numba + type. This function is meant to be used statically, at compile time, to + evaluate Python type annotations. For runtime checking of Python objects + see ``typeof`` above. + + For any numba type, ``as_numba_type(nb_type) == nb_type``. + + >>> numba.extending.as_numba_type(int) + int64 + >>> import typing # the Python library, not the Numba one + >>> numba.extending.as_numba_type(typing.List[float]) + ListType[float64] + >>> numba.extending.as_numba_type(numba.int32) + int32 + + ``as_numba_type`` is automatically updated to include any ``@jitclass``. + + >>> @jitclass + ... class Counter: + ... x: int + ... + ... def __init__(self): + ... self.x = 0 + ... + ... def inc(self): + ... old_val = self.x + ... self.x += 1 + ... return old_val + ... + >>> numba.extending.as_numba_type(Counter) + instance.jitclass.Counter#11bad4278<x:int64> + + Currently ``as_numba_type`` is only used to infer fields for ``@jitclass``. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/utils.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/utils.rst new file mode 100644 index 0000000000000000000000000000000000000000..3fc866e5cc99cd61ca401af4234009d938629923 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/reference/utils.rst @@ -0,0 +1,34 @@ + +========= +Utilities +========= + +Dealing with pointers +===================== + +These functions can be called from pure Python as well as in +:term:`nopython mode`. + + +.. function:: numba.carray(ptr, shape, dtype=None) + + Return a Numpy array view over the data pointed to by *ptr* with the + given *shape*, in C order. If *dtype* is given, it is used as the array's + dtype, otherwise the array's dtype is inferred from *ptr*'s type. + As the returned array is a view, not a copy, writing to it will modify + the original data.
+ + *ptr* should be a ctypes pointer object (either a typed pointer + as created using :func:`~ctypes.POINTER`, or a :class:`~ctypes.c_void_p`). + + *shape* should be an integer or a tuple of integers. + + *dtype* should be a Numpy dtype or scalar class (i.e. both + ``np.dtype('int8')`` and ``np.int8`` are accepted). + + +.. function:: numba.farray(ptr, shape, dtype=None) + + Same as :func:`~numba.carray`, but the data is assumed to be laid out + in Fortran order, and the array view is constructed accordingly. + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/release-notes.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/release-notes.rst new file mode 100644 index 0000000000000000000000000000000000000000..103366e15907eab631739481c88184ae7f46ae67 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/release-notes.rst @@ -0,0 +1,5 @@ +====================== +Release Notes +====================== + +.. include:: ../../CHANGE_LOG diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/5minguide.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/5minguide.rst new file mode 100644 index 0000000000000000000000000000000000000000..4ab382a6b0c63c71318f22970f0bacb173a37244 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/5minguide.rst @@ -0,0 +1,223 @@ +.. _numba-5_mins: + +A ~5 minute guide to Numba +========================== + +Numba is a just-in-time compiler for Python that works best on code that uses +NumPy arrays and functions, and loops. The most common way to use Numba is +through its collection of decorators that can be applied to your functions to +instruct Numba to compile them. When a call is made to a Numba-decorated +function it is compiled to machine code "just-in-time" for execution and all or +part of your code can subsequently run at native machine code speed! 
+ +Out of the box Numba works with the following: + +* OS: Windows (32 and 64 bit), OSX, Linux (32 and 64 bit). Unofficial support on + \*BSD. +* Architecture: x86, x86_64, ppc64le, armv7l, armv8l (aarch64). Unofficial + support on M1/Arm64. +* GPUs: Nvidia CUDA. +* CPython +* NumPy 1.18 - latest + +How do I get it? +---------------- +Numba is available as a `conda <https://conda.io/docs/>`_ package for the +`Anaconda Python distribution <https://www.anaconda.com/>`_:: + + $ conda install numba + +Numba also has wheels available:: + + $ pip install numba + +Numba can also be +:ref:`compiled from source <numba-source-install-instructions>`, although we do +not recommend it for first-time Numba users. + +Numba is often used as a core package so its dependencies are kept to an +absolute minimum, however, extra packages can be installed as follows to provide +additional functionality: + +* ``scipy`` - enables support for compiling ``numpy.linalg`` functions. +* ``colorama`` - enables support for color highlighting in backtraces/error + messages. +* ``pyyaml`` - enables configuration of Numba via a YAML config file. +* ``icc_rt`` - allows the use of the Intel SVML (high performance short vector + math library, x86_64 only). Installation instructions are in the + :ref:`performance tips <intel-svml>`. + +Will Numba work for my code? +---------------------------- +This depends on what your code looks like, if your code is numerically +orientated (does a lot of math), uses NumPy a lot and/or has a lot of loops, +then Numba is often a good choice. In these examples we'll apply the most +fundamental of Numba's JIT decorators, ``@jit``, to try and speed up some +functions to demonstrate what works well and what does not.
+ +Numba works well on code that looks like this:: + + from numba import jit + import numpy as np + + x = np.arange(100).reshape(10, 10) + + @jit(nopython=True) # Set "nopython" mode for best performance, equivalent to @njit + def go_fast(a): # Function is compiled to machine code when called the first time + trace = 0.0 + for i in range(a.shape[0]): # Numba likes loops + trace += np.tanh(a[i, i]) # Numba likes NumPy functions + return a + trace # Numba likes NumPy broadcasting + + print(go_fast(x)) + + +It won't work very well, if at all, on code that looks like this:: + + from numba import jit + import pandas as pd + + x = {'a': [1, 2, 3], 'b': [20, 30, 40]} + + @jit + def use_pandas(a): # Function will not benefit from Numba jit + df = pd.DataFrame.from_dict(a) # Numba doesn't know about pd.DataFrame + df += 1 # Numba doesn't understand what this is + return df.cov() # or this! + + print(use_pandas(x)) + +Note that Pandas is not understood by Numba and as a result Numba would simply +run this code via the interpreter but with the added cost of the Numba internal +overheads! + +What is ``nopython`` mode? +-------------------------- +The Numba ``@jit`` decorator fundamentally operates in two compilation modes, +``nopython`` mode and ``object`` mode. In the ``go_fast`` example above, +``nopython=True`` is set in the ``@jit`` decorator; this is instructing Numba to +operate in ``nopython`` mode. The behaviour of the ``nopython`` compilation mode +is to essentially compile the decorated function so that it will run entirely +without the involvement of the Python interpreter. This is the recommended and +best-practice way to use the Numba ``jit`` decorator as it leads to the best +performance. + +Should the compilation in ``nopython`` mode fail, Numba can compile using +``object mode``. This is a fall back mode for the ``@jit`` decorator if +``nopython=True`` is not set (as seen in the ``use_pandas`` example above). 
In +this mode Numba will identify loops that it can compile and compile those into +functions that run in machine code, and it will run the rest of the code in the +interpreter. For best performance avoid using this mode! + +How to measure the performance of Numba? +---------------------------------------- +First, recall that Numba has to compile your function for the argument types +given before it executes the machine code version of your function. This takes +time. However, once the compilation has taken place Numba caches the machine +code version of your function for the particular types of arguments presented. +If it is called again with the same types, it can reuse the cached version +instead of having to compile again. + +A really common mistake when measuring performance is to not account for the +above behaviour and to time code once with a simple timer that includes the +time taken to compile your function in the execution time. + +For example:: + + from numba import jit + import numpy as np + import time + + x = np.arange(100).reshape(10, 10) + + @jit(nopython=True) + def go_fast(a): # Function is compiled and runs in machine code + trace = 0.0 + for i in range(a.shape[0]): + trace += np.tanh(a[i, i]) + return a + trace + + # DO NOT REPORT THIS... COMPILATION TIME IS INCLUDED IN THE EXECUTION TIME! 
+ start = time.perf_counter() + go_fast(x) + end = time.perf_counter() + print("Elapsed (with compilation) = {}s".format((end - start))) + + # NOW THE FUNCTION IS COMPILED, RE-TIME IT EXECUTING FROM CACHE + start = time.perf_counter() + go_fast(x) + end = time.perf_counter() + print("Elapsed (after compilation) = {}s".format((end - start))) + +This, for example, prints:: + + Elapsed (with compilation) = 0.33030009269714355s + Elapsed (after compilation) = 6.67572021484375e-06s + +A good way to measure the impact Numba JIT has on your code is to time execution +using the `timeit <https://docs.python.org/3/library/timeit.html>`_ module +functions; these measure multiple iterations of execution and, as a result, +can be made to accommodate for the compilation time in the first execution. + +As a side note, if compilation time is an issue, Numba JIT supports +:ref:`on-disk caching <jit-decorator-cache>` of compiled functions and also has +an :ref:`Ahead-Of-Time <pycc>` compilation mode. + +How fast is it? +--------------- +Assuming Numba can operate in ``nopython`` mode, or at least compile some loops, +it will target compilation to your specific CPU. Speed up varies depending on +application but can be one to two orders of magnitude. Numba has a +:ref:`performance guide <performance-tips>` that covers common options for +gaining extra performance. + +How does Numba work? +-------------------- +Numba reads the Python bytecode for a decorated function and combines this with +information about the types of the input arguments to the function. It analyzes +and optimizes your code, and finally uses the LLVM compiler library to generate +a machine code version of your function, tailored to your CPU capabilities. This +compiled version is then used every time your function is called. + +Other things of interest: +------------------------- +Numba has quite a few decorators, we've seen ``@jit``, but there's +also: + +* ``@njit`` - this is an alias for ``@jit(nopython=True)`` as it is so commonly + used!
+* ``@vectorize`` - produces NumPy ``ufunc`` s (with all the ``ufunc`` methods + supported). :ref:`Docs are here <vectorize>`. +* ``@guvectorize`` - produces NumPy generalized ``ufunc`` s. + :ref:`Docs are here <guvectorize>`. +* ``@stencil`` - declare a function as a kernel for a stencil like operation. + :ref:`Docs are here <numba-stencil>`. +* ``@jitclass`` - for jit aware classes. :ref:`Docs are here <jitclass>`. +* ``@cfunc`` - declare a function for use as a native call back (to be called + from C/C++ etc). :ref:`Docs are here <cfunc>`. +* ``@overload`` - register your own implementation of a function for use in + nopython mode, e.g. ``@overload(scipy.special.j0)``. + :ref:`Docs are here <high-level-extending>`. + +Extra options available in some decorators: + +* ``parallel = True`` - :ref:`enable <jit-decorator-parallel>` the + :ref:`automatic parallelization <numba-parallel>` of the function. +* ``fastmath = True`` - enable :ref:`fast-math <jit-decorator-fastmath>` + behaviour for the function. + +ctypes/cffi/cython interoperability: + +* ``cffi`` - The calling of :ref:`CFFI <cffi-support>` functions is supported + in ``nopython`` mode. +* ``ctypes`` - The calling of :ref:`ctypes <ctypes-support>` wrapped + functions is supported in ``nopython`` mode. +* Cython exported functions :ref:`are callable <cython-support>`. + +GPU targets: +~~~~~~~~~~~~ + +Numba can target `Nvidia CUDA <https://developer.nvidia.com/cuda-zone>`_ GPUs. +You can write a kernel in pure Python and have Numba handle the computation and +data movement (or do this explicitly). Click for Numba documentation on +:ref:`CUDA <cuda-index>`. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/cfunc.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/cfunc.rst new file mode 100644 index 0000000000000000000000000000000000000000..845dc96341afaf1456b2b54ba2d6d819f85d488d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/cfunc.rst @@ -0,0 +1,218 @@ +..
_cfunc: + +==================================== +Creating C callbacks with ``@cfunc`` +==================================== + +Interfacing with some native libraries (for example written in C or C++) +can necessitate writing native callbacks to provide business logic to the +library. The :func:`numba.cfunc` decorator creates a compiled function +callable from foreign C code, using the signature of your choice. + + +Basic usage +=========== + +The ``@cfunc`` decorator has a similar usage to ``@jit``, but with an +important difference: passing a single signature is mandatory. +It determines the visible signature of the C callback:: + + from numba import cfunc + + @cfunc("float64(float64, float64)") + def add(x, y): + return x + y + + +The C function object exposes the address of the compiled C callback as +the :attr:`~CFunc.address` attribute, so that you can pass it to any +foreign C or C++ library. It also exposes a :mod:`ctypes` callback +object pointing to that callback; that object is also callable from +Python, making it easy to check the compiled code:: + + @cfunc("float64(float64, float64)") + def add(x, y): + return x + y + + print(add.ctypes(4.0, 5.0)) # prints "9.0" + + +Example +======= + +In this example, we are going to be using the ``scipy.integrate.quad`` +function. That function accepts either a regular Python callback or +a C callback wrapped in a :mod:`ctypes` callback object. + +Let's define a pure Python integrand and compile it as a +C callback:: + + >>> import numpy as np + >>> from numba import cfunc + >>> def integrand(t): + return np.exp(-t) / t**2 + ...: + >>> nb_integrand = cfunc("float64(float64)")(integrand) + +We can pass the ``nb_integrand`` object's :mod:`ctypes` callback to +``scipy.integrate.quad`` and check that the results are the same as with +the pure Python function:: + + >>> import scipy.integrate as si + >>> def do_integrate(func): + """ + Integrate the given function from 1.0 to +inf. 
+ """ + return si.quad(func, 1, np.inf) + ...: + >>> do_integrate(integrand) + (0.14849550677592208, 3.8736750296130505e-10) + >>> do_integrate(nb_integrand.ctypes) + (0.14849550677592208, 3.8736750296130505e-10) + + +Using the compiled callback, the integration function does not invoke the +Python interpreter each time it evaluates the integrand. In our case, the +integration is made 18 times faster:: + + >>> %timeit do_integrate(integrand) + 1000 loops, best of 3: 242 µs per loop + >>> %timeit do_integrate(nb_integrand.ctypes) + 100000 loops, best of 3: 13.5 µs per loop + + +Dealing with pointers and array memory +====================================== + +A less trivial use case of C callbacks involves doing operation on some +array of data passed by the caller. As C doesn't have a high-level +abstraction similar to Numpy arrays, the C callback's signature will pass +low-level pointer and size arguments. Nevertheless, the Python code for +the callback will expect to exploit the power and expressiveness of Numpy +arrays. + +In the following example, the C callback is expected to operate on 2-d arrays, +with the signature ``void(double *input, double *output, int m, int n)``. +You can implement such a callback thusly:: + + from numba import cfunc, types, carray + + c_sig = types.void(types.CPointer(types.double), + types.CPointer(types.double), + types.intc, types.intc) + + @cfunc(c_sig) + def my_callback(in_, out, m, n): + in_array = carray(in_, (m, n)) + out_array = carray(out, (m, n)) + for i in range(m): + for j in range(n): + out_array[i, j] = 2 * in_array[i, j] + + +The :func:`numba.carray` function takes as input a data pointer and a shape +and returns an array view of the given shape over that data. The data is +assumed to be laid out in C order. If the data is laid out in Fortran order, +:func:`numba.farray` should be used instead. 
+ + +Handling C structures +===================== + + +With CFFI +--------- + +For applications that have a lot of state, it is useful to pass data in C +structures. To simplify the interoperability with C code, numba can convert +a ``cffi`` type into a numba ``Record`` type using +``numba.core.typing.cffi_utils.map_type``:: + + from numba.core.typing import cffi_utils + + nbtype = cffi_utils.map_type(cffi_type, use_record_dtype=True) + +.. note:: **use_record_dtype=True** is needed otherwise pointers to C + structures are returned as void pointers. + +.. note:: From v0.49 the ``numba.cffi_support`` module has been phased out + in favour of ``numba.core.typing.cffi_utils`` + + +For example:: + + from cffi import FFI + + src = """ + + /* Define the C struct */ + typedef struct my_struct { + int i1; + float f2; + double d3; + float af4[7]; // arrays are supported + } my_struct; + + /* Define a callback function */ + typedef double (*my_func)(my_struct*, size_t); + """ + + ffi = FFI() + ffi.cdef(src) + + # Get the function signature from *my_func* + sig = cffi_utils.map_type(ffi.typeof('my_func'), use_record_dtype=True) + + # Make the cfunc + from numba import cfunc, carray + + @cfunc(sig) + def foo(ptr, n): + base = carray(ptr, n) # view pointer as an array of my_struct + tmp = 0 + for i in range(n): + tmp += base[i].i1 * base[i].f2 / base[i].d3 + tmp += base[i].af4.sum() # nested arrays are like normal NumPy arrays + return tmp + + +With ``numba.types.Record.make_c_struct`` +----------------------------------------- + +The ``numba.types.Record`` type can be created manually to follow a +C-structure's layout. To do that, use ``Record.make_c_struct``, for example:: + + my_struct = types.Record.make_c_struct([ + # Provides a sequence of 2-tuples i.e. 
(name:str, type:Type) + ('i1', types.int32), + ('f2', types.float32), + ('d3', types.float64), + ('af4', types.NestedArray(dtype=types.float32, shape=(7,))), + ]) + +Due to ABI limitations, structures should be passed as pointers +using ``types.CPointer(my_struct)`` as the argument type. Inside the ``cfunc`` +body, the ``my_struct*`` can be accessed with ``carray``. + +Full example +------------ + +See full example in ``examples/notebooks/Accessing C Struct Data.ipynb``. + + +Signature specification +======================= + +The explicit ``@cfunc`` signature can use any :ref:`Numba types <numba-types>`, +but only a subset of them make sense for a C callback. You should +generally limit yourself to scalar types (such as ``int8`` or ``float64``), +pointers to them (for example ``types.CPointer(types.int8)``), or pointers +to ``Record`` type. + + +Compilation options +=================== + +A number of keyword-only arguments can be passed to the ``@cfunc`` +decorator: ``nopython`` and ``cache``. Their meaning is similar to those +in the ``@jit`` decorator. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/cli.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/cli.rst new file mode 100644 index 0000000000000000000000000000000000000000..3764c687f6e285f02791149053773f65a9d7d0bd --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/cli.rst @@ -0,0 +1,165 @@ +.. _cli: + +Command line interface +====================== + +Numba is a Python package; usually you ``import numba`` from Python and use the +Python application programming interface (API). However, Numba also ships with a +command line interface (CLI), i.e. a tool ``numba`` that is installed when you +install Numba. + +Currently, the only purpose of the CLI is to allow you to quickly show some +information about your system and installation, or to quickly get some debugging +information for a Python script using Numba. + +..
_cli_usage: + +Usage +----- + +To use the Numba CLI from the terminal, use ``numba`` followed by the options +and arguments like ``--help`` or ``-s``, as explained below. + +Sometimes it can happen that you get a "command not found" error when you type +``numba``, because your ``PATH`` isn't configured properly. In that case you can +use the equivalent command ``python -m numba``. If that still gives "command +not found", try to ``import numba`` as suggested here: +:ref:`numba-source-install-check`. + +The two versions ``numba`` and ``python -m numba`` are the same. The first is +shorter to type, but if you get a "command not found" error because your +``PATH`` doesn't contain the location where ``numba`` is installed, having the +``python -m numba`` variant is useful. + +To use the Numba CLI from IPython or Jupyter, use ``!numba``, i.e. prefix the +command with an exclamation mark. This is a general IPython/Jupyter feature to +execute shell commands, it is not available in the regular ``python`` terminal. + +.. _cli_help: + +Help +---- + +To see all available options, use ``numba --help``:: + + $ numba --help + usage: numba [-h] [--annotate] [--dump-llvm] [--dump-optimized] + [--dump-assembly] [--annotate-html ANNOTATE_HTML] [-s] + [--sys-json SYS_JSON] + [filename] + + positional arguments: + filename Python source filename + + optional arguments: + -h, --help show this help message and exit + --annotate Annotate source + --dump-llvm Print generated llvm assembly + --dump-optimized Dump the optimized llvm assembly + --dump-assembly Dump the LLVM generated assembly + --annotate-html ANNOTATE_HTML + Output source annotation as html + -s, --sysinfo Output system information for bug reporting + --sys-json SYS_JSON Saves the system info dict as a json file + + +.. 
_cli_sysinfo: + +System information +------------------ + +The ``numba -s`` (or the equivalent ``numba --sysinfo``) command prints a lot of +information about your system and your Numba installation and relevant +dependencies. + +Remember: you can use ``!numba -s`` with an exclamation mark to see this +information from IPython or Jupyter. + +Example output:: + + $ numba -s + + System info: + -------------------------------------------------------------------------------- + __Time Stamp__ + 2019-05-07 14:15:39.733994 + + __Hardware Information__ + Machine : x86_64 + CPU Name : haswell + CPU count : 8 + CPU Features : + aes avx avx2 bmi bmi2 cmov cx16 f16c fma fsgsbase invpcid lzcnt mmx movbe pclmul + popcnt rdrnd sahf sse sse2 sse3 sse4.1 sse4.2 ssse3 xsave xsaveopt + + __OS Information__ + Platform : Darwin-18.5.0-x86_64-i386-64bit + Release : 18.5.0 + System Name : Darwin + Version : Darwin Kernel Version 18.5.0: Mon Mar 11 20:40:32 PDT 2019; root:xnu-4903.251.3~3/RELEASE_X86_64 + OS specific info : 10.14.4 x86_64 + + __Python Information__ + Python Compiler : Clang 4.0.1 (tags/RELEASE_401/final) + Python Implementation : CPython + Python Version : 3.7.3 + Python Locale : en_US UTF-8 + + __LLVM information__ + LLVM version : 7.0.0 + + __CUDA Information__ + CUDA driver library cannot be found or no CUDA enabled devices are present. + Error class: + + __SVML Information__ + SVML state, config.USING_SVML : False + SVML library found and loaded : False + llvmlite using SVML patched LLVM : True + SVML operational : False + + __Threading Layer Information__ + TBB Threading layer available : False + +--> Disabled due to : Unknown import problem. + OpenMP Threading layer available : False + +--> Disabled due to : Unknown import problem. + Workqueue Threading layer available : True + + __Numba Environment Variable Information__ + None set. 
+ + __Conda Information__ + conda_build_version : 3.17.8 + conda_env_version : 4.6.14 + platform : osx-64 + python_version : 3.7.3.final.0 + root_writable : True + + __Current Conda Env__ + (output truncated due to length) + +.. _cli_debug: + +Debugging +--------- + +As shown in the help output above, the ``numba`` command includes options that +can help you to debug Numba compiled code. + +To try it out, create an example script called ``myscript.py``:: + + import numba + + @numba.jit + def f(x): + return 2 * x + + f(42) + +and then execute one of the following commands:: + + $ numba myscript.py --annotate + $ numba myscript.py --annotate-html myscript.html + $ numba myscript.py --dump-llvm + $ numba myscript.py --dump-optimized + $ numba myscript.py --dump-assembly diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/examples.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/examples.rst new file mode 100644 index 0000000000000000000000000000000000000000..2651c12ef63b3bda066b691254051ed1d01bac01 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/examples.rst @@ -0,0 +1,52 @@ +======== +Examples +======== + + +Mandelbrot +---------- + +.. literalinclude:: ../../../numba/tests/doc_examples/test_examples.py + :language: python + :caption: from ``test_mandelbrot`` of ``numba/tests/doc_examples/test_examples.py`` + :start-after: magictoken.ex_mandelbrot.begin + :end-before: magictoken.ex_mandelbrot.end + :dedent: 12 + :linenos: + +.. _example-movemean: + +Moving average +-------------- + +.. 
literalinclude:: ../../../numba/tests/doc_examples/test_examples.py + :language: python + :caption: from ``test_moving_average`` of ``numba/tests/doc_examples/test_examples.py`` + :start-after: magictoken.ex_moving_average.begin + :end-before: magictoken.ex_moving_average.end + :dedent: 12 + :linenos: + +Multi-threading +--------------- + +The code below showcases the potential performance improvement when +using the :ref:`nogil <jit-nogil>` feature. For example, on a 4-core machine, +the following results were printed:: + + numpy (1 thread) 145 ms + numba (1 thread) 128 ms + numba (4 threads) 35 ms + +.. note:: + If preferred it's possible to use the standard `concurrent.futures + <https://docs.python.org/3/library/concurrent.futures.html>`_ module + rather than spawn threads and dispatch tasks by hand. + +.. literalinclude:: ../../../numba/tests/doc_examples/test_examples.py + :language: python + :caption: from ``test_no_gil`` of ``numba/tests/doc_examples/test_examples.py`` + :start-after: magictoken.ex_no_gil.begin + :end-before: magictoken.ex_no_gil.end + :dedent: 12 + :linenos: diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/faq.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/faq.rst new file mode 100644 index 0000000000000000000000000000000000000000..b8fc588b0a55b55547eddf3f8d32ed2915ad3ca0 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/faq.rst @@ -0,0 +1,413 @@ + +========================== +Frequently Asked Questions +========================== + +Installation +============ + +Numba could not be imported +--------------------------- + +If you are seeing an exception on importing Numba with an error message +that starts with:: + + ImportError: Numba could not be imported. + +here are some common issues and things to try to fix it. + +#. Your installation has more than one version of Numba in a given environment. + + Common ways this occurs include: + + * Installing Numba with conda and then installing again with pip.
+ * Installing Numba with pip and then updating to a new version with pip (pip + re-installations don't seem to always clean up very well). + + To fix this the best approach is to create an entirely new environment and + install a single version of Numba in that environment using a package manager + of your choice. + +#. Your installation has Numba for Python version X but you are running with + Python version Y. + + This occurs due to a variety of Python environment mix-up/mismatch problems. + The most common mismatch comes from installing Numba into the + site-packages/environment of one version of Python by using a base or + system installation of Python that is a different version, this typically + happens through the use of the "wrong" ``pip`` binary. This will obviously + cause problems as the C-Extensions on which Numba relies are bound to + specific Python versions. A way to check if this is likely the problem is to + see if the path to the ``python`` binary at:: + + python -c 'import sys; print(sys.executable)' + + matches the path to your installation tool and/or matches the reported + installation location and if the Python versions match up across all of + these. Note that Python version ``X.Y.A`` is compatible with ``X.Y.B``. + + To fix this the best approach is to create an entirely new environment and + ensure that the installation tool used to install Numba is the one from that + environment/the Python versions at install and run time match. + +#. Your core system libraries are too old. + + This is a somewhat rare occurrence, but there are occasions when a very old + (typically out of support) version of Linux is in use and it doesn't have a + ``glibc`` library with sufficiently new versioned symbols for Numba's shared + libraries to resolve against. The fix for this is to update your OS system + libraries/update your OS. + +#. You are using an IDE e.g. Spyder.
+ + There are some unknown issues in relation to installing Numba via IDEs, but + it would appear that these are likely variations of 1. or 2. with the same + suggested fixes. Also, try installation from outside of the IDE with the + command line. + + +If you have an installation problem which is not one of the above problems, +please do ask on `numba.discourse.group `_ and +if possible include the path where Numba is installed and also the output of:: + + python -c 'import sys; print(sys.executable)' + + +Programming +=========== + +Can I pass a function as an argument to a jitted function? +---------------------------------------------------------- + +As of Numba 0.39, you can, so long as the function argument has also been +JIT-compiled:: + + @jit(nopython=True) + def f(g, x): + return g(x) + g(-x) + + result = f(jitted_g_function, 1) + +However, dispatching with arguments that are functions has extra overhead. +If this matters for your application, you can also use a factory function to +capture the function argument in a closure:: + + def make_f(g): + # Note: a new f() is created each time make_f() is called! + @jit(nopython=True) + def f(x): + return g(x) + g(-x) + return f + + f = make_f(jitted_g_function) + result = f(1) + +Improving the dispatch performance of functions in Numba is an ongoing task. + +Numba doesn't seem to care when I modify a global variable +---------------------------------------------------------- + +Numba considers global variables as compile-time constants. If you want +your jitted function to update itself when you have modified a global +variable's value, one solution is to recompile it using the +:meth:`~Dispatcher.recompile` method. This is a relatively slow operation, +though, so you may instead decide to rearchitect your code and turn the +global variable into a function argument. + +Can I debug a jitted function? 
+------------------------------ + +Calling into :mod:`pdb` or other such high-level facilities is currently not +supported from Numba-compiled code. However, you can temporarily disable +compilation by setting the :envvar:`NUMBA_DISABLE_JIT` environment +variable. + +How can I create a Fortran-ordered array? +----------------------------------------- + +Numba currently doesn't support the ``order`` argument to most Numpy +functions such as :func:`numpy.empty` (because of limitations in the +:term:`type inference` algorithm). You can work around this issue by +creating a C-ordered array and then transposing it. For example:: + + a = np.empty((3, 5), order='F') + b = np.zeros(some_shape, order='F') + +can be rewritten as:: + + a = np.empty((5, 3)).T + b = np.zeros(some_shape[::-1]).T + +How can I increase integer width? +--------------------------------- + +By default, Numba will generally use machine integer width for integer +variables. On a 32-bit machine, you may sometimes need the magnitude of +64-bit integers instead. You can simply initialize relevant variables as +``np.int64`` (for example ``np.int64(0)`` instead of ``0``). It will +propagate to all computations involving those variables. + +.. _parallel_faqs: + +How can I tell if ``parallel=True`` worked? +------------------------------------------- + +If the ``parallel=True`` transformations failed for a function +decorated as such, a warning will be displayed. See also +:ref:`numba-parallel-diagnostics` for information about parallel diagnostics. + +Performance +=========== + +Does Numba inline functions? +---------------------------- + +Numba gives enough information to LLVM so that functions short enough +can be inlined. This only works in :term:`nopython mode`. + +Does Numba vectorize array computations (SIMD)? +----------------------------------------------- + +Numba doesn't implement such optimizations by itself, but it lets LLVM +apply them. + +Why has my loop not vectorized? 
+------------------------------- + +Numba enables the loop-vectorize optimization in LLVM by default. +While it is a powerful optimization, not all loops are applicable. +Sometimes, loop-vectorization may fail due to subtle details like the memory access +pattern. To see additional diagnostic information from LLVM, +add the following lines: + +.. code-block:: python + + import llvmlite.binding as llvm + llvm.set_option('', '--debug-only=loop-vectorize') + +This tells LLVM to print debug information from the **loop-vectorize** +pass to stderr. Each function entry looks like: + + +.. note:: + Using ``--debug-only`` requires LLVM to be built with assertions enabled to + work. Use the build of llvmlite in the `Numba channel <https://anaconda.org/numba/llvmlite>`_ + which is linked against LLVM with assertions enabled. + +.. code-block:: text + + LV: Checking a loop in "<low-level symbol name>" from <function name> + LV: Loop hints: force=? width=0 unroll=0 + ... + LV: Vectorization is possible but not beneficial. + LV: Interleaving is not beneficial. + +Each function entry is separated by an empty line. The reason for rejecting +the vectorization is usually at the end of the entry. In the example above, +LLVM rejected the vectorization because doing so will not speed up the loop. +In this case, it can be due to the memory access pattern. For instance, the +array being looped over may not be in contiguous layout. + +When the memory access pattern is non-trivial such that it cannot determine the +access memory region, LLVM may reject with the following message: + +.. code-block:: text + + LV: Can't vectorize due to memory conflicts + +Another common reason is: + +.. code-block:: text + + LV: Not vectorizing: loop did not meet vectorization requirements. + +In this case, vectorization is rejected because the vectorized code may behave +differently. This is a case to try turning on ``fastmath=True`` to allow +fastmath instructions. + +Why are the ``typed`` containers slower when used from the interpreter?
+----------------------------------------------------------------------- + +The Numba ``typed`` containers found in ``numba.typed`` e.g. +``numba.typed.List`` store their data in an efficient form for access from JIT +compiled code. When these containers are used from the CPython interpreter, the +data involved has to be converted from/to the container format. This process is +relatively costly and as a result impacts performance. In JIT compiled code no +such penalty exists and so operations on the containers are much quicker and +often faster than the pure Python equivalent. + +Does Numba automatically parallelize code? +------------------------------------------ + +It can, in some cases: + +* Ufuncs and gufuncs with the ``target="parallel"`` option will run on multiple threads. +* The ``parallel=True`` option to ``@jit`` will attempt to optimize array + operations and run them in parallel. It also adds support for ``prange()`` to + explicitly parallelize a loop. + +You can also manually run computations on multiple threads yourself and use +the ``nogil=True`` option (see :ref:`releasing the GIL `). Numba +can also target parallel execution on GPU architectures using its CUDA and HSA +backends. + + +Can Numba speed up short-running functions? +------------------------------------------- + +Not significantly. New users sometimes expect to JIT-compile such +functions:: + + def f(x, y): + return x + y + +and get a significant speedup over the Python interpreter. But there isn't +much Numba can improve here: most of the time is probably spent in CPython's +function call mechanism, rather than the function itself. As a rule of +thumb, if a function takes less than 10 µs to execute: leave it. + +The exception is that you *should* JIT-compile that function if it is called +from another jitted function. + +There is a delay when JIT-compiling a complicated function, how can I improve it? 
+--------------------------------------------------------------------------------- + +Try to pass ``cache=True`` to the ``@jit`` decorator. It will keep the +compiled version on disk for later use. + +A more radical alternative is :ref:`ahead-of-time compilation `. + + +GPU Programming +=============== + +How do I work around the ``CUDA initialized before forking`` error? +------------------------------------------------------------------- + +On Linux, the ``multiprocessing`` module in the Python standard library +defaults to using the ``fork`` method for creating new processes. Because of +the way process forking duplicates state between the parent and child +processes, CUDA will not work correctly in the child process if the CUDA +runtime was initialized *prior* to the fork. Numba detects this and raises a +``CudaDriverError`` with the message ``CUDA initialized before forking``. + +One approach to avoid this error is to make all calls to ``numba.cuda`` +functions inside the child processes or after the process pool is created. +However, this is not always possible, as you might want to query the number of +available GPUs before starting the process pool. In Python 3, you can change +the process start method, as described in the `multiprocessing documentation +`_. +Switching from ``fork`` to ``spawn`` or ``forkserver`` will avoid the CUDA +initialization issue, although the child processes will not inherit any global +variables from their parent. + + +Integration with other utilities +================================ + +Can I "freeze" an application which uses Numba? +----------------------------------------------- + +If you're using PyInstaller or a similar utility to freeze an application, +you may encounter issues with llvmlite. llvmlite needs a non-Python DLL +for its working, but it won't be automatically detected by freezing utilities. 
+You have to inform the freezing utility of the DLL's location: it will +usually be named ``llvmlite/binding/libllvmlite.so`` or +``llvmlite/binding/llvmlite.dll``, depending on your system. + +I get errors when running a script twice under Spyder +----------------------------------------------------- + +When you run a script in a console under Spyder, Spyder first tries to +reload existing modules. This doesn't work well with Numba, and can +produce errors like ``TypeError: No matching definition for argument type(s)``. + +There is a fix in the Spyder preferences. Open the "Preferences" window, +select "Console", then "Advanced Settings", click the "Set UMR excluded +modules" button, and add ``numba`` inside the text box that pops up. + +To see the setting take effect, be sure to restart the IPython console or +kernel. + +.. _llvm-locale-bug: + +Why does Numba complain about the current locale? +------------------------------------------------- + +If you get an error message such as the following:: + + RuntimeError: Failed at nopython (nopython mode backend) + LLVM will produce incorrect floating-point code in the current locale + +it means you have hit a LLVM bug which causes incorrect handling of +floating-point constants. This is known to happen with certain third-party +libraries such as the Qt backend to matplotlib. + +To work around the bug, you need to force back the locale to its default +value, for example:: + + import locale + locale.setlocale(locale.LC_NUMERIC, 'C') + +How do I get Numba development builds? +-------------------------------------- + +Pre-release versions of Numba can be installed with conda:: + + $ conda install -c numba/label/dev numba + + +Miscellaneous +============= + +Where does the project name "Numba" come from? +---------------------------------------------- + +"Numba" is a combination of "NumPy" and "Mamba". Mambas are some of the fastest +snakes in the world, and Numba makes your Python code fast. 
+ +How do I reference/cite/acknowledge Numba in other work? +-------------------------------------------------------- +For academic use, the best option is to cite our ACM Proceedings: `Numba: a +LLVM-based Python JIT compiler. +`_ You can also find +`the sources on github `_, including +`a pre-print pdf +`_, in case +you don't have access to the ACM site but would like to read the paper. + +Other related papers +~~~~~~~~~~~~~~~~~~~~ +A paper describing ParallelAccelerator technology, that is activated when the +``parallel=True`` jit option is used, can be found `here +`_. + +How do I write a minimal working reproducer for a problem with Numba? +--------------------------------------------------------------------- + +A minimal working reproducer for Numba should include: + +1. The source code of the function(s) that reproduce the problem. +2. Some example data and a demonstration of calling the reproducing code with + that data. As Numba compiles based on type information, unless your problem + is numerical, it's fine to just provide dummy data of the right type, e.g. + use ``numpy.ones`` of the correct ``dtype``/size/shape for arrays. +3. Ideally put 1. and 2. into a script with all the correct imports. Make sure + your script actually executes and reproduces the problem before submitting + it! The target is to make it so that the script can just be copied directly + from the `issue tracker `_ and run by + someone else such that they can see the same problem as you are having. + +Having made a reproducer, now remove every part of the code that does not +contribute directly to reproducing the problem to create a "minimal" reproducer. +This means removing imports that aren't used, removing variables that aren't +used or have no effect, removing lines of code which have no effect, reducing +the complexity of expressions, and shrinking input data to the minimal amount +required to trigger the problem. 
+ +Doing the above really helps out the Numba issue triage process and will enable +a faster response to your problem! + +`Suggested further reading +`_ on +writing minimal working reproducers. diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/generated-jit.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/generated-jit.rst new file mode 100644 index 0000000000000000000000000000000000000000..975df08c5903c5a2bca008173d0913f0cb47a942 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/generated-jit.rst @@ -0,0 +1,71 @@ +.. _generated-jit: + +================================================ +Flexible specializations with ``@generated_jit`` +================================================ + + +While the :func:`~numba.jit` decorator is useful for many situations, +sometimes you want to write a function that has different implementations +depending on its input types. The :func:`~numba.generated_jit` decorator +allows the user to control the selection of a specialization at compile-time, +while fully retaining runtime execution speed of a JIT function. + + +Example +======= + +Suppose you want to write a function which returns whether a given value +is a "missing" value according to certain conventions. For the sake of +the example, let's adopt the following definition: + +- for floating-point arguments, a missing value is a ``NaN`` +- for Numpy datetime64 and timedelta64 arguments, a missing value is a ``NaT`` +- other types don't have the concept of a missing value. + +That compile-time logic is easily implemented using the +:func:`~numba.generated_jit` decorator:: + + import numpy as np + + from numba import generated_jit, types + + @generated_jit(nopython=True) + def is_missing(x): + """ + Return True if the value is missing, False otherwise. 
+ """ + if isinstance(x, types.Float): + return lambda x: np.isnan(x) + elif isinstance(x, (types.NPDatetime, types.NPTimedelta)): + # The corresponding Not-a-Time value + missing = x('NaT') + return lambda x: x == missing + else: + return lambda x: False + + +There are several things to note here: + +* The decorated function is called with the :ref:`Numba types ` + of the arguments, not their values. + +* The decorated function doesn't actually compute a result, it returns + a callable implementing the actual definition of the function for the + given types. + +* It is possible to pre-compute some data at compile-time (the ``missing`` + variable above) to have them reused inside the compiled implementation. + +* The function definitions use the same names for arguments as in the + decorated function, this is required to ensure passing arguments by + name works as expected. + + +Compilation options +=================== + +The :func:`~numba.generated_jit` decorator supports the same keyword-only +arguments as the :func:`~numba.jit` decorator, for example the ``nopython`` +and ``cache`` options. + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/index.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..74d2575fb3387ec02f2e3a52324d4581e8814c64 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/index.rst @@ -0,0 +1,26 @@ + +User Manual +=========== + +.. 
toctree:: + + 5minguide.rst + overview.rst + installing.rst + jit.rst + generated-jit.rst + vectorize.rst + jitclass.rst + cfunc.rst + pycc.rst + parallel.rst + stencil.rst + withobjmode.rst + jit-module.rst + performance-tips.rst + threading-layer.rst + cli.rst + troubleshoot.rst + faq.rst + examples.rst + talks.rst diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/installing.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/installing.rst new file mode 100644 index 0000000000000000000000000000000000000000..d06a91ee17046c1c3a8890f46a330d1e1fa525aa --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/installing.rst @@ -0,0 +1,373 @@ + +Installation +============ + +Compatibility +------------- + +Numba is compatible with Python 3.7--3.10, and Numpy versions 1.18 up to 1.23. + +Our supported platforms are: + +* Linux x86 (32-bit and 64-bit) +* Linux ppc64le (POWER8, POWER9) +* Windows 7 and later (32-bit and 64-bit) +* OS X 10.9 and later (64-bit and unofficial support on M1/Arm64) +* \*BSD (unofficial support only) +* NVIDIA GPUs of compute capability 5.3 and later + + * Compute capabilities 3.5 - 5.2 are supported, but deprecated. +* ARMv7 (32-bit little-endian, such as Raspberry Pi 2 and 3) +* ARMv8 (64-bit little-endian, such as the NVIDIA Jetson) + +:ref:`numba-parallel` is only available on 64-bit platforms. + +Installing using conda on x86/x86_64/POWER Platforms +---------------------------------------------------- + +The easiest way to install Numba and get updates is by using ``conda``, +a cross-platform package manager and software distribution maintained +by Anaconda, Inc. You can either use `Anaconda +<https://www.anaconda.com/download>`_ to get the full stack in one download, +or `Miniconda <https://conda.io/miniconda.html>`_ which will install +the minimum packages required for a conda environment.
+ +Once you have conda installed, just type:: + + $ conda install numba + +or:: + + $ conda update numba + +Note that Numba, like Anaconda, only supports PPC in 64-bit little-endian mode. + +To enable CUDA GPU support for Numba, install the latest `graphics drivers from +NVIDIA `_ for your platform. +(Note that the open source Nouveau drivers shipped by default with many Linux +distributions do not support CUDA.) Then install the ``cudatoolkit`` package:: + + $ conda install cudatoolkit + +You do not need to install the CUDA SDK from NVIDIA. + + +Installing using pip on x86/x86_64 Platforms +-------------------------------------------- + +Binary wheels for Windows, Mac, and Linux are also available from `PyPI +`_. You can install Numba using ``pip``:: + + $ pip install numba + +This will download all of the needed dependencies as well. You do not need to +have LLVM installed to use Numba (in fact, Numba will ignore all LLVM +versions installed on the system) as the required components are bundled into +the llvmlite wheel. + +To use CUDA with Numba installed by `pip`, you need to install the `CUDA SDK +`_ from NVIDIA. Please refer to +:ref:`cudatoolkit-lookup` for details. Numba can also detect CUDA libraries +installed system-wide on Linux. + + +.. _numba-install-armv7: + +Installing on Linux ARMv7 Platforms +----------------------------------- + +`Berryconda `_ is a +conda-based Python distribution for the Raspberry Pi. We are now uploading +packages to the ``numba`` channel on Anaconda Cloud for 32-bit little-endian, +ARMv7-based boards, which currently includes the Raspberry Pi 2 and 3, +but not the Pi 1 or Zero. These can be installed using conda from the +``numba`` channel:: + + $ conda install -c numba numba + +Berryconda and Numba may work on other Linux-based ARMv7 systems, but this has +not been tested. 
+ + +Installing on Linux ARMv8 (AArch64) Platforms +--------------------------------------------- + +We build and test conda packages on the `NVIDIA Jetson TX2 +<https://www.nvidia.com/en-us/autonomous-machines/embedded-systems-dev-kits-modules/>`_, +but they are likely to work for other AArch64 platforms. (Note that while the +Raspberry Pi CPU is 64-bit, Raspbian runs it in 32-bit mode, so look at +:ref:`numba-install-armv7` instead.) + +Conda-forge support for AArch64 is still quite experimental and packages are limited, +but it does work enough for Numba to build and pass tests. To set up the environment: + +* Install `miniforge <https://github.com/conda-forge/miniforge>`_. + This will create a minimal conda environment. + +* Then you can install Numba from the ``numba`` channel:: + + $ conda install -c numba numba + +On CUDA-enabled systems, like the Jetson, the CUDA toolkit should be +automatically detected in the environment. + +.. _numba-source-install-instructions: + +Installing from source +---------------------- + +Installing Numba from source is fairly straightforward (similar to other +Python packages), but installing `llvmlite +<https://github.com/numba/llvmlite>`_ can be quite challenging due to the need +for a special LLVM build. If you are building from source for the purposes of +Numba development, see :ref:`buildenv` for details on how to create a Numba +development environment with conda. + +If you are building Numba from source for other reasons, first follow the +`llvmlite installation guide <https://llvmlite.readthedocs.io/en/latest/admin-guide/install.html>`_. +Once that is completed, you can download the latest Numba source code from +`GitHub <https://github.com/numba/numba>`_:: + + $ git clone https://github.com/numba/numba.git + +Source archives of the latest release can also be found on +`PyPI <https://pypi.org/project/numba/>`_. In addition to ``llvmlite``, you will also need: + +* A C compiler compatible with your Python installation.
If you are using + Anaconda, you can use the following conda packages: + + * Linux ``x86``: ``gcc_linux-32`` and ``gxx_linux-32`` + * Linux ``x86_64``: ``gcc_linux-64`` and ``gxx_linux-64`` + * Linux ``POWER``: ``gcc_linux-ppc64le`` and ``gxx_linux-ppc64le`` + * Linux ``ARM``: no conda packages, use the system compiler + * Mac OSX: ``clang_osx-64`` and ``clangxx_osx-64`` or the system compiler at + ``/usr/bin/clang`` (Mojave onwards) + * Mac OSX (M1): ``clang_osx-arm64`` and ``clangxx_osx-arm64`` + * Windows: a version of Visual Studio appropriate for the Python version in + use + +* `NumPy `_ + +Then you can build and install Numba from the top level of the source tree:: + + $ python setup.py install + +.. _numba-source-install-env_vars: + +Build time environment variables and configuration of optional components +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Below are environment variables that are applicable to altering how Numba would +otherwise build by default along with information on configuration options. + +.. envvar:: NUMBA_DISABLE_OPENMP (default: not set) + + To disable compilation of the OpenMP threading backend set this environment + variable to a non-empty string when building. If not set (default): + + * For Linux and Windows it is necessary to provide OpenMP C headers and + runtime libraries compatible with the compiler tool chain mentioned above, + and for these to be accessible to the compiler via standard flags. + * For OSX the conda package ``llvm-openmp`` provides suitable C headers and + libraries. If the compilation requirements are not met the OpenMP threading + backend will not be compiled. + +.. envvar:: NUMBA_DISABLE_TBB (default: not set) + + To disable the compilation of the TBB threading backend set this environment + variable to a non-empty string when building. If not set (default) the TBB C + headers and libraries must be available at compile time. 
If building with + ``conda build`` this requirement can be met by installing the ``tbb-devel`` + package. If not building with ``conda build`` the requirement can be met via a + system installation of TBB or through the use of the ``TBBROOT`` environment + variable to provide the location of the TBB installation. For more + information about setting ``TBBROOT`` see the `Intel documentation `_. + +.. _numba-source-install-check: + +Dependency List +--------------- + +Numba has numerous required and optional dependencies which additionally may +vary with target operating system and hardware. The following lists them all +(as of July 2020). + +* Required build time: + + * ``setuptools`` + * ``numpy`` + * ``llvmlite`` + * Compiler toolchain mentioned above + +* Required run time: + + * ``setuptools`` + * ``numpy`` + * ``llvmlite`` + +* Optional build time: + + See :ref:`numba-source-install-env_vars` for more details about additional + options for the configuration and specification of these optional components. + + * ``llvm-openmp`` (OSX) - provides headers for compiling OpenMP support into + Numba's threading backend + * ``tbb-devel`` - provides TBB headers/libraries for compiling TBB support + into Numba's threading backend (2021 <= version < 2021.6 required). + * ``importlib_metadata`` (for Python versions < 3.9) + +* Optional run time: + + * ``scipy`` - provides cython bindings used in Numba's ``np.linalg.*`` + support + * ``tbb`` - provides the TBB runtime libraries used by Numba's TBB threading + backend (version >= 2021 required). + * ``jinja2`` - for "pretty" type annotation output (HTML) via the ``numba`` + CLI + * ``cffi`` - permits use of CFFI bindings in Numba compiled functions + * ``llvm-openmp`` - (OSX) provides OpenMP library support for Numba's OpenMP + threading backend. + * ``intel-openmp`` - (OSX) provides an alternative OpenMP library for use with + Numba's OpenMP threading backend.
+ * ``ipython`` - if in use, caching will use IPython's cache + directories/caching still works + * ``pyyaml`` - permits the use of a ``.numba_config.yaml`` + file for storing per project configuration options + * ``colorama`` - makes error message highlighting work + * ``icc_rt`` - (numba channel) allows Numba to use Intel SVML for extra + performance + * ``pygments`` - for "pretty" type annotation + * ``gdb`` as an executable on the ``$PATH`` - if you would like to use the gdb + support + * Compiler toolchain mentioned above, if you would like to use ``pycc`` for + Ahead-of-Time (AOT) compilation + * ``r2pipe`` - required for assembly CFG inspection. + * ``radare2`` as an executable on the ``$PATH`` - required for assembly CFG + inspection. `See here `_ for + information on obtaining and installing. + * ``graphviz`` - for some CFG inspection functionality. + * ``pickle5`` - provides Python 3.8 pickling features for faster pickling in + Python 3.7. + * ``typeguard`` - used by ``runtests.py`` for + :ref:`runtime type-checking `. + * ``cuda-python`` - The NVIDIA CUDA Python bindings. See :ref:`cuda-bindings`. + Numba requires Version 11.6 or greater. + +* To build the documentation: + + * ``sphinx`` + * ``pygments`` + * ``sphinx_rtd_theme`` + * ``numpydoc`` + * ``make`` as an executable on the ``$PATH`` + +.. _numba_support_info: + +Version support information +--------------------------- + +This is the canonical reference for information concerning which versions of +Numba's dependencies were tested and known to work against a given version of +Numba. Other versions of the dependencies (especially NumPy) may work reasonably +well but were not tested. The use of ``x`` in a version number indicates all +patch levels supported. The use of ``?`` as a version is due to missing +information. 
+ ++----------++--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| Numba | Release date | Python | NumPy | llvmlite | LLVM | TBB | ++===========+==============+===========================+============================+==============================+===================+=============================+ +| 0.57.x | TBC | 3.8.x <= version < 3.12 | 1.19 <= version < 1.24 | 0.40.x | 11.x | 2021.x | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.56.4 | 2022-11-03 | 3.7.x <= version < 3.11 | 1.18 <= version < 1.24 | 0.39.x | 11.x | 2021.x | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.56.3 | 2022-10-13 | 3.7.x <= version < 3.11 | 1.18 <= version < 1.24 | 0.39.x | 11.x | 2021.x | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.56.2 | 2022-09-01 | 3.7.x <= version < 3.11 | 1.18 <= version < 1.24 | 0.39.x | 11.x | 2021.x | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.56.0 | 2022-07-25 | 3.7.x <= version < 3.11 | 1.18 <= version < 1.23 | 0.39.x | 11.x | 2021.x | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.55.2 | 2022-05-25 | 3.7.x <= version < 3.11 | 1.18 <= version < 1.23 | 0.38.x | 11.x | 2021.x | 
++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.55.{0,1}| 2022-01-13 | 3.7.x <= version < 3.11 | 1.18 <= version < 1.22 | 0.38.x | 11.x | 2021.x | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.54.x | 2021-08-19 | 3.6.x <= version < 3.10 | 1.17 <= version < 1.21 | 0.37.x | 11.x | 2021.x | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.53.x | 2021-03-11 | 3.6.x <= version < 3.10 | 1.15 <= version < 1.21 | 0.36.x | 11.x | 2019.5 <= version < 2021.4 | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.52.x | 2020-11-30 | 3.6.x <= version < 3.9 | 1.15 <= version < 1.20 | 0.35.x | 10.x | 2019.5 <= version < 2020.3 | +| | | | | | (9.x for aarch64) | | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.51.x | 2020-08-12 | 3.6.x <= version < 3.9 | 1.15 <= version < 1.19 | 0.34.x | 10.x | 2019.5 <= version < 2020.0 | +| | | | | | (9.x for aarch64) | | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.50.x | 2020-06-10 | 3.6.x <= version < 3.9 | 1.15 <= version < 1.19 | 0.33.x | 9.x | 2019.5 <= version < 2020.0 | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.49.x | 2020-04-16 | 3.6.x <= version < 3.9 | 1.15 <= version < 
1.18 | 0.31.x <= version < 0.33.x | 9.x | 2019.5 <= version < 2020.0 | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.48.x | 2020-01-27 | 3.6.x <= version < 3.9 | 1.15 <= version < 1.18 | 0.31.x | 8.x | 2018.0.5 <= version < ? | +| | | | | | (7.x for ppc64le) | | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ +| 0.47.x | 2020-01-02 | 3.5.x <= version < 3.9; | 1.15 <= version < 1.18 | 0.30.x | 8.x | 2018.0.5 <= version < ? | +| | | version == 2.7.x | | | (7.x for ppc64le) | | ++-----------+--------------+---------------------------+----------------------------+------------------------------+-------------------+-----------------------------+ + +Checking your installation +-------------------------- + +You should be able to import Numba from the Python prompt:: + + $ python + Python 3.10.2 | packaged by conda-forge | (main, Jan 14 2022, 08:02:09) [GCC 9.4.0] on linux + Type "help", "copyright", "credits" or "license" for more information. + >>> import numba + >>> numba.__version__ + '0.55.1' + +You can also try executing the ``numba --sysinfo`` (or ``numba -s`` for short) +command to report information about your system capabilities. See :ref:`cli` for +further information. 
+ +:: + + $ numba -s + System info: + -------------------------------------------------------------------------------- + __Time Stamp__ + Report started (local time) : 2022-01-18 10:35:08.981319 + + __Hardware Information__ + Machine : x86_64 + CPU Name : skylake-avx512 + CPU Count : 12 + CPU Features : + 64bit adx aes avx avx2 avx512bw avx512cd avx512dq avx512f avx512vl bmi bmi2 + clflushopt clwb cmov cx16 cx8 f16c fma fsgsbase fxsr invpcid lzcnt mmx + movbe pclmul pku popcnt prfchw rdrnd rdseed rtm sahf sse sse2 sse3 sse4.1 + sse4.2 ssse3 xsave xsavec xsaveopt xsaves + + __OS Information__ + Platform Name : Linux-5.4.0-94-generic-x86_64-with-glibc2.31 + Platform Release : 5.4.0-94-generic + OS Name : Linux + OS Version : #106-Ubuntu SMP Thu Jan 6 23:58:14 UTC 2022 + + __Python Information__ + Python Compiler : GCC 9.4.0 + Python Implementation : CPython + Python Version : 3.10.2 + Python Locale : en_GB.UTF-8 + + __LLVM information__ + LLVM Version : 11.1.0 + + __CUDA Information__ + Found 1 CUDA devices + id 0 b'Quadro RTX 8000' [SUPPORTED] + Compute Capability: 7.5 + PCI Device ID: 0 + PCI Bus ID: 21 + UUID: GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643 + Watchdog: Enabled + FP32/FP64 Performance Ratio: 32 + +(output truncated due to length) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/jit-module.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/jit-module.rst new file mode 100644 index 0000000000000000000000000000000000000000..cf2315b6c3b70b5cc2bf30d38e9bf14048fb57d1 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/jit-module.rst @@ -0,0 +1,103 @@ +.. _jit-module: + +============================================ +Automatic module jitting with ``jit_module`` +============================================ + +A common usage pattern is to have an entire module containing user-defined +functions that all need to be jitted. 
One option to accomplish this is to +manually apply the ``@jit`` decorator to each function definition. This approach +works and is great in many cases. However, for large modules with many functions, +manually ``jit``-wrapping each function definition can be tedious. For these +situations, Numba provides another option, the ``jit_module`` function, to +automatically replace functions declared in a module with their ``jit``-wrapped +equivalents. + +It's important to note the conditions under which ``jit_module`` will *not* +impact a function: + +1. Functions which have already been wrapped with a Numba decorator (e.g. + ``jit``, ``vectorize``, ``cfunc``, etc.) are not impacted by ``jit_module``. + +2. Functions which are declared outside the module from which ``jit_module`` + is called are not automatically ``jit``-wrapped. + +3. Function declarations which occur logically after calling ``jit_module`` + are not impacted. + +All other functions in a module will have the ``@jit`` decorator automatically +applied to them. See the following section for an example use case. + +.. note:: This feature is for use by module authors. ``jit_module`` should not + be called outside the context of a module containing functions to be jitted. + + +Example usage +============= + +Let's assume we have a Python module we've created, ``mymodule.py`` (shown +below), which contains several functions. Some of these functions are defined +in ``mymodule.py`` while others are imported from other modules. We wish to have +all the functions which are defined in ``mymodule.py`` jitted using +``jit_module``. + +.. _jit-module-usage: + +.. 
code-block:: python + + # mymodule.py + + from numba import jit, jit_module + + def inc(x): + return x + 1 + + def add(x, y): + return x + y + + import numpy as np + # Use NumPy's mean function + mean = np.mean + + @jit(nogil=True) + def mul(a, b): + return a * b + + jit_module(nopython=True, error_model="numpy") + + def div(a, b): + return a / b + +There are several things to note in the above example: + +- Both the ``inc`` and ``add`` functions will be replaced with their + ``jit``-wrapped equivalents with :ref:`compilation options ` + ``nopython=True`` and ``error_model="numpy"``. + +- The ``mean`` function, because it's defined *outside* of ``mymodule.py`` in + NumPy, will not be modified. + +- ``mul`` will not be modified because it has been manually decorated with + ``jit``. + +- ``div`` will not be automatically ``jit``-wrapped because it is declared + after ``jit_module`` is called. + +When the above module is imported, we have: + +.. code-block:: python + + >>> import mymodule + >>> mymodule.inc + CPUDispatcher() + >>> mymodule.mean + + + +API +=== +.. warning:: This feature is experimental. The supported features may change + with or without notice. + +.. autofunction:: numba.jit_module + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/jit.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/jit.rst new file mode 100644 index 0000000000000000000000000000000000000000..36b4ba98552df5adc01c1a2df55fa1f70dd63cc5 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/jit.rst @@ -0,0 +1,201 @@ +.. _jit: + +=================================== +Compiling Python code with ``@jit`` +=================================== + +Numba provides several utilities for code generation, but its central +feature is the :func:`numba.jit` decorator. Using this decorator, you can mark +a function for optimization by Numba's JIT compiler. Various invocation +modes trigger differing compilation options and behaviours. 
+ + +Basic usage +=========== + +.. _jit-lazy: + +Lazy compilation +---------------- + +The recommended way to use the ``@jit`` decorator is to let Numba decide +when and how to optimize:: + + from numba import jit + + @jit + def f(x, y): + # A somewhat trivial example + return x + y + +In this mode, compilation will be deferred until the first function +execution. Numba will infer the argument types at call time, and generate +optimized code based on this information. Numba will also be able to +compile separate specializations depending on the input types. For example, +calling the ``f()`` function above with integer or complex numbers will +generate different code paths:: + + >>> f(1, 2) + 3 + >>> f(1j, 2) + (2+1j) + +Eager compilation +----------------- + +You can also tell Numba the function signature you are expecting. The +function ``f()`` would now look like:: + + from numba import jit, int32 + + @jit(int32(int32, int32)) + def f(x, y): + # A somewhat trivial example + return x + y + +``int32(int32, int32)`` is the function's signature. In this case, the +corresponding specialization will be compiled by the ``@jit`` decorator, +and no other specialization will be allowed. This is useful if you want +fine-grained control over types chosen by the compiler (for example, +to use single-precision floats). + +If you omit the return type, e.g. by writing ``(int32, int32)`` instead of +``int32(int32, int32)``, Numba will try to infer it for you. Function +signatures can also be strings, and you can pass several of them as a list; +see the :func:`numba.jit` documentation for more details. + +Of course, the compiled function gives the expected results:: + + >>> f(1,2) + 3 + +and if we specified ``int32`` as return type, the higher-order bits get +discarded:: + + >>> f(2**31, 2**31 + 1) + 1 + + +Calling and inlining other functions +==================================== + +Numba-compiled functions can call other compiled functions. 
The function +calls may even be inlined in the native code, depending on optimizer +heuristics. For example:: + + @jit + def square(x): + return x ** 2 + + @jit + def hypot(x, y): + return math.sqrt(square(x) + square(y)) + +The ``@jit`` decorator *must* be added to any such library function, +otherwise Numba may generate much slower code. + + +Signature specifications +======================== + +Explicit ``@jit`` signatures can use a number of types. Here are some +common ones: + +* ``void`` is the return type of functions returning nothing (which + actually return :const:`None` when called from Python) +* ``intp`` and ``uintp`` are pointer-sized integers (signed and unsigned, + respectively) +* ``intc`` and ``uintc`` are equivalent to C ``int`` and ``unsigned int`` + integer types +* ``int8``, ``uint8``, ``int16``, ``uint16``, ``int32``, ``uint32``, + ``int64``, ``uint64`` are fixed-width integers of the corresponding bit + width (signed and unsigned) +* ``float32`` and ``float64`` are single- and double-precision floating-point + numbers, respectively +* ``complex64`` and ``complex128`` are single- and double-precision complex + numbers, respectively +* array types can be specified by indexing any numeric type, e.g. ``float32[:]`` + for a one-dimensional single-precision array or ``int8[:,:]`` for a + two-dimensional array of 8-bit integers. + + +.. _jit-options: + +Compilation options +=================== + +A number of keyword-only arguments can be passed to the ``@jit`` decorator. + +.. _jit-nopython: + +``nopython`` +------------ + +Numba has two compilation modes: :term:`nopython mode` and +:term:`object mode`. The former produces much faster code, but has +limitations that can force Numba to fall back to the latter. To prevent +Numba from falling back, and instead raise an error, pass ``nopython=True``. + +:: + + @jit(nopython=True) + def f(x, y): + return x + y + +.. seealso:: :ref:`numba-troubleshooting` + +.. 
_jit-nogil: + +``nogil`` +--------- + +Whenever Numba optimizes Python code to native code that only works on +native types and variables (rather than Python objects), it is not necessary +anymore to hold Python's :py:term:`global interpreter lock` (GIL). +Numba will release the GIL when entering such a compiled function if you +passed ``nogil=True``. + +:: + + @jit(nogil=True) + def f(x, y): + return x + y + +Code running with the GIL released runs concurrently with other +threads executing Python or Numba code (either the same compiled function, +or another one), allowing you to take advantage of multi-core systems. +This will not be possible if the function is compiled in :term:`object mode`. + +When using ``nogil=True``, you'll have to be wary of the usual pitfalls +of multi-threaded programming (consistency, synchronization, race conditions, +etc.). + +.. _jit-cache: + +``cache`` +--------- + +To avoid compilation times each time you invoke a Python program, +you can instruct Numba to write the result of function compilation into +a file-based cache. This is done by passing ``cache=True``:: + + @jit(cache=True) + def f(x, y): + return x + y + +.. _parallel_jit_option: + +``parallel`` +------------ + +Enables automatic parallelization (and related optimizations) for those +operations in the function known to have parallel semantics. For a list of +supported operations, see :ref:`numba-parallel`. This feature is enabled by +passing ``parallel=True`` and must be used in conjunction with +``nopython=True``:: + + @jit(nopython=True, parallel=True) + def f(x, y): + return x + y + +.. 
seealso:: :ref:`numba-parallel` diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/jitclass.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/jitclass.rst new file mode 100644 index 0000000000000000000000000000000000000000..9000bf43678e1aab833dde69e6a79419a6a5caa1 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/jitclass.rst @@ -0,0 +1,270 @@ +.. _jitclass: + +=========================================== +Compiling Python classes with ``@jitclass`` +=========================================== + +.. note:: + + This is a early version of jitclass support. Not all compiling features are + exposed or implemented, yet. + + +Numba supports code generation for classes via the +:func:`numba.experimental.jitclass` decorator. A class can be marked for +optimization using this decorator along with a specification of the types of +each field. We call the resulting class object a *jitclass*. All methods of a +jitclass are compiled into nopython functions. The data of a jitclass instance +is allocated on the heap as a C-compatible structure so that any compiled +functions can have direct access to the underlying data, bypassing the +interpreter. + + +Basic usage +=========== + +Here's an example of a jitclass: + +.. literalinclude:: ../../../numba/tests/doc_examples/test_jitclass.py + :language: python + :start-after: magictoken.ex_jitclass.begin + :end-before: magictoken.ex_jitclass.end + :dedent: 8 + +In the above example, a ``spec`` is provided as a list of 2-tuples. The tuples +contain the name of the field and the Numba type of the field. Alternatively, +user can use a dictionary (an ``OrderedDict`` preferably for stable field +ordering), which maps field names to types. + +The definition of the class requires at least a ``__init__`` method for +initializing each defined fields. Uninitialized fields contains garbage data. +Methods and properties (getters and setters only) can be defined. 
They will be +automatically compiled. + + +Inferred class member types from type annotations with ``as_numba_type`` +======================================================================== + +Fields of a ``jitclass`` can also be inferred from Python type annotations. + +.. literalinclude:: ../../../numba/tests/doc_examples/test_jitclass.py + :language: python + :start-after: magictoken.ex_jitclass_type_hints.begin + :end-before: magictoken.ex_jitclass_type_hints.end + :dedent: 8 + +Any type annotations on the class will be used to extend the spec if that field +is not already present. The Numba type corresponding to the given Python type +is inferred using ``as_numba_type``. For example, if we have the class + +.. code-block:: python + + @jitclass([("w", int32), ("y", float64[:])]) + class Foo: + w: int + x: float + y: np.ndarray + z: SomeOtherType + + def __init__(self, w: int, x: float, y: np.ndarray, z: SomeOtherType): + ... + +then the full spec used for ``Foo`` will be: + +* ``"w": int32`` (specified in the ``spec``) +* ``"x": float64`` (added from type annotation) +* ``"y": array(float64, 1d, A)`` (specified in the ``spec``) +* ``"z": numba.as_numba_type(SomeOtherType)`` (added from type annotation) + +Here ``SomeOtherType`` could be any supported Python type (e.g. +``bool``, ``typing.Dict[int, typing.Tuple[float, float]]``, or another +``jitclass``). + +Note that only type annotations on the class will be used to infer spec +elements. Method type annotations (e.g. those of ``__init__`` above) are +ignored. + +Numba requires knowing the dtype and rank of NumPy arrays, which cannot +currently be expressed with type annotations. Because of this, NumPy arrays need +to be included in the ``spec`` explicitly. 
+ + +Specifying ``numba.typed`` containers as class members explicitly +================================================================= + +The following patterns demonstrate how to specify a ``numba.typed.Dict`` or +``numba.typed.List`` explicitly as part of the ``spec`` passed to ``jitclass``. + +First, using explicit Numba types and explicit construction. + +.. code-block:: python + + from numba import jitclass, types, typed + + # key and value types + kv_ty = (types.int64, types.unicode_type) + + # A container class with: + # * member 'd' holding a typed dictionary of int64 -> unicode string (kv_ty) + # * member 'l' holding a typed list of float64 + @jitclass([('d', types.DictType(*kv_ty)), + ('l', types.ListType(types.float64))]) + class ContainerHolder(object): + def __init__(self): + # initialize the containers + self.d = typed.Dict.empty(*kv_ty) + self.l = typed.List.empty_list(types.float64) + + container = ContainerHolder() + container.d[1] = "apple" + container.d[2] = "orange" + container.l.append(123.) + container.l.append(456.) + print(container.d) # {1: apple, 2: orange} + print(container.l) # [123.0, 456.0] + +Another useful pattern is to use the ``numba.typed`` container attribute +``_numba_type_`` to find the type of a container, this can be accessed directly +from an instance of the container in the Python interpreter. The same +information can be obtained by calling :func:`numba.typeof` on the instance. For +example: + +.. code-block:: python + + from numba import jitclass, typed, typeof + + d = typed.Dict() + d[1] = "apple" + d[2] = "orange" + l = typed.List() + l.append(123.) + l.append(456.) 
+ + + @jitclass([('d', typeof(d)), ('l', typeof(l))]) + class ContainerInstHolder(object): + def __init__(self, dict_inst, list_inst): + self.d = dict_inst + self.l = list_inst + + container = ContainerInstHolder(d, l) + print(container.d) # {1: apple, 2: orange} + print(container.l) # [123.0, 456.0] + +It is worth noting that the instance of the container in a ``jitclass`` must be +initialized before use, for example, this will cause an invalid memory access +as ``self.d`` is written to without ``d`` being initialized as a ``type.Dict`` +instance of the type specified. + +.. code-block:: python + + from numba import jitclass, types + + dict_ty = types.DictType(types.int64, types.unicode_type) + + @jitclass([('d', dict_ty)]) + class NotInitialisingContainer(object): + def __init__(self): + self.d[10] = "apple" # this is invalid, `d` is not initialized + + NotInitialisingContainer() # segmentation fault/memory access violation + + +Support operations +================== + +The following operations of jitclasses work in both the interpreter and Numba +compiled functions: + +* calling the jitclass class object to construct a new instance + (e.g. ``mybag = Bag(123)``); +* read/write access to attributes and properties (e.g. ``mybag.value``); +* calling methods (e.g. ``mybag.increment(3)``); +* calling static methods as instance attributes (e.g. ``mybag.add(1, 1)``); +* calling static methods as class attributes (e.g. ``Bag.add(1, 2)``); +* using select dunder methods (e.g. ``__add__`` with ``mybag + otherbag``); + +Using jitclasses in Numba compiled function is more efficient. +Short methods can be inlined (at the discretion of LLVM inliner). +Attributes access are simply reading from a C structure. +Using jitclasses from the interpreter has the same overhead of calling any +Numba compiled function from the interpreter. Arguments and return values +must be unboxed or boxed between Python objects and native representation. 
+Values encapsulated by a jitclass does not get boxed into Python object when +the jitclass instance is handed to the interpreter. It is during attribute +access to the field values that they are boxed. +Calling static methods as class attributes is only supported outside of the +class definition (i.e. code cannot call ``Bag.add()`` from within another method +of ``Bag``). + + +Supported dunder methods +------------------------ + +The following dunder methods may be defined for jitclasses: + +* ``__abs__`` +* ``__bool__`` +* ``__complex__`` +* ``__contains__`` +* ``__float__`` +* ``__getitem__`` +* ``__hash__`` +* ``__index__`` +* ``__int__`` +* ``__len__`` +* ``__setitem__`` +* ``__str__`` +* ``__eq__`` +* ``__ne__`` +* ``__ge__`` +* ``__gt__`` +* ``__le__`` +* ``__lt__`` +* ``__add__`` +* ``__floordiv__`` +* ``__lshift__`` +* ``__mod__`` +* ``__mul__`` +* ``__neg__`` +* ``__pos__`` +* ``__pow__`` +* ``__rshift__`` +* ``__sub__`` +* ``__truediv__`` +* ``__and__`` +* ``__or__`` +* ``__xor__`` +* ``__iadd__`` +* ``__ifloordiv__`` +* ``__ilshift__`` +* ``__imod__`` +* ``__imul__`` +* ``__ipow__`` +* ``__irshift__`` +* ``__isub__`` +* ``__itruediv__`` +* ``__iand__`` +* ``__ior__`` +* ``__ixor__`` + +Refer to the `Python Data Model documentation +`_ for descriptions of +these methods. + + +Limitations +=========== + +* A jitclass class object is treated as a function (the constructor) inside + a Numba compiled function. +* ``isinstance()`` only works in the interpreter. +* Manipulating jitclass instances in the interpreter is not optimized, yet. +* Support for jitclasses are available on CPU only. + (Note: Support for GPU devices is planned for a future release.) + + +The decorator: ``@jitclass`` +============================ + +.. 
autofunction:: numba.experimental.jitclass diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/overview.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/overview.rst new file mode 100644 index 0000000000000000000000000000000000000000..9b11b5a49bebfddef39b704a5f338921d81c89d9 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/overview.rst @@ -0,0 +1,34 @@ + +Overview +======== + +Numba is a compiler for Python array and numerical functions that gives +you the power to speed up your applications with high performance +functions written directly in Python. + +Numba generates optimized machine code from pure Python code using +the `LLVM compiler infrastructure `_. With a few simple +annotations, array-oriented and math-heavy Python code can be +just-in-time optimized to performance similar as C, C++ and Fortran, without +having to switch languages or Python interpreters. + +Numba's main features are: + +* :ref:`on-the-fly code generation ` (at import time or runtime, at the + user's preference) +* native code generation for the CPU (default) and + :doc:`GPU hardware <../cuda/index>` +* integration with the Python scientific software stack (thanks to Numpy) + +Here is how a Numba-optimized function, taking a Numpy array as argument, +might look like:: + + @numba.jit + def sum2d(arr): + M, N = arr.shape + result = 0.0 + for i in range(M): + for j in range(N): + result += arr[i,j] + return result + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/parallel.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/parallel.rst new file mode 100644 index 0000000000000000000000000000000000000000..3f09a0bc9c7f666127ecbbbb137bd8e814d1a28d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/parallel.rst @@ -0,0 +1,719 @@ +.. Copyright (c) 2017 Intel Corporation + SPDX-License-Identifier: BSD-2-Clause + +.. 
_numba-parallel: + +======================================= +Automatic parallelization with ``@jit`` +======================================= + +Setting the :ref:`parallel_jit_option` option for :func:`~numba.jit` enables +a Numba transformation pass that attempts to automatically parallelize and +perform other optimizations on (part of) a function. At the moment, this +feature only works on CPUs. + +Some operations inside a user defined function, e.g. adding a scalar value to +an array, are known to have parallel semantics. A user program may contain +many such operations and while each operation could be parallelized +individually, such an approach often has lackluster performance due to poor +cache behavior. Instead, with auto-parallelization, Numba attempts to +identify such operations in a user program, and fuse adjacent ones together, +to form one or more kernels that are automatically run in parallel. +The process is fully automated without modifications to the user program, +which is in contrast to Numba's :func:`~numba.vectorize` or +:func:`~numba.guvectorize` mechanism, where manual effort is required +to create parallel kernels. + +.. _numba-parallel-supported: + +Supported Operations +==================== + +In this section, we give a list of all the array operations that have +parallel semantics and for which we attempt to parallelize. + +#. All numba array operations that are supported by :ref:`case-study-array-expressions`, + which include common arithmetic functions between Numpy arrays, and between + arrays and scalars, as well as Numpy ufuncs. They are often called + `element-wise` or `point-wise` array operations: + + * unary operators: ``+`` ``-`` ``~`` + * binary operators: ``+`` ``-`` ``*`` ``/`` ``/?`` ``%`` ``|`` ``>>`` ``^`` ``<<`` ``&`` ``**`` ``//`` + * comparison operators: ``==`` ``!=`` ``<`` ``<=`` ``>`` ``>=`` + * :ref:`Numpy ufuncs ` that are supported in :term:`nopython mode`. 
+ * User defined :class:`~numba.DUFunc` through :func:`~numba.vectorize`. + +#. Numpy reduction functions ``sum``, ``prod``, ``min``, ``max``, ``argmin``, + and ``argmax``. Also, array math functions ``mean``, ``var``, and ``std``. + +#. Numpy array creation functions ``zeros``, ``ones``, ``arange``, ``linspace``, + and several random functions (rand, randn, ranf, random_sample, sample, + random, standard_normal, chisquare, weibull, power, geometric, exponential, + poisson, rayleigh, normal, uniform, beta, binomial, f, gamma, lognormal, + laplace, randint, triangular). + +#. Numpy ``dot`` function between a matrix and a vector, or two vectors. + In all other cases, Numba's default implementation is used. + +#. Multi-dimensional arrays are also supported for the above operations + when operands have matching dimension and size. The full semantics of + Numpy broadcast between arrays with mixed dimensionality or size is + not supported, nor is the reduction across a selected dimension. + +#. Array assignment in which the target is an array selection using a slice + or a boolean array, and the value being assigned is either a scalar or + another selection where the slice range or bitarray are inferred to be + compatible. + +#. The ``reduce`` operator of ``functools`` is supported for specifying parallel + reductions on 1D Numpy arrays but the initial value argument is mandatory. + +.. _numba-prange: + +Explicit Parallel Loops +======================== + +Another feature of the code transformation pass (when ``parallel=True``) is +support for explicit parallel loops. One can use Numba's ``prange`` instead of +``range`` to specify that a loop can be parallelized. The user is required to +make sure that the loop does not have cross iteration dependencies except for +supported reductions. + +A reduction is inferred automatically if a variable is updated by a supported binary +function/operator using its previous value in the loop body. 
The following +functions/operators are supported: ``+=``, ``+``, ``-=``, ``-``, ``*=``, +``*``, ``/=``, ``/``, ``max()``, ``min()``. +The initial value of the reduction is inferred automatically for the +supported operators (i.e., not the ``max`` and ``min`` functions). +Note that the ``//=`` operator is not supported because +in the general case the result depends on the order in which the divisors are +applied. However, if all divisors are integers then the programmer may be +able to rewrite the ``//=`` reduction as a ``*=`` reduction followed by +a single floor division after the parallel region where the divisor is the +accumulated product. +For the ``max`` and ``min`` functions, the reduction variable should hold the identity +value right before entering the ``prange`` loop. Reductions in this manner +are supported for scalars and for arrays of arbitrary dimensions. + +The example below demonstrates a parallel loop with a +reduction (``A`` is a one-dimensional Numpy array):: + + from numba import njit, prange + + @njit(parallel=True) + def prange_test(A): + s = 0 + # Without "parallel=True" in the jit-decorator + # the prange statement is equivalent to range + for i in prange(A.shape[0]): + s += A[i] + return s + +The following example demonstrates a product reduction on a two-dimensional array:: + + from numba import njit, prange + import numpy as np + + @njit(parallel=True) + def two_d_array_reduction_prod(n): + shp = (13, 17) + result1 = 2 * np.ones(shp, np.int_) + tmp = 2 * np.ones_like(result1) + + for i in prange(n): + result1 *= tmp + + return result1 + +.. note:: When using Python's ``range`` to induce a loop, Numba types the + induction variable as a signed integer. This is also the case for + Numba's ``prange`` when ``parallel=False``. However, for + ``parallel=True``, if the range is identifiable as strictly positive, + the type of the induction variable will be ``uint64``. 
The impact of + a ``uint64`` induction variable is often most noticeable when + undertaking operations involving it and a signed integer.
+In this section, we give an example of how this feature helps +parallelize Logistic Regression:: + + @numba.jit(nopython=True, parallel=True) + def logistic_regression(Y, X, w, iterations): + for i in range(iterations): + w -= np.dot(((1.0 / (1.0 + np.exp(-Y * np.dot(X, w))) - 1.0) * Y), X) + return w + +We will not discuss details of the algorithm, but instead focus on how +this program behaves with auto-parallelization: + +1. Input ``Y`` is a vector of size ``N``, ``X`` is an ``N x D`` matrix, + and ``w`` is a vector of size ``D``. + +2. The function body is an iterative loop that updates variable ``w``. + The loop body consists of a sequence of vector and matrix operations. + +3. The inner ``dot`` operation produces a vector of size ``N``, followed by a + sequence of arithmetic operations either between a scalar and vector of + size ``N``, or two vectors both of size ``N``. + +4. The outer ``dot`` produces a vector of size ``D``, followed by an inplace + array subtraction on variable ``w``. + +5. With auto-parallelization, all operations that produce arrays of size + ``N`` are fused together to become a single parallel kernel. This includes + the inner ``dot`` operation and all point-wise array operations following it. + +6. The outer ``dot`` operation produces a result array of a different dimension, + and is not fused with the above kernel. + +Here, the only thing required to take advantage of parallel hardware is to set +the :ref:`parallel_jit_option` option for :func:`~numba.jit`, with no +modifications to the ``logistic_regression`` function itself. If we were to +give an equivalent parallel implementation using :func:`~numba.guvectorize`, +it would require a pervasive change that rewrites the code to extract kernel +computation that can be parallelized, which would be both tedious and challenging. + +Unsupported Operations +====================== + +This section contains a non-exhaustive list of commonly encountered but +currently unsupported features: + +#. 
**Mutating a list is not threadsafe** + + Concurrent write operations on container types (i.e. lists, sets and + dictionaries) in a ``prange`` parallel region are not threadsafe e.g.:: + + @njit(parallel=True) + def invalid(): + z = [] + for i in prange(10000): + z.append(i) + return z + + It is highly likely that the above will result in corruption or an access + violation as containers require thread-safety under mutation but this feature + is not implemented. + +#. **Induction variables are not associated with thread ID** + + The use of the induction variable induced by a ``prange`` based loop in + conjunction with ``get_num_threads`` as a method of ensuring safe writes into + a pre-sized container is not valid e.g.:: + + @njit(parallel=True) + def invalid(): + n = get_num_threads() + z = [0 for _ in range(n)] + for i in prange(100): + z[i % n] += i + return z + + The above can on occasion appear to work, but it does so by luck. There's no + guarantee about which indexes are assigned to which executing threads or the + order in which the loop iterations execute. + +.. _numba-parallel-diagnostics: + +Diagnostics +=========== + +.. note:: At present not all parallel transforms and functions can be tracked + through the code generation process. Occasionally diagnostics about + some loops or transforms may be missing. + +The :ref:`parallel_jit_option` option for :func:`~numba.jit` can produce +diagnostic information about the transforms undertaken in automatically +parallelizing the decorated code. This information can be accessed in two ways, +the first is by setting the environment variable +:envvar:`NUMBA_PARALLEL_DIAGNOSTICS`, the second is by calling +:meth:`~Dispatcher.parallel_diagnostics`, both methods give the same information +and print to ``STDOUT``. The level of verbosity in the diagnostic information is +controlled by an integer argument of value between 1 and 4 inclusive, 1 being +the least verbose and 4 the most. 
For example:: + + @njit(parallel=True) + def test(x): + n = x.shape[0] + a = np.sin(x) + b = np.cos(a * a) + acc = 0 + for i in prange(n - 2): + for j in prange(n - 1): + acc += b[i] + b[j + 1] + return acc + + test(np.arange(10)) + + test.parallel_diagnostics(level=4) + +produces:: + + ================================================================================ + ======= Parallel Accelerator Optimizing: Function test, example.py (4) ======= + ================================================================================ + + + Parallel loop listing for Function test, example.py (4) + --------------------------------------|loop #ID + @njit(parallel=True) | + def test(x): | + n = x.shape[0] | + a = np.sin(x)---------------------| #0 + b = np.cos(a * a)-----------------| #1 + acc = 0 | + for i in prange(n - 2):-----------| #3 + for j in prange(n - 1):-------| #2 + acc += b[i] + b[j + 1] | + return acc | + --------------------------------- Fusing loops --------------------------------- + Attempting fusion of parallel loops (combines loops with similar properties)... + Trying to fuse loops #0 and #1: + - fusion succeeded: parallel for-loop #1 is fused into for-loop #0. + Trying to fuse loops #0 and #3: + - fusion failed: loop dimension mismatched in axis 0. slice(0, x_size0.1, 1) + != slice(0, $40.4, 1) + ----------------------------- Before Optimization ------------------------------ + Parallel region 0: + +--0 (parallel) + +--1 (parallel) + + + Parallel region 1: + +--3 (parallel) + +--2 (parallel) + + + -------------------------------------------------------------------------------- + ------------------------------ After Optimization ------------------------------ + Parallel region 0: + +--0 (parallel, fused with loop(s): 1) + + + Parallel region 1: + +--3 (parallel) + +--2 (serial) + + + + Parallel region 0 (loop #0) had 1 loop(s) fused. + + Parallel region 1 (loop #3) had 0 loop(s) fused and 1 loop(s) serialized as part + of the larger parallel loop (#3). 
+ -------------------------------------------------------------------------------- + -------------------------------------------------------------------------------- + + ---------------------------Loop invariant code motion--------------------------- + + Instruction hoisting: + loop #0: + Failed to hoist the following: + dependency: $arg_out_var.10 = getitem(value=x, index=$parfor__index_5.99) + dependency: $0.6.11 = getattr(value=$0.5, attr=sin) + dependency: $expr_out_var.9 = call $0.6.11($arg_out_var.10, func=$0.6.11, args=[Var($arg_out_var.10, example.py (7))], kws=(), vararg=None) + dependency: $arg_out_var.17 = $expr_out_var.9 * $expr_out_var.9 + dependency: $0.10.20 = getattr(value=$0.9, attr=cos) + dependency: $expr_out_var.16 = call $0.10.20($arg_out_var.17, func=$0.10.20, args=[Var($arg_out_var.17, example.py (8))], kws=(), vararg=None) + loop #3: + Has the following hoisted: + $const58.3 = const(int, 1) + $58.4 = _n_23 - $const58.3 + -------------------------------------------------------------------------------- + + + +To aid users unfamiliar with the transforms undertaken when the +:ref:`parallel_jit_option` option is used, and to assist in the understanding of +the subsequent sections, the following definitions are provided: + +* Loop fusion + `Loop fusion `_ is a + technique whereby loops with equivalent bounds may be combined under certain + conditions to produce a loop with a larger body (aiming to improve data + locality). + +* Loop serialization + Loop serialization occurs when any number of ``prange`` driven loops are + present inside another ``prange`` driven loop. In this case the outermost + of all the ``prange`` loops executes in parallel and any inner ``prange`` + loops (nested or otherwise) are treated as standard ``range`` based loops. + Essentially, nested parallelism does not occur. 
+ +* Loop invariant code motion + `Loop invariant code motion + <https://en.wikipedia.org/wiki/Loop-invariant_code_motion>`_ is an + optimization technique that analyses a loop to look for statements that can + be moved outside the loop body without changing the result of executing the + loop; these statements are then "hoisted" out of the loop to save repeated + computation. + +* Allocation hoisting + Allocation hoisting is a specialized case of loop invariant code motion that + is possible due to the design of some common NumPy allocation methods. + Explanation of this technique is best driven by an example: + + .. code-block:: python + + @njit(parallel=True) + def test(n): + for i in prange(n): + temp = np.zeros((50, 50)) # <--- Allocate a temporary array with np.zeros() + for j in range(50): + temp[j, j] = i + + # ...do something with temp + + internally, this is transformed to approximately the following: + + .. code-block:: python + + @njit(parallel=True) + def test(n): + for i in prange(n): + temp = np.empty((50, 50)) # <--- np.zeros() is rewritten as np.empty() + temp[:] = 0 # <--- and then a zero initialisation + for j in range(50): + temp[j, j] = i + + # ...do something with temp + + then after hoisting: + + .. code-block:: python + + @njit(parallel=True) + def test(n): + temp = np.empty((50, 50)) # <--- allocation is hoisted as a loop invariant as `np.empty` is considered pure + for i in prange(n): + temp[:] = 0 # <--- this remains as assignment is a side effect + for j in range(50): + temp[j, j] = i + + # ...do something with temp + + it can be seen that the ``np.zeros`` allocation is split into an allocation + and an assignment, and then the allocation is hoisted out of the loop in + ``i``, thus producing more efficient code as the allocation only occurs + once. + +The parallel diagnostics report sections +---------------------------------------- + +The report is split into the following sections: + +#. 
Code annotation + This is the first section and contains the source code of the decorated + function with loops that have parallel semantics identified and enumerated. + The ``loop #ID`` column on the right of the source code lines up with + identified parallel loops. From the example, ``#0`` is ``np.sin``, ``#1`` + is ``np.cos`` and ``#2`` and ``#3`` are ``prange()``: + + .. code-block:: python + + Parallel loop listing for Function test, example.py (4) + --------------------------------------|loop #ID + @njit(parallel=True) | + def test(x): | + n = x.shape[0] | + a = np.sin(x)---------------------| #0 + b = np.cos(a * a)-----------------| #1 + acc = 0 | + for i in prange(n - 2):-----------| #3 + for j in prange(n - 1):-------| #2 + acc += b[i] + b[j + 1] | + return acc | + + It is worth noting that the loop IDs are enumerated in the order they are + discovered which is not necessarily the same order as present in the source. + Further, it should also be noted that the parallel transforms use a static + counter for loop ID indexing. As a consequence it is possible for the loop + ID index to not start at 0 due to use of the same counter for internal + optimizations/transforms taking place that are invisible to the user. + +#. Fusing loops + This section describes the attempts made at fusing discovered + loops noting which succeeded and which failed. In the case of failure to + fuse a reason is given (e.g. dependency on other data). From the example: + + .. code-block:: text + + --------------------------------- Fusing loops --------------------------------- + Attempting fusion of parallel loops (combines loops with similar properties)... + Trying to fuse loops #0 and #1: + - fusion succeeded: parallel for-loop #1 is fused into for-loop #0. + Trying to fuse loops #0 and #3: + - fusion failed: loop dimension mismatched in axis 0. 
slice(0, x_size0.1, 1) + != slice(0, $40.4, 1) + + It can be seen that fusion of loops ``#0`` and ``#1`` was attempted and this + succeeded (both are based on the same dimensions of ``x``). Following the + successful fusion of ``#0`` and ``#1``, fusion was attempted between ``#0`` + (now including the fused ``#1`` loop) and ``#3``. This fusion failed because + there is a loop dimension mismatch, ``#0`` is size ``x.shape`` whereas + ``#3`` is size ``x.shape[0] - 2``. + +#. Before Optimization + This section shows the structure of the parallel regions in the code before + any optimization has taken place, but with loops associated with their final + parallel region (this is to make before/after optimization output directly + comparable). Multiple parallel regions may exist if there are loops which + cannot be fused, in this case code within each region will execute in + parallel, but each parallel region will run sequentially. From the example: + + .. code-block:: text + + Parallel region 0: + +--0 (parallel) + +--1 (parallel) + + + Parallel region 1: + +--3 (parallel) + +--2 (parallel) + + As alluded to by the `Fusing loops` section, there are necessarily two + parallel regions in the code. The first contains loops ``#0`` and ``#1``, + the second contains ``#3`` and ``#2``, all loops are marked ``parallel`` as + no optimization has taken place yet. + +#. After Optimization + This section shows the structure of the parallel regions in the code after + optimization has taken place. Again, parallel regions are enumerated with + their corresponding loops but this time loops which are fused or serialized + are noted and a summary is presented. From the example: + + .. code-block:: text + + Parallel region 0: + +--0 (parallel, fused with loop(s): 1) + + + Parallel region 1: + +--3 (parallel) + +--2 (serial) + + Parallel region 0 (loop #0) had 1 loop(s) fused. 
+ + Parallel region 1 (loop #3) had 0 loop(s) fused and 1 loop(s) serialized as part + of the larger parallel loop (#3). + + + It can be noted that parallel region 0 contains loop ``#0`` and, as seen in + the `fusing loops` section, loop ``#1`` is fused into loop ``#0``. It can + also be noted that parallel region 1 contains loop ``#3`` and that loop + ``#2`` (the inner ``prange()``) has been serialized for execution in the + body of loop ``#3``. + +#. Loop invariant code motion + This section shows for each loop, after optimization has occurred: + + * the instructions that failed to be hoisted and the reason for failure + (dependency/impure). + * the instructions that were hoisted. + * any allocation hoisting that may have occurred. + + From the example: + + .. code-block:: text + + Instruction hoisting: + loop #0: + Failed to hoist the following: + dependency: $arg_out_var.10 = getitem(value=x, index=$parfor__index_5.99) + dependency: $0.6.11 = getattr(value=$0.5, attr=sin) + dependency: $expr_out_var.9 = call $0.6.11($arg_out_var.10, func=$0.6.11, args=[Var($arg_out_var.10, example.py (7))], kws=(), vararg=None) + dependency: $arg_out_var.17 = $expr_out_var.9 * $expr_out_var.9 + dependency: $0.10.20 = getattr(value=$0.9, attr=cos) + dependency: $expr_out_var.16 = call $0.10.20($arg_out_var.17, func=$0.10.20, args=[Var($arg_out_var.17, example.py (8))], kws=(), vararg=None) + loop #3: + Has the following hoisted: + $const58.3 = const(int, 1) + $58.4 = _n_23 - $const58.3 + + The first thing to note is that this information is for advanced users as it + refers to the :term:`Numba IR` of the function being transformed. As an + example, the expression ``a * a`` in the example source partly translates to + the expression ``$arg_out_var.17 = $expr_out_var.9 * $expr_out_var.9`` in + the IR, this clearly cannot be hoisted out of ``loop #0`` because it is not + loop invariant! 
Whereas in ``loop #3``, the expression + ``$const58.3 = const(int, 1)`` comes from the source ``b[j + 1]``, the + number ``1`` is clearly a constant and so can be hoisted out of the loop. + +.. _numba-parallel-scheduling: + +Scheduling +========== + +By default, Numba divides the iterations of a parallel region into approximately equal +sized chunks and gives one such chunk to each configured thread. +(See :ref:`setting_the_number_of_threads`). +This scheduling approach is equivalent to OpenMP's static schedule with no specified +chunk size and is appropriate when the work required for each iteration is nearly constant. +Conversely, if the work required per iteration, as shown in the ``prange`` loop below, +varies significantly then this static +scheduling approach can lead to load imbalances and longer execution times. + +.. literalinclude:: ../../../numba/tests/doc_examples/test_parallel_chunksize.py + :language: python + :caption: from ``test_unbalanced_example`` of ``numba/tests/doc_examples/test_parallel_chunksize.py`` + :start-after: magictoken.ex_unbalanced.begin + :end-before: magictoken.ex_unbalanced.end + :dedent: 12 + :linenos: + +In such cases, +Numba provides a mechanism to control how many iterations of a parallel region +(i.e., the chunk size) go into each chunk. +Numba then computes the number of required chunks which is +equal to the number of iterations divided by the chunk size, truncated to the nearest +integer. All of these chunks are then approximately equally sized. +Numba then gives one such chunk to each configured +thread as above and when a thread finishes a chunk, Numba gives that thread the next +available chunk. +This scheduling approach is similar to OpenMP's dynamic scheduling +option with the specified chunk size. +(Note that Numba is only capable of supporting this dynamic scheduling +of parallel regions if the underlying Numba threading backend, +:ref:`numba-threading-layer`, is also capable of dynamic scheduling. 
+At the moment, only the ``tbb`` backend is capable of dynamic +scheduling and so is required if any performance +benefit is to be achieved from this chunk size selection mechanism.) +To minimize execution time, the programmer must +pick a chunk size that strikes a balance between greater load balancing with smaller +chunk sizes and less scheduling overhead with larger chunk sizes. +See :ref:`chunk-details-label` for additional details on the internal implementation +of chunk sizes. + +The number of iterations of a parallel region in a chunk is stored as a thread-local +variable and can be set using +:func:`numba.set_parallel_chunksize`. This function takes one integer parameter +whose value must be greater than +or equal to 0. A value of 0 is the default value and instructs Numba to use the +static scheduling approach above. Values greater than 0 instruct Numba to use that value +as the chunk size in the dynamic scheduling approach described above. +:func:`numba.set_parallel_chunksize` returns the previous value of the chunk size. +The current value of this thread local variable is used as the chunk size for all +subsequent parallel regions invoked by this thread. +However, upon entering a parallel region, Numba sets the chunk size thread local variable +for each of the threads executing that parallel region back to the default of 0, +since it is unlikely +that any nested parallel regions would require the same chunk size. If the same thread is +used to execute a sequential and parallel region then that thread's chunk size +variable is set to 0 at the beginning of the parallel region and restored to +its original value upon exiting the parallel region. +This behavior is demonstrated in ``func1`` in the example below in that the +reported chunk size inside the ``prange`` parallel region is 0 but is 4 outside +the parallel region. 
Note that if the ``prange`` is not executed in parallel for +any reason (e.g., setting ``parallel=False``) then the chunk size reported inside +the non-parallel prange would be reported as 4. +This behavior may initially be counterintuitive to programmers as it differs from +how thread local variables typically behave in other languages. +A programmer may use +the chunk size API described in this section within the threads executing a parallel +region if the programmer wishes to specify a chunk size for any nested parallel regions +that may be launched. +The current value of the parallel chunk size can be obtained by calling +:func:`numba.get_parallel_chunksize`. +Both of these functions can be used from standard Python and from within Numba JIT compiled functions +as shown below. Both invocations of ``func1`` would be executed with a chunk size of 4 whereas +``func2`` would use a chunk size of 8. + +.. literalinclude:: ../../../numba/tests/doc_examples/test_parallel_chunksize.py + :language: python + :caption: from ``test_chunksize_manual`` of ``numba/tests/doc_examples/test_parallel_chunksize.py`` + :start-after: magictoken.ex_chunksize_manual.begin + :end-before: magictoken.ex_chunksize_manual.end + :dedent: 12 + :linenos: + +Since this idiom of saving and restoring is so common, Numba provides the +:func:`parallel_chunksize` with clause context-manager to simplify the idiom. +As shown below, this with clause can be invoked from both standard Python and +within Numba JIT compiled functions. As with other Numba context-managers, be +aware that the raising of exceptions is not supported from within a context managed +block that is part of a Numba JIT compiled function. + +.. 
literalinclude:: ../../../numba/tests/doc_examples/test_parallel_chunksize.py + :language: python + :caption: from ``test_chunksize_with`` of ``numba/tests/doc_examples/test_parallel_chunksize.py`` + :start-after: magictoken.ex_chunksize_with.begin + :end-before: magictoken.ex_chunksize_with.end + :dedent: 12 + :linenos: + +Note that these functions to set the chunk size only have an effect on +Numba automatic parallelization with the :ref:`parallel_jit_option` option. +Chunk size specification has no effect on the :func:`~numba.vectorize` decorator +or the :func:`~numba.guvectorize` decorator. + +.. seealso:: :ref:`parallel_jit_option`, :ref:`Parallel FAQs ` diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/performance-tips.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/performance-tips.rst new file mode 100644 index 0000000000000000000000000000000000000000..688071dec00580012430a3dc8a53292170e0fb20 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/performance-tips.rst @@ -0,0 +1,248 @@ +.. _performance-tips: + +Performance Tips +================ + +This is a short guide to features present in Numba that can help with obtaining +the best performance from code. Two examples are used, both are entirely +contrived and exist purely for pedagogical reasons to motivate discussion. +The first is the computation of the trigonometric identity +``cos(x)^2 + sin(x)^2``, the second is a simple element wise square root of a +vector with reduction over summation. All performance numbers are indicative +only and unless otherwise stated were taken from running on an Intel ``i7-4790`` +CPU (4 hardware threads) with an input of ``np.arange(1.e7)``. + +.. note:: + A reasonably effective approach to achieving high performance code is to + profile the code running with real data and use that to guide performance + tuning. The information presented here is to demonstrate features, not to act + as canonical guidance! 
+ +No Python mode vs Object mode +----------------------------- + +A common pattern is to decorate functions with ``@jit`` as this is the most +flexible decorator offered by Numba. ``@jit`` essentially encompasses two modes +of compilation, first it will try and compile the decorated function in no +Python mode, if this fails it will try again to compile the function using +object mode. Whilst the use of looplifting in object mode can enable some +performance increase, getting functions to compile under no python mode is +really the key to good performance. To make it such that only no python mode is +used and if compilation fails an exception is raised the decorators ``@njit`` +and ``@jit(nopython=True)`` can be used (the first is an alias of the +second for convenience). + +Loops +----- +Whilst NumPy has developed a strong idiom around the use of vector operations, +Numba is perfectly happy with loops too. For users familiar with C or Fortran, +writing Python in this style will work fine in Numba (after all, LLVM gets a +lot of use in compiling C lineage languages). For example:: + + @njit + def ident_np(x): + return np.cos(x) ** 2 + np.sin(x) ** 2 + + @njit + def ident_loops(x): + r = np.empty_like(x) + n = len(x) + for i in range(n): + r[i] = np.cos(x[i]) ** 2 + np.sin(x[i]) ** 2 + return r + +The above run at almost identical speeds when decorated with ``@njit``, without +the decorator the vectorized function is a couple of orders of magnitude faster. + ++-----------------+-------+----------------+ +| Function Name | @njit | Execution time | ++=================+=======+================+ +| ``ident_np`` | No | 0.581s | ++-----------------+-------+----------------+ +| ``ident_np`` | Yes | 0.659s | ++-----------------+-------+----------------+ +| ``ident_loops`` | No | 25.2s | ++-----------------+-------+----------------+ +| ``ident_loops`` | Yes | 0.670s | ++-----------------+-------+----------------+ + +.. 
_fast-math: + +Fastmath +-------- +In certain classes of applications strict IEEE 754 compliance is less +important. As a result it is possible to relax some numerical rigour with +a view to gaining additional performance. The way to achieve this behaviour in +Numba is through the use of the ``fastmath`` keyword argument:: + + @njit(fastmath=False) + def do_sum(A): + acc = 0. + # without fastmath, this loop must accumulate in strict order + for x in A: + acc += np.sqrt(x) + return acc + + @njit(fastmath=True) + def do_sum_fast(A): + acc = 0. + # with fastmath, the reduction can be vectorized as floating point + # reassociation is permitted. + for x in A: + acc += np.sqrt(x) + return acc + + ++-----------------+-----------------+ +| Function Name | Execution time | ++=================+=================+ +| ``do_sum`` | 35.2 ms | ++-----------------+-----------------+ +| ``do_sum_fast`` | 17.8 ms | ++-----------------+-----------------+ + +In some cases you may wish to opt-in to only a subset of possible fast-math +optimizations. This can be done by supplying a set of `LLVM fast-math flags +<https://llvm.org/docs/LangRef.html#fast-math-flags>`_ to ``fastmath``:: + + def add_assoc(x, y): + return (x - y) + y + + print(njit(fastmath=False)(add_assoc)(0, np.inf)) # nan + print(njit(fastmath=True) (add_assoc)(0, np.inf)) # 0.0 + print(njit(fastmath={'reassoc', 'nsz'})(add_assoc)(0, np.inf)) # 0.0 + print(njit(fastmath={'reassoc'}) (add_assoc)(0, np.inf)) # nan + print(njit(fastmath={'nsz'}) (add_assoc)(0, np.inf)) # nan + + +Parallel=True +------------- +If code contains operations that are parallelisable (:ref:`and supported +<numba-parallel-supported>`) Numba can compile a version that will run in +parallel on multiple native threads (no GIL!). 
This parallelisation is performed +automatically and is enabled by simply adding the ``parallel`` keyword +argument:: + + @njit(parallel=True) + def ident_parallel(x): + return np.cos(x) ** 2 + np.sin(x) ** 2 + + +Execution times are as follows: + ++--------------------+-----------------+ +| Function Name | Execution time | ++====================+=================+ +| ``ident_parallel`` | 112 ms | ++--------------------+-----------------+ + + +The execution speed of this function with ``parallel=True`` present is +approximately 5x that of the NumPy equivalent and 6x that of standard +``@njit``. + + +Numba parallel execution also has support for explicit parallel loop +declaration similar to that in OpenMP. To indicate that a loop should be +executed in parallel the ``numba.prange`` function should be used, this function +behaves like Python ``range`` and if ``parallel=True`` is not set it acts +simply as an alias of ``range``. Loops induced with ``prange`` can be used for +embarrassingly parallel computation and also reductions. + +Revisiting the reduce over sum example, assuming it is safe for the sum to be +accumulated out of order, the loop in ``n`` can be parallelised through the use +of ``prange``. Further, the ``fastmath=True`` keyword argument can be added +without concern in this case as the assumption that out of order execution is +valid has already been made through the use of ``parallel=True`` (as each thread +computes a partial sum). +:: + + @njit(parallel=True) + def do_sum_parallel(A): + # each thread can accumulate its own partial sum, and then a cross + # thread reduction is performed to obtain the result to return + n = len(A) + acc = 0. + for i in prange(n): + acc += np.sqrt(A[i]) + return acc + + @njit(parallel=True, fastmath=True) + def do_sum_parallel_fast(A): + n = len(A) + acc = 0. + for i in prange(n): + acc += np.sqrt(A[i]) + return acc + + +Execution times are as follows, ``fastmath`` again improves performance. 
+ ++-------------------------+-----------------+ +| Function Name | Execution time | ++=========================+=================+ +| ``do_sum_parallel`` | 9.81 ms | ++-------------------------+-----------------+ +| ``do_sum_parallel_fast``| 5.37 ms | ++-------------------------+-----------------+ + +.. _intel-svml: + +Intel SVML +---------- + +Intel provides a short vector math library (SVML) that contains a large number +of optimised transcendental functions available for use as compiler +intrinsics. If the ``icc_rt`` package is present in the environment (or the SVML +libraries are simply locatable!) then Numba automatically configures the LLVM +back end to use the SVML intrinsic functions wherever possible. SVML provides +both high and low accuracy versions of each intrinsic and the version that is +used is determined through the use of the ``fastmath`` keyword. The default is +to use high accuracy which is accurate to within ``1 ULP``, however if +``fastmath`` is set to ``True`` then the lower accuracy versions of the +intrinsics are used (answers to within ``4 ULP``). + + +First obtain SVML, using conda for example:: + + conda install -c numba icc_rt + +Rerunning the identity function example ``ident_np`` from above with various +combinations of options to ``@njit`` and with/without SVML yields the following +performance results (input size ``np.arange(1.e8)``). 
For reference, with just +NumPy the function executed in ``5.84s``: + ++-----------------------------------+--------+-------------------+ +| ``@njit`` kwargs | SVML | Execution time | ++===================================+========+===================+ +| ``None`` | No | 5.95s | ++-----------------------------------+--------+-------------------+ +| ``None`` | Yes | 2.26s | ++-----------------------------------+--------+-------------------+ +| ``fastmath=True`` | No | 5.97s | ++-----------------------------------+--------+-------------------+ +| ``fastmath=True`` | Yes | 1.8s | ++-----------------------------------+--------+-------------------+ +| ``parallel=True`` | No | 1.36s | ++-----------------------------------+--------+-------------------+ +| ``parallel=True`` | Yes | 0.624s | ++-----------------------------------+--------+-------------------+ +| ``parallel=True, fastmath=True`` | No | 1.32s | ++-----------------------------------+--------+-------------------+ +| ``parallel=True, fastmath=True`` | Yes | 0.576s | ++-----------------------------------+--------+-------------------+ + +It is evident that SVML significantly increases the performance of this +function. The impact of ``fastmath`` in the case of SVML not being present is +zero, this is expected as there is nothing in the original function that would +benefit from relaxing numerical strictness. + +Linear algebra +-------------- +Numba supports most of ``numpy.linalg`` in no Python mode. The internal +implementation relies on a LAPACK and BLAS library to do the numerical work +and it obtains the bindings for the necessary functions from SciPy. Therefore, +to achieve good performance in ``numpy.linalg`` functions with Numba it is +necessary to use a SciPy built against a well optimised LAPACK/BLAS library. +In the case of the Anaconda distribution SciPy is built against Intel's MKL +which is highly optimised and as a result Numba makes use of this performance. 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/pycc.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/pycc.rst new file mode 100644 index 0000000000000000000000000000000000000000..b0f1275a540e24529bc71bf645a2a873c0dbfafa --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/pycc.rst @@ -0,0 +1,140 @@ + +============================ +Compiling code ahead of time +============================ + +.. _pycc: + +While Numba's main use case is :term:`Just-in-Time compilation`, it also +provides a facility for :term:`Ahead-of-Time compilation` (AOT). + + +Overview +======== + +Benefits +-------- + +#. AOT compilation produces a compiled extension module which does not depend + on Numba: you can distribute the module on machines which do not have + Numba installed (but Numpy is required). + +#. There is no compilation overhead at runtime (but see the + ``@jit`` :ref:`cache ` option), nor any overhead of importing + Numba. + +.. seealso:: + Compiled extension modules are discussed in the + `Python packaging user guide `_. + + +Limitations +----------- + +#. AOT compilation only allows for regular functions, not :term:`ufuncs `. + +#. You have to specify function signatures explicitly. + +#. Each exported function can have only one signature (but you can export + several different signatures under different names). + +#. Exported functions do not check the types of the arguments that are passed + to them; the caller is expected to provide arguments of the correct type. + +#. AOT compilation produces generic code for your CPU's architectural family + (for example "x86-64"), while JIT compilation produces code optimized + for your particular CPU model. 
+ + +Usage +===== + +Standalone example +------------------ + +:: + + from numba.pycc import CC + + cc = CC('my_module') + # Uncomment the following line to print out the compilation steps + #cc.verbose = True + + @cc.export('multf', 'f8(f8, f8)') + @cc.export('multi', 'i4(i4, i4)') + def mult(a, b): + return a * b + + @cc.export('square', 'f8(f8)') + def square(a): + return a ** 2 + + if __name__ == "__main__": + cc.compile() + + +If you run this Python script, it will generate an extension module named +``my_module``. Depending on your platform, the actual filename may be +``my_module.so``, ``my_module.pyd``, ``my_module.cpython-34m.so``, etc. + +The generated module has three functions: ``multf``, ``multi`` and ``square``. +``multi`` operates on 32-bit integers (``i4``), while ``multf`` and ``square`` +operate on double-precision floats (``f8``):: + + >>> import my_module + >>> my_module.multi(3, 4) + 12 + >>> my_module.square(1.414) + 1.9993959999999997 + + +Distutils integration +--------------------- + +You can also integrate the compilation step for your extension modules +in your ``setup.py`` script, using distutils or setuptools:: + + from distutils.core import setup + + from source_module import cc + + setup(..., + ext_modules=[cc.distutils_extension()]) + + +The ``source_module`` above is the module defining the ``cc`` object. +Extensions compiled like this will be automatically included in the +build files for your Python project, so you can distribute them inside +binary packages such as wheels or Conda packages. Note that in the case of +using conda, the compilers used for AOT need to be those that are available +in the Anaconda distribution. + + +Signature syntax +---------------- + +The syntax for exported signatures is the same as in the ``@jit`` +decorator. You can read more about it in the :ref:`types ` +reference. 
+ +Here is an example of exporting an implementation of the second-order +centered difference on a 1d array:: + + @cc.export('centdiff_1d', 'f8[:](f8[:], f8)') + def centdiff_1d(u, dx): + D = np.empty_like(u) + D[0] = 0 + D[-1] = 0 + for i in range(1, len(D) - 1): + D[i] = (u[i+1] - 2 * u[i] + u[i-1]) / dx**2 + return D + +.. (example from http://nbviewer.ipython.org/gist/ketch/ae87a94f4ef0793d5d52) + +You can also omit the return type, which will then be inferred by Numba:: + + @cc.export('centdiff_1d', '(f8[:], f8)') + def centdiff_1d(u, dx): + # Same code as above + ... + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/stencil.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/stencil.rst new file mode 100644 index 0000000000000000000000000000000000000000..6888a556f57d61b60af28d67c98f32f3d8620224 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/stencil.rst @@ -0,0 +1,255 @@ +.. Copyright (c) 2017 Intel Corporation + SPDX-License-Identifier: BSD-2-Clause + +.. _numba-stencil: + +================================ +Using the ``@stencil`` decorator +================================ + +Stencils are a common computational pattern in which array elements +are updated according to some fixed pattern called the stencil kernel. +Numba provides the ``@stencil`` decorator so that users may +easily specify a stencil kernel and Numba then generates the looping +code necessary to apply that kernel to some input array. Thus, the +stencil decorator allows clearer, more concise code and in conjunction +with :ref:`the parallel jit option ` enables higher +performance through parallelization of the stencil execution. 
+ + +Basic usage +=========== + +An example use of the ``@stencil`` decorator:: + + from numba import stencil + + @stencil + def kernel1(a): + return 0.25 * (a[0, 1] + a[1, 0] + a[0, -1] + a[-1, 0]) + +The stencil kernel is specified by what looks like a standard Python +function definition but there are different semantics with +respect to array indexing. +Stencils produce an output array of the same size and shape as the +input array although depending on the kernel definition may have a +different type. +Conceptually, the stencil kernel is run once for each element in the +output array. The return value from the stencil kernel is the value +written into the output array for that particular element. + +The parameter ``a`` represents the input array over which the +kernel is applied. +Indexing into this array takes place with respect to the current element +of the output array being processed. For example, if element ``(x, y)`` +is being processed then ``a[0, 0]`` in the stencil kernel corresponds to +``a[x + 0, y + 0]`` in the input array. Similarly, ``a[-1, 1]`` in the stencil +kernel corresponds to ``a[x - 1, y + 1]`` in the input array. + +Depending on the specified kernel, the kernel may not be applicable to the +borders of the output array as this may cause the input array to be +accessed out-of-bounds. The way in which the stencil decorator handles +this situation is dependent upon which :ref:`stencil-mode` is selected. +The default mode is for the stencil decorator to set the border elements +of the output array to zero. + +To invoke a stencil on an input array, call the stencil as if it were +a regular function and pass the input array as the argument. 
For example, using +the kernel defined above:: + + >>> import numpy as np + >>> input_arr = np.arange(100).reshape((10, 10)) + array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + [30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59], + [60, 61, 62, 63, 64, 65, 66, 67, 68, 69], + [70, 71, 72, 73, 74, 75, 76, 77, 78, 79], + [80, 81, 82, 83, 84, 85, 86, 87, 88, 89], + [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]]) + >>> output_arr = kernel1(input_arr) + array([[ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + [ 0., 11., 12., 13., 14., 15., 16., 17., 18., 0.], + [ 0., 21., 22., 23., 24., 25., 26., 27., 28., 0.], + [ 0., 31., 32., 33., 34., 35., 36., 37., 38., 0.], + [ 0., 41., 42., 43., 44., 45., 46., 47., 48., 0.], + [ 0., 51., 52., 53., 54., 55., 56., 57., 58., 0.], + [ 0., 61., 62., 63., 64., 65., 66., 67., 68., 0.], + [ 0., 71., 72., 73., 74., 75., 76., 77., 78., 0.], + [ 0., 81., 82., 83., 84., 85., 86., 87., 88., 0.], + [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]) + >>> input_arr.dtype + dtype('int64') + >>> output_arr.dtype + dtype('float64') + +Note that the stencil decorator has determined that the output type +of the specified stencil kernel is ``float64`` and has thus created the +output array as ``float64`` while the input array is of type ``int64``. + +Stencil Parameters +================== + +Stencil kernel definitions may take any number of arguments with +the following provisions. The first argument must be an array. +The size and shape of the output array will be the same as that of the +first argument. Additional arguments may either be scalars or +arrays. For array arguments, those arrays must be at least as large +as the first argument (array) in each dimension. Array indexing is relative for +all such input array arguments. + +.. 
_stencil-kernel-shape-inference: + +Kernel shape inference and border handling +========================================== + +In the above example and in most cases, the array indexing in the +stencil kernel will exclusively use ``Integer`` literals. +In such cases, the stencil decorator is able to analyze the stencil +kernel to determine its size. In the above example, the stencil +decorator determines that the kernel is ``3 x 3`` in shape since indices +``-1`` to ``1`` are used for both the first and second dimensions. Note that +the stencil decorator also correctly handles non-symmetric and +non-square stencil kernels. + +Based on the size of the stencil kernel, the stencil decorator is +able to compute the size of the border in the output array. If +applying the kernel to some element of input array would cause +an index to be out-of-bounds then that element belongs to the border +of the output array. In the above example, points ``-1`` and ``+1`` are +accessed in each dimension and thus the output array has a border +of size one in all dimensions. + +The parallel mode is able to infer kernel indices as constants from +simple expressions if possible. For example:: + + @njit(parallel=True) + def stencil_test(A): + c = 2 + B = stencil( + lambda a, c: 0.3 * (a[-c+1] + a[0] + a[c-1]))(A, c) + return B + + +Stencil decorator options +========================= + +.. note:: + The stencil decorator may be augmented in the future to provide additional + mechanisms for border handling. At present, only one behaviour is + implemented, ``"constant"`` (see ``func_or_mode`` below for details). + +.. _stencil-neighborhood: + +``neighborhood`` +---------------- + +Sometimes it may be inconvenient to write the stencil kernel +exclusively with ``Integer`` literals. For example, let us say we +would like to compute the trailing 30-day moving average of a +time series of data. One could write +``(a[-29] + a[-28] + ... 
+ a[-1] + a[0]) / 30`` but the stencil +decorator offers a more concise form using the ``neighborhood`` +option:: + + @stencil(neighborhood = ((-29, 0),)) + def kernel2(a): + cumul = 0 + for i in range(-29, 1): + cumul += a[i] + return cumul / 30 + +The neighborhood option is a tuple of tuples. The outer tuple's +length is equal to the number of dimensions of the input array. +The inner tuple's lengths are always two because +each element of the inner tuple corresponds to minimum and +maximum index offsets used in the corresponding dimension. + +If a user specifies a neighborhood but the kernel accesses elements outside the +specified neighborhood, **the behavior is undefined.** + +.. _stencil-mode: + +``func_or_mode`` +---------------- + +The optional ``func_or_mode`` parameter controls how the border of the output array +is handled. Currently, there is only one supported value, ``"constant"``. +In ``constant`` mode, the stencil kernel is not applied in cases where +the kernel would access elements outside the valid range of the input +array. In such cases, those elements in the output array are assigned +to a constant value, as specified by the ``cval`` parameter. + +``cval`` +-------- + +The optional cval parameter defaults to zero but can be set to any +desired value, which is then used for the border of the output array +if the ``func_or_mode`` parameter is set to ``constant``. The cval parameter is +ignored in all other modes. The type of the cval parameter must match +the return type of the stencil kernel. If the user wishes the output +array to be constructed from a particular type then they should ensure +that the stencil kernel returns that type. + +``standard_indexing`` +--------------------- + +By default, all array accesses in a stencil kernel are processed as +relative indices as described above. However, sometimes it may be +advantageous to pass an auxiliary array (e.g. 
an array of weights) +to a stencil kernel and have that array use standard Python indexing +rather than relative indexing. For this purpose, there is the +stencil decorator option ``standard_indexing`` whose value is a +collection of strings whose names match those parameters to the +stencil function that are to be accessed with standard Python indexing +rather than relative indexing:: + + @stencil(standard_indexing=("b",)) + def kernel3(a, b): + return a[-1] * b[0] + a[0] + b[1] + +``StencilFunc`` +=============== + +The stencil decorator returns a callable object of type ``StencilFunc``. +``StencilFunc`` objects contain a number of attributes but the only one of +potential interest to users is the ``neighborhood`` attribute. +If the ``neighborhood`` option was passed to the stencil decorator then +the provided neighborhood is stored in this attribute. Else, upon +first execution or compilation, the system calculates the neighborhood +as described above and then stores the computed neighborhood into this +attribute. A user may then inspect the attribute if they wish to verify +that the calculated neighborhood is correct. + +Stencil invocation options +========================== + +Internally, the stencil decorator transforms the specified stencil +kernel into a regular Python function. This function will have the +same parameters as specified in the stencil kernel definition but will +also include the following optional parameter. + +.. _stencil-function-out: + +``out`` +------- + +The optional ``out`` parameter is added to every stencil function +generated by Numba. If specified, the ``out`` parameter tells +Numba that the user is providing their own pre-allocated array +to be used for the output of the stencil. In this case, the +stencil function will not allocate its own output array. 
+Users should ensure that the return type of the stencil kernel can +be safely cast to the element-type of the user-specified output array +following the `NumPy ufunc casting rules`_. + +.. _`NumPy ufunc casting rules`: http://docs.scipy.org/doc/numpy/reference/ufuncs.html#casting-rules + +An example usage is shown below:: + + >>> import numpy as np + >>> input_arr = np.arange(100).reshape((10, 10)) + >>> output_arr = np.full(input_arr.shape, 0.0) + >>> kernel1(input_arr, out=output_arr) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/talks.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/talks.rst new file mode 100644 index 0000000000000000000000000000000000000000..ef35ae9444ce17c4bcaae31bbf1b3efd0299c53c --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/talks.rst @@ -0,0 +1,36 @@ + +Talks and Tutorials +=================== + +.. note:: This is a selection of talks and tutorials that have been given by members of + the Numba team as well as Numba users. If you know of a Numba-related talk + that should be included on this list, please `open an issue `_. 
+ +Talks on Numba +-------------- + +* AnacondaCON 2018 - Accelerating Scientific Workloads with Numba - Siu Kwan Lam (`Video `__) +* `DIANA-HEP Meeting, 23 April 2018 `__ - Overview of Numba - Stan Seibert + +Talks on Applications of Numba +------------------------------ + +* GPU Technology Conference 2016 - Accelerating a Spectral Algorithm for Plasma Physics with Python/Numba on GPU - Manuel Kirchen & Rémi Lehe (`Slides `__) +* `DIANA-HEP Meeting, 23 April 2018 `_ - Use of Numba in XENONnT - Chris Tunnell +* `DIANA-HEP Meeting, 23 April 2018 `_ - Extending Numba for HEP data types - Jim Pivarski +* STAC Summit, Nov 1 2017 - Scaling High-Performance Python with Minimal Effort - Ehsan Totoni (`Video `__, `Slides `__) +* SciPy 2018 - UMAP: Uniform Manifold Approximation and Projection for Dimensional Reduction - Leland McInnes (`Video `__, `Github `__) +* PyData Berlin 2018 - Extending Pandas using Apache Arrow and Numba - Uwe L. Korn (`Video `__, `Blog `__) +* FOSDEM 2019 - Extending Numba - Joris Geessels (`Video, Slides & Examples `__) +* PyCon India 2019 - Real World Numba: Taking the Path of Least Resistance - Ankit Mahato (`Video `__) +* SciPy 2019 - How to Accelerate an Existing Codebase with Numba - Siu Kwan Lam & Stanley Seibert (`Video `__) +* SciPy 2019 - Real World Numba: Creating a Skeleton Analysis Library - Juan Nunez-Iglesias (`Video `__) +* SciPy 2019 - Fast Gradient Boosting Decision Trees with PyGBM and Numba - Nicholas Hug (`Video `__) +* PyCon Sweden 2020 - Accelerating Scientific Computing using Numba - Ankit Mahato (`Video `__) + +Tutorials +--------- + +* SciPy 2017 - Numba: Tell those C++ Bullies to Get Lost - Gil Forsyth & Lorena Barba (`Video `__, `Notebooks `__) +* GPU Technology Conference 2018 - GPU Computing in Python with Numba - Stan Seibert (`Notebooks `__) +* PyData Amsterdam 2019 - Create CUDA kernels from Python using Numba and CuPy - Valentin Haenel (`Video `__) diff --git 
a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/threading-layer.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/threading-layer.rst new file mode 100644 index 0000000000000000000000000000000000000000..6aa5fef435ebc7ddb10d5a1f983f3a51ac16d2e1 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/threading-layer.rst @@ -0,0 +1,313 @@ +.. _numba-threading-layer: + +The Threading Layers +==================== + +This section is about the Numba threading layer, this is the library that is +used internally to perform the parallel execution that occurs through the use of +the ``parallel`` targets for CPUs, namely: + +* The use of the ``parallel=True`` kwarg in ``@jit`` and ``@njit``. +* The use of the ``target='parallel'`` kwarg in ``@vectorize`` and + ``@guvectorize``. + +.. note:: + If a code base does not use the ``threading`` or ``multiprocessing`` + modules (or any other sort of parallelism) the defaults for the threading + layer that ship with Numba will work well, no further action is required! + + +Which threading layers are available? +------------------------------------- +There are three threading layers available and they are named as follows: + +* ``tbb`` - A threading layer backed by Intel TBB. +* ``omp`` - A threading layer backed by OpenMP. +* ``workqueue`` - A simple built-in work-sharing task scheduler. + +In practice, the only threading layer guaranteed to be present is ``workqueue``. +The ``omp`` layer requires the presence of a suitable OpenMP runtime library. +The ``tbb`` layer requires the presence of Intel's TBB libraries, these can be +obtained via the conda command:: + + $ conda install tbb + +If you installed Numba with ``pip``, TBB can be enabled by running:: + + $ pip install tbb + +Due to compatibility issues with manylinux1 and other portability concerns, +the OpenMP threading layer is disabled in the Numba binary wheels on PyPI. + +.. 
note:: + The default manner in which Numba searches for and loads a threading layer + is tolerant of missing libraries, incompatible runtimes etc. + + +.. _numba-threading-layer-setting-mech: + +Setting the threading layer +--------------------------- + + +The threading layer is set via the environment variable +``NUMBA_THREADING_LAYER`` or through assignment to +``numba.config.THREADING_LAYER``. If the programmatic approach to setting the +threading layer is used it must occur logically before any Numba based +compilation for a parallel target has occurred. There are two approaches to +choosing a threading layer, the first is by selecting a threading layer that is +safe under various forms of parallel execution, the second is through explicit +selection via the threading layer name (e.g. ``tbb``). + +Setting the threading layer selection priority +---------------------------------------------- + +By default the threading layers are searched in the order of ``'tbb'``, +``'omp'``, then ``'workqueue'``. To change this search order whilst +maintaining the selection of a threading layer based on availability, the +environment variable :envvar:`NUMBA_THREADING_LAYER_PRIORITY` can be used. + +Note that it can also be set via +:py:data:`numba.config.THREADING_LAYER_PRIORITY`. +Similar to :py:data:`numba.config.THREADING_LAYER`, +it must occur logically before any Numba based +compilation for a parallel target has occurred. + +For example, to instruct Numba to choose ``omp`` first if available, +then ``tbb`` and so on, set the environment variable as +``NUMBA_THREADING_LAYER_PRIORITY="omp tbb workqueue"``. +Or programmatically, +``numba.config.THREADING_LAYER_PRIORITY = ["omp", "tbb", "workqueue"]``. 
+ +Selecting a threading layer for safe parallel execution +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Parallel execution is fundamentally derived from core Python libraries in four +forms (the first three also apply to code using parallel execution via other +means!): + +* ``threads`` from the ``threading`` module. +* ``spawn`` ing processes from the ``multiprocessing`` module via ``spawn`` + (default on Windows, only available in Python 3.4+ on Unix) +* ``fork`` ing processes from the ``multiprocessing`` module via ``fork`` + (default on Unix). +* ``fork`` ing processes from the ``multiprocessing`` module through the use of + a ``forkserver`` (only available in Python 3 on Unix). Essentially a new + process is spawned and then forks are made from this new process on request. + +Any library in use with these forms of parallelism must exhibit safe behaviour +under the given paradigm. As a result, the threading layer selection methods +are designed to provide a way to choose a threading layer library that is safe +for a given paradigm in an easy, cross platform and environment tolerant manner. +The options that can be supplied to the +:ref:`setting mechanisms ` are as +follows: + +* ``default`` provides no specific safety guarantee and is the default. +* ``safe`` is both fork and thread safe, this requires the ``tbb`` package + (Intel TBB libraries) to be installed. +* ``forksafe`` provides a fork safe library. +* ``threadsafe`` provides a thread safe library. + +To discover the threading layer that was selected, the function +``numba.threading_layer()`` may be called after parallel execution. For example, +on a Linux machine with no TBB installed:: + + from numba import config, njit, threading_layer + import numpy as np + + # set the threading layer before any parallel target compilation + config.THREADING_LAYER = 'threadsafe' + + @njit(parallel=True) + def foo(a, b): + return a + b + + x = np.arange(10.) 
+ y = x.copy() + + # this will force the compilation of the function, select a threading layer + # and then execute in parallel + foo(x, y) + + # demonstrate the threading layer chosen + print("Threading layer chosen: %s" % threading_layer()) + +which produces:: + + Threading layer chosen: omp + +and this makes sense as GNU OpenMP, as present on Linux, is thread safe. + +Selecting a named threading layer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Advanced users may wish to select a specific threading layer for their use case, +this is done by directly supplying the threading layer name to the +:ref:`setting mechanisms `. The options +and requirements are as follows: + ++----------------------+-----------+-------------------------------------------+ +| Threading Layer Name | Platform | Requirements | ++======================+===========+===========================================+ +| ``tbb`` | All | The ``tbb`` package (``$ conda install | +| | | tbb``) | ++----------------------+-----------+-------------------------------------------+ +| ``omp`` | Linux | GNU OpenMP libraries (very likely this | +| | | will already exist) | +| | | | +| | Windows | MS OpenMP libraries (very likely this will| +| | | already exist) | +| | | | +| | OSX | Either the ``intel-openmp`` package or the| +| | | ``llvm-openmp`` package | +| | | (``conda install`` the package as named). | ++----------------------+-----------+-------------------------------------------+ +| ``workqueue`` | All | None | ++----------------------+-----------+-------------------------------------------+ + +Should the threading layer not load correctly Numba will detect this and provide +a hint about how to resolve the problem. It should also be noted that the Numba +diagnostic command ``numba -s`` has a section +``__Threading Layer Information__`` that reports on the availability of +threading layers in the current environment. 
+ + +Extra notes +----------- +The threading layers have fairly complex interactions with CPython internals and +system level libraries, some additional things to note: + +* The installation of Intel's TBB libraries vastly widens the options available + in the threading layer selection process. +* On Linux, the ``omp`` threading layer is not fork safe due to the GNU OpenMP + runtime library (``libgomp``) not being fork safe. If a fork occurs in a + program that is using the ``omp`` threading layer, a detection mechanism is + present that will try and gracefully terminate the forked child and print an + error message to ``STDERR``. +* On systems with the ``fork(2)`` system call available, if the TBB backed + threading layer is in use and a ``fork`` call is made from a thread other than + the thread that launched TBB (typically the main thread) then this results in + undefined behaviour and a warning will be displayed on ``STDERR``. As + ``spawn`` is essentially ``fork`` followed by ``exec`` it is safe to ``spawn`` + from a non-main thread, but as this cannot be differentiated from just a + ``fork`` call the warning message will still be displayed. +* On OSX, the ``intel-openmp`` package is required to enable the OpenMP based + threading layer. + +.. _setting_the_number_of_threads: + +Setting the Number of Threads +----------------------------- + +The number of threads used by numba is based on the number of CPU cores +available (see :obj:`numba.config.NUMBA_DEFAULT_NUM_THREADS`), but it can be +overridden with the :envvar:`NUMBA_NUM_THREADS` environment variable. + +The total number of threads that numba launches is in the variable +:obj:`numba.config.NUMBA_NUM_THREADS`. + +For some use cases, it may be desirable to set the number of threads to a +lower value, so that numba can be used with higher level parallelism. + +The number of threads can be set dynamically at runtime using +:func:`numba.set_num_threads`. 
Note that :func:`~.set_num_threads` only allows +setting the number of threads to a smaller value than +:obj:`~.NUMBA_NUM_THREADS`. Numba always launches +:obj:`numba.config.NUMBA_NUM_THREADS` threads, but :func:`~.set_num_threads` +causes it to mask out unused threads so they aren't used in computations. + +The current number of threads used by numba can be accessed with +:func:`numba.get_num_threads`. Both functions work inside of a jitted +function. + +.. _numba-threading-layer-thread-masking: + +Example of Limiting the Number of Threads +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this example, suppose the machine we are running on has 8 cores (so +:obj:`numba.config.NUMBA_NUM_THREADS` would be ``8``). Suppose we want to run +some code with ``@njit(parallel=True)``, but we also want to run our code +concurrently in 4 different processes. With the default number of threads, +each Python process would run 8 threads, for a total of 4*8 = 32 threads, +which is oversubscription for our 8 cores. We should rather limit each process +to 2 threads, so that the total will be 4*2 = 8, which matches our number of +physical cores. + +There are two ways to do this. One is to set the :envvar:`NUMBA_NUM_THREADS` +environment variable to ``2``. + +.. code:: bash + + $ NUMBA_NUM_THREADS=2 python ourcode.py + +However, there are two downsides to this approach: + +1. :envvar:`NUMBA_NUM_THREADS` must be set before Numba is imported, and + ideally before Python is launched. As soon as Numba is imported the + environment variable is read and that number of threads is locked in as the + number of threads Numba launches. + +2. If we want to later increase the number of threads used by the process, we + cannot. :envvar:`NUMBA_NUM_THREADS` sets the *maximum* number of threads + that are launched for a process. Calling :func:`~.set_num_threads()` with a + value greater than :obj:`numba.config.NUMBA_NUM_THREADS` results in an + error. 
+ +The advantage of this approach is that we can do it from outside of the +process without changing the code. + +Another approach is to use the :func:`numba.set_num_threads` function in our code: + +.. code:: python + + from numba import njit, set_num_threads + + @njit(parallel=True) + def func(): + ... + + set_num_threads(2) + func() + +If we call ``set_num_threads(2)`` before executing our parallel code, it has +the same effect as calling the process with ``NUMBA_NUM_THREADS=2``, in that +the parallel code will only execute on 2 threads. However, we can later call +``set_num_threads(8)`` to increase the number of threads back to the default +size. And we do not have to worry about setting it before Numba gets imported. +It only needs to be called before the parallel function is run. + +.. _numba-threading-layer-thread-id: + +Getting a Thread ID +------------------- + +In some cases it may be beneficial to have access to a unique identifier for the +current thread that is executing a parallel region in Numba. For that purpose, +Numba provides the :func:`numba.get_thread_id` function. This function is the +counterpart of OpenMP's function ``omp_get_thread_num`` and returns an integer +between 0 (inclusive) and the number of configured threads as described above +(exclusive). + +API Reference +~~~~~~~~~~~~~ + +.. py:data:: numba.config.NUMBA_NUM_THREADS + + The total (maximum) number of threads launched by numba. + + Defaults to :obj:`numba.config.NUMBA_DEFAULT_NUM_THREADS`, but can be + overridden with the :envvar:`NUMBA_NUM_THREADS` environment variable. + +.. py:data:: numba.config.NUMBA_DEFAULT_NUM_THREADS + + The number of usable CPU cores on the system (as determined by + ``len(os.sched_getaffinity(0))``, if supported by the OS, or + ``multiprocessing.cpu_count()`` if not). + This is the default value for :obj:`numba.config.NUMBA_NUM_THREADS` unless + the :envvar:`NUMBA_NUM_THREADS` environment variable is set. + +.. autofunction:: numba.set_num_threads + +.. 
autofunction:: numba.get_num_threads + +.. autofunction:: numba.get_thread_id diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/troubleshoot.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/troubleshoot.rst new file mode 100644 index 0000000000000000000000000000000000000000..b921443460b2d620d5e9781227596040bfe79d15 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/troubleshoot.rst @@ -0,0 +1,1179 @@ + +.. _numba-troubleshooting: + +======================== +Troubleshooting and tips +======================== + +.. _what-to-compile: + +What to compile +=============== + +The general recommendation is that you should only try to compile the +critical paths in your code. If you have a piece of performance-critical +computational code amongst some higher-level code, you may factor out +the performance-critical code in a separate function and compile the +separate function with Numba. Letting Numba focus on that small piece +of performance-critical code has several advantages: + +* it reduces the risk of hitting unsupported features; +* it reduces the compilation times; +* it allows you to evolve the higher-level code which is outside of the + compiled function much more easily. + +.. _code-doesnt-compile: + +My code doesn't compile +======================= + +There can be various reasons why Numba cannot compile your code, and raises +an error instead. One common reason is that your code relies on an +unsupported Python feature, especially in :term:`nopython mode`. +Please see the list of :ref:`pysupported`. If you find something that +is listed there and still fails compiling, please +:ref:`report a bug `. + +When Numba tries to compile your code it first tries to work out the types of +all the variables in use, this is so it can generate a type specific +implementation of your code that can be compiled down to machine code. 
A common +reason for Numba failing to compile (especially in :term:`nopython mode`) is a +type inference failure, essentially Numba cannot work out what the type of all +the variables in your code should be. + +For example, let's consider this trivial function:: + + @jit(nopython=True) + def f(x, y): + return x + y + +If you call it with two numbers, Numba is able to infer the types properly:: + + >>> f(1, 2) + 3 + +If however you call it with a tuple and a number, Numba is unable to say +what the result of adding a tuple and number is, and therefore compilation +errors out:: + + >>> f(1, (2,)) + Traceback (most recent call last): + File "", line 1, in + File "/numba/numba/dispatcher.py", line 339, in _compile_for_args + reraise(type(e), e, None) + File "/numba/numba/six.py", line 658, in reraise + raise value.with_traceback(tb) + numba.errors.TypingError: Failed at nopython (nopython frontend) + Invalid use of + with parameters (int64, tuple(int64 x 1)) + Known signatures: + * (int64, int64) -> int64 + * (int64, uint64) -> int64 + * (uint64, int64) -> int64 + * (uint64, uint64) -> uint64 + * (float32, float32) -> float32 + * (float64, float64) -> float64 + * (complex64, complex64) -> complex64 + * (complex128, complex128) -> complex128 + * (uint16,) -> uint64 + * (uint8,) -> uint64 + * (uint64,) -> uint64 + * (uint32,) -> uint64 + * (int16,) -> int64 + * (int64,) -> int64 + * (int8,) -> int64 + * (int32,) -> int64 + * (float32,) -> float32 + * (float64,) -> float64 + * (complex64,) -> complex64 + * (complex128,) -> complex128 + * parameterized + [1] During: typing of intrinsic-call at (3) + + File "", line 3: + +The error message helps you find out what went wrong: +"Invalid use of + with parameters (int64, tuple(int64 x 1))" is to be +interpreted as "Numba encountered an addition of variables typed as integer +and 1-tuple of integer, respectively, and doesn't know about any such +operation". 
+ +Note that if you allow object mode:: + + @jit + def g(x, y): + return x + y + +compilation will succeed and the compiled function will raise at runtime as +Python would do:: + + >>> g(1, (2,)) + Traceback (most recent call last): + File "", line 1, in + TypeError: unsupported operand type(s) for +: 'int' and 'tuple' + + +My code has a type unification problem +====================================== + +Another common reason for Numba not being able to compile your code is that it +cannot statically determine the return type of a function. The most likely +cause of this is the return type depending on a value that is available only at +runtime. Again, this is most often problematic when using +:term:`nopython mode`. The concept of type unification is simply trying to find +a type in which two variables could safely be represented. For example a 64 bit +float and a 64 bit complex number could both be represented in a 128 bit complex +number. + +As an example of type unification failure, this function has a return type that +is determined at runtime based on the value of `x`:: + + In [1]: from numba import jit + + In [2]: @jit(nopython=True) + ...: def f(x): + ...: if x > 10: + ...: return (1,) + ...: else: + ...: return 1 + ...: + + In [3]: f(10) + +Trying to execute this function, errors out as follows:: + + TypingError: Failed at nopython (nopython frontend) + Can't unify return type from the following types: tuple(int64 x 1), int64 + Return of: IR name '$8.2', type '(int64 x 1)', location: + File "", line 4: + def f(x): + + if x > 10: + return (1,) + ^ + Return of: IR name '$12.2', type 'int64', location: + File "", line 6: + def f(x): + + else: + return 1 + +The error message "Can't unify return type from the following types: +tuple(int64 x 1), int64" should be read as "Numba cannot find a type that +can safely represent a 1-tuple of integer and an integer". + +.. 
_code-has-untyped-list: + +My code has an untyped list problem +=================================== + +As :ref:`noted previously ` the first part of Numba +compiling your code involves working out what the types of all the variables +are. In the case of lists, a list must contain items that are of the same type +or can be empty if the type can be inferred from some later operation. What is +not possible is to have a list which is defined as empty and has no inferable +type (i.e. an untyped list). + +For example, this is using a list of a known type:: + + from numba import jit + @jit(nopython=True) + def f(): + return [1, 2, 3] # this list is defined on construction with `int` type + +This is using an empty list, but the type can be inferred:: + + from numba import jit + @jit(nopython=True) + def f(x): + tmp = [] # defined empty + for i in range(x): + tmp.append(i) # list type can be inferred from the type of `i` + return tmp + +This is using an empty list and the type cannot be inferred:: + + from numba import jit + @jit(nopython=True) + def f(x): + tmp = [] # defined empty + return (tmp, x) # ERROR: the type of `tmp` is unknown + +Whilst slightly contrived, if you need an empty list and the type cannot be +inferred but you know what type you want the list to be, this "trick" can be +used to instruct the typing mechanism:: + + from numba import jit + import numpy as np + @jit(nopython=True) + def f(x): + # define empty list, but instruct that the type is np.complex64 + tmp = [np.complex64(x) for x in range(0)] + return (tmp, x) # the type of `tmp` is known, but it is still empty + +The compiled code is too slow +============================= + +The most common reason for slowness of a compiled JIT function is that +compiling in :term:`nopython mode` has failed and the Numba compiler has +fallen back to :term:`object mode`. 
:term:`object mode` currently provides +little to no speedup compared to regular Python interpretation, and its +main point is to allow an internal optimization known as +:term:`loop-lifting`: this optimization will allow to compile inner +loops in :term:`nopython mode` regardless of what code surrounds those +inner loops. + +To find out if type inference succeeded on your function, you can use +the :meth:`~Dispatcher.inspect_types` method on the compiled function. + +For example, let's take the following function:: + + @jit + def f(a, b): + s = a + float(b) + return s + +When called with numbers, this function should be fast as Numba is able +to convert number types to floating-point numbers. Let's see:: + + >>> f(1, 2) + 3.0 + >>> f.inspect_types() + f (int64, int64) + -------------------------------------------------------------------------------- + # --- LINE 7 --- + + @jit + + # --- LINE 8 --- + + def f(a, b): + + # --- LINE 9 --- + # label 0 + # a.1 = a :: int64 + # del a + # b.1 = b :: int64 + # del b + # $0.2 = global(float: ) :: Function() + # $0.4 = call $0.2(b.1, ) :: (int64,) -> float64 + # del b.1 + # del $0.2 + # $0.5 = a.1 + $0.4 :: float64 + # del a.1 + # del $0.4 + # s = $0.5 :: float64 + # del $0.5 + + s = a + float(b) + + # --- LINE 10 --- + # $0.7 = cast(value=s) :: float64 + # del s + # return $0.7 + + return s + +Without trying to understand too much of the Numba intermediate representation, +it is still visible that all variables and temporary values have had their +types inferred properly: for example *a* has the type ``int64``, *$0.5* has +the type ``float64``, etc. + +However, if *b* is passed as a string, compilation will fall back on object +mode as the float() constructor with a string is currently not supported +by Numba:: + + >>> f(1, "2") + 3.0 + >>> f.inspect_types() + [... snip annotations for other signatures, see above ...] 
+ ================================================================================ + f (int64, str) + -------------------------------------------------------------------------------- + # --- LINE 7 --- + + @jit + + # --- LINE 8 --- + + def f(a, b): + + # --- LINE 9 --- + # label 0 + # a.1 = a :: pyobject + # del a + # b.1 = b :: pyobject + # del b + # $0.2 = global(float: ) :: pyobject + # $0.4 = call $0.2(b.1, ) :: pyobject + # del b.1 + # del $0.2 + # $0.5 = a.1 + $0.4 :: pyobject + # del a.1 + # del $0.4 + # s = $0.5 :: pyobject + # del $0.5 + + s = a + float(b) + + # --- LINE 10 --- + # $0.7 = cast(value=s) :: pyobject + # del s + # return $0.7 + + return s + +Here we see that all variables end up typed as ``pyobject``. This means +that the function was compiled in object mode and values are passed +around as generic Python objects, without Numba trying to look into them +to reason about their raw values. This is a situation you want to avoid +when caring about the speed of your code. + +If a function fails to compile in ``nopython`` mode warnings will be emitted +with explanation as to why compilation failed. For example with the ``f()`` +function above (slightly edited for documentation purposes):: + + >>> f(1, 2) + 3.0 + >>> f(1, "2") + example.py:7: NumbaWarning: + Compilation is falling back to object mode WITH looplifting enabled because Function "f" failed type inference due to: Invalid use of Function() with argument(s) of type(s): (unicode_type) + * parameterized + In definition 0: + TypeError: float() only support for numbers + raised from /numba/typing/builtins.py:880 + In definition 1: + TypeError: float() only support for numbers + raised from /numba/typing/builtins.py:880 + This error is usually caused by passing an argument of a type that is unsupported by the named function. 
+ [1] During: resolving callee type: Function() + [2] During: typing of call at example.py (9) + + + File "example.py", line 9: + def f(a, b): + s = a + float(b) + ^ + + /numba/compiler.py:722: NumbaWarning: Function "f" was compiled in object mode without forceobj=True. + + File "example.py", line 8: + @jit + def f(a, b): + ^ + + 3.0 + + +Disabling JIT compilation +========================= + +In order to debug code, it is possible to disable JIT compilation, which makes +the ``jit`` decorator (and the ``njit`` decorator) act as if +they perform no operation, and the invocation of decorated functions calls the +original Python function instead of a compiled version. This can be toggled by +setting the :envvar:`NUMBA_DISABLE_JIT` environment variable to ``1``. + +When this mode is enabled, the ``vectorize`` and ``guvectorize`` decorators will +still result in compilation of a ufunc, as there is no straightforward pure +Python implementation of these functions. + + +.. _debugging-jit-compiled-code: + +Debugging JIT compiled code with GDB +==================================== + +Setting the ``debug`` keyword argument in the ``jit`` decorator +(e.g. ``@jit(debug=True)``) enables the emission of debug info in the jitted +code. To debug, GDB version 7.0 or above is required. Currently, the following +debug info is available: + +* Function name will be shown in the backtrace along with type information and + values (if available). +* Source location (filename and line number) is available. For example, + users can set a break point by the absolute filename and line number; + e.g. ``break /path/to/myfile.py:6``. +* Arguments to the current function can be shown with ``info args``. +* Local variables in the current function can be shown with ``info locals``. +* The type of variables can be shown with ``whatis myvar``. +* The value of variables can be shown with ``print myvar`` or ``display myvar``. + + * Simple numeric types, i.e.
int, float and double, are shown in their + native representation. + * Other types are shown as a structure based on Numba's memory model + representation of the type. + +Further, the Numba ``gdb`` printing extension can be loaded into ``gdb`` (if the +``gdb`` has Python support) to permit the printing of variables as they would be +in native Python. The extension does this by reinterpreting Numba's memory model +representations as Python types. Information about the ``gdb`` installation that +Numba is using, including the path to load the ``gdb`` printing extension, can +be displayed by using the ``numba -g`` command. For best results ensure that the +Python that ``gdb`` is using has a NumPy module accessible. An example output +of the ``gdb`` information follows: + +.. code-block:: none + :emphasize-lines: 1 + + $ numba -g + GDB info: + -------------------------------------------------------------------------------- + Binary location : /gdb + Print extension location : /numba/misc/gdb_print_extension.py + Python version : 3.8 + NumPy version : 1.20.0 + Numba printing extension supported : True + + To load the Numba gdb printing extension, execute the following from the gdb prompt: + + source /numba/misc/gdb_print_extension.py + + -------------------------------------------------------------------------------- + +Known issues: + +* Stepping depends heavily on optimization level. At full optimization + (equivalent to O3), most of the variables are optimized out. It is often + beneficial to use the jit option ``_dbg_optnone=True`` + or the environment variable :envvar:`NUMBA_OPT` to adjust the + optimization level and the jit option ``_dbg_extend_lifetimes=True`` + (which is on by default if ``debug=True``) or + :envvar:`NUMBA_EXTEND_VARIABLE_LIFETIMES` to extend + the lifetime of variables to the end of their scope so as to get a debugging + experience closer to the semantics of Python execution. 
+ +* Memory consumption increases significantly with debug info enabled. + The compiler emits extra information (`DWARF <http://www.dwarfstd.org/>`_) + along with the instructions. The emitted object code can be 2x bigger with + debug info. + +Internal details: + +* Since Python semantics allow variables to bind to values of different types, + Numba internally creates multiple versions of the variable for each type. + So for code like:: + + x = 1 # type int + x = 2.3 # type float + x = (1, 2, 3) # type 3-tuple of int + + Each assignment will store to a different variable name. In the debugger, + the variables will be ``x``, ``x$1`` and ``x$2``. (In the Numba IR, they are + ``x``, ``x.1`` and ``x.2``.) + +* When debug is enabled, inlining of functions at LLVM IR level is disabled. + +JIT options for debug +--------------------- + +* ``debug`` (bool). Set to ``True`` to enable debug info. Defaults to ``False``. +* ``_dbg_optnone`` (bool). Set to ``True`` to disable all LLVM optimization passes + on the function. Defaults to ``False``. See :envvar:`NUMBA_OPT` for a global setting + to disable optimization. +* ``_dbg_extend_lifetimes`` (bool). Set to ``True`` to extend the lifetime of + objects such that they more closely follow the semantics of Python. + Automatically set to ``True`` when + ``debug=True``; otherwise, defaults to ``False``. Users can explicitly set this option + to ``False`` to retain the normal execution semantics of compiled code. + See :envvar:`NUMBA_EXTEND_VARIABLE_LIFETIMES` for a global option to extend object + lifetimes. + +Example debug usage +------------------- + +The Python source: + +.. code-block:: python + :linenos: + + from numba import njit + + @njit(debug=True) + def foo(a): + b = a + 1 + c = a * 2.34 + d = (a, b, c) + print(a, b, c, d) + + r = foo(123) + print(r) + +In the terminal: + +..
code-block:: none + :emphasize-lines: 1, 3, 7, 12, 14, 16, 20, 22, 26, 28, 30, 32, 34, 36 + + $ NUMBA_OPT=0 NUMBA_EXTEND_VARIABLE_LIFETIMES=1 gdb -q python + Reading symbols from python... + (gdb) break test1.py:5 + No source file named test1.py. + Make breakpoint pending on future shared library load? (y or [n]) y + Breakpoint 1 (test1.py:5) pending. + (gdb) run test1.py + Starting program: /bin/python test1.py + ... + Breakpoint 1, __main__::foo_241[abi:c8tJTC_2fWgEeGLSgydRTQUgiqKEZ6gEoDvQJmaQIA](long long) (a=123) at test1.py:5 + 5 b = a + 1 + (gdb) info args + a = 123 + (gdb) n + 6 c = a * 2.34 + (gdb) info locals + b = 124 + c = 0 + d = {f0 = 0, f1 = 0, f2 = 0} + (gdb) n + 7 d = (a, b, c) + (gdb) info locals + b = 124 + c = 287.81999999999999 + d = {f0 = 0, f1 = 0, f2 = 0} + (gdb) whatis b + type = int64 + (gdb) whatis d + type = Tuple(int64, int64, float64) ({i64, i64, double}) + (gdb) n + 8 print(a, b, c, d) + (gdb) print b + $1 = 124 + (gdb) print d + $2 = {f0 = 123, f1 = 124, f2 = 287.81999999999999} + (gdb) bt + #0 __main__::foo_241[abi:c8tJTC_2fWgEeGLSgydRTQUgiqKEZ6gEoDvQJmaQIA](long long) (a=123) at test1.py:8 + #1 0x00007ffff06439fa in cpython::__main__::foo_241[abi:c8tJTC_2fWgEeGLSgydRTQUgiqKEZ6gEoDvQJmaQIA](long long) () + + +Another example follows that makes use of the Numba ``gdb`` printing extension +mentioned above, note the change in the print format once the extension is +loaded with ``source`` : + +The Python source: + +.. code-block:: python + :linenos: + + from numba import njit + import numpy as np + + @njit(debug=True) + def foo(n): + x = np.arange(n) + y = (x[0], x[-1]) + return x, y + + foo(4) + +In the terminal: + +.. code-block:: none + :emphasize-lines: 1, 3, 4, 7, 12, 14, 16, 17, 20 + + $ NUMBA_OPT=0 NUMBA_EXTEND_VARIABLE_LIFETIMES=1 gdb -q python + Reading symbols from python... + (gdb) set breakpoint pending on + (gdb) break test2.py:8 + No source file named test2.py. + Breakpoint 1 (test2.py:8) pending. 
+ (gdb) run test2.py + Starting program: /bin/python test2.py + ... + Breakpoint 1, __main__::foo_241[abi:c8tJTC_2fWgEeGLSgydRTQUgiqKEZ6gEoDvQJmaQIA](long long) (n=4) at test2.py:8 + 8 return x, y + (gdb) print x + $1 = {meminfo = 0x55555688f470 "\001", parent = 0x0, nitems = 4, itemsize = 8, data = 0x55555688f4a0, shape = {4}, strides = {8}} + (gdb) print y + $2 = {0, 3} + (gdb) source numba/misc/gdb_print_extension.py + (gdb) print x + $3 = + [0 1 2 3] + (gdb) print y + $4 = (0, 3) + + + +Globally override debug setting +------------------------------- + +It is possible to enable debug for the full application by setting the environment +variable ``NUMBA_DEBUGINFO=1``. This sets the default value of the ``debug`` +option in ``jit``. Debug can be turned off on individual functions by setting +``debug=False``. + +Beware that enabling debug info significantly increases the memory consumption +for each compiled function. For large applications, this may cause out-of-memory +errors. + +Using Numba's direct ``gdb`` bindings in ``nopython`` mode +=========================================================== + +Numba (version 0.42.0 and later) has some additional functions relating to +``gdb`` support for CPUs that make it easier to debug programs. All the ``gdb`` +related functions described in the following work in the same manner +irrespective of whether they are called from the standard CPython interpreter or +code compiled in either :term:`nopython mode` or :term:`object mode`. + +.. note:: This feature is experimental! + +.. warning:: This feature does unexpected things if used from Jupyter or + alongside the ``pdb`` module. Its behaviour is harmless, just hard + to predict! + +Set up +------ +Numba's ``gdb`` related functions make use of a ``gdb`` binary, the location and +name of this binary can be configured via the :envvar:`NUMBA_GDB_BINARY` +environment variable if desired. + +..
note:: Numba's ``gdb`` support requires the ability for ``gdb`` to attach to + another process. On some systems (notably Ubuntu Linux) default + security restrictions placed on ``ptrace`` prevent this from being + possible. This restriction is enforced at the system level by the + Linux security module `Yama`. Documentation for this module and the + security implications of making changes to its behaviour can be found + in the `Linux Kernel documentation <https://www.kernel.org/doc/Documentation/admin-guide/LSM/Yama.rst>`_. + The `Ubuntu Linux security documentation <https://wiki.ubuntu.com/Security/Features#ptrace>`_ + discusses how to adjust the behaviour of `Yama` with regard to + ``ptrace_scope`` so as to permit the required behaviour. + +Basic ``gdb`` support +--------------------- + +.. warning:: Calling :func:`numba.gdb` and/or :func:`numba.gdb_init` more than + once in the same program is not advisable; unexpected things may + happen. If multiple breakpoints are desired within a program, + launch ``gdb`` once via :func:`numba.gdb` or :func:`numba.gdb_init` + and then use :func:`numba.gdb_breakpoint` to register additional + breakpoint locations. + +The simplest function for adding ``gdb`` support is :func:`numba.gdb`, which, +at the call location, will: + +* launch ``gdb`` and attach it to the running process. +* create a breakpoint at the site of the :func:`numba.gdb()` function call, the + attached ``gdb`` will pause execution here awaiting user input. + +Use of this functionality is best motivated by example, continuing with the +example used above: + +.. code-block:: python + :linenos: + + from numba import njit, gdb + + @njit(debug=True) + def foo(a): + b = a + 1 + gdb() # instruct Numba to attach gdb at this location and pause execution + c = a * 2.34 + d = (a, b, c) + print(a, b, c, d) + + r= foo(123) + print(r) + +In the terminal (``...`` on a line by itself indicates output that is not +presented for brevity): + +..
code-block:: none + :emphasize-lines: 1, 4, 8, 13, 24, 26, 28, 30, 32, 37 + + $ NUMBA_OPT=0 NUMBA_EXTEND_VARIABLE_LIFETIMES=1 python demo_gdb.py + ... + Breakpoint 1, 0x00007fb75238d830 in numba_gdb_breakpoint () from numba/_helperlib.cpython-39-x86_64-linux-gnu.so + (gdb) s + Single stepping until exit from function numba_gdb_breakpoint, + which has no line number information. + 0x00007fb75233e1cf in numba::misc::gdb_hook::hook_gdb::_3clocals_3e::impl_242[abi:c8tJTIeFCjyCbUFRqqOAK_2f6h0phxApMogijRBAA_3d](StarArgTuple) () + (gdb) s + Single stepping until exit from function _ZN5numba4misc8gdb_hook8hook_gdb12_3clocals_3e8impl_242B44c8tJTIeFCjyCbUFRqqOAK_2f6h0phxApMogijRBAA_3dE12StarArgTuple, + which has no line number information. + __main__::foo_241[abi:c8tJTC_2fWgEeGLSgydRTQUgiqKEZ6gEoDvQJmaQIA](long long) (a=123) at demo_gdb.py:7 + 7 c = a * 2.34 + (gdb) l + 2 + 3 @njit(debug=True) + 4 def foo(a): + 5 b = a + 1 + 6 gdb() # instruct Numba to attach gdb at this location and pause execution + 7 c = a * 2.34 + 8 d = (a, b, c) + 9 print(a, b, c, d) + 10 + 11 r= foo(123) + (gdb) p a + $1 = 123 + (gdb) p b + $2 = 124 + (gdb) p c + $3 = 0 + (gdb) b 9 + Breakpoint 2 at 0x7fb73d1f7287: file demo_gdb.py, line 9. + (gdb) c + Continuing. + + Breakpoint 2, __main__::foo_241[abi:c8tJTC_2fWgEeGLSgydRTQUgiqKEZ6gEoDvQJmaQIA](long long) (a=123) at demo_gdb.py:9 + 9 print(a, b, c, d) + (gdb) info locals + b = 124 + c = 287.81999999999999 + d = {f0 = 123, f1 = 124, f2 = 287.81999999999999} + + +It can be seen in the above example that execution of the code is paused at the +location of the ``gdb()`` function call at end of the ``numba_gdb_breakpoint`` +function (this is the Numba internal symbol registered as breakpoint with +``gdb``). Issuing a ``step`` twice at this point moves to the stack frame of the +compiled Python source. 
From there, it can be seen that the variables ``a`` and +``b`` have been evaluated but ``c`` has not, as demonstrated by printing their +values, this is precisely as expected given the location of the ``gdb()`` call. +Issuing a ``break`` on line 9 and then continuing execution leads to the +evaluation of line ``7``. The variable ``c`` is assigned a value as a result of +the execution and this can be seen in output of ``info locals`` when the +breakpoint is hit. + +Running with ``gdb`` enabled +---------------------------- + +The functionality provided by :func:`numba.gdb` (launch and attach ``gdb`` to +the executing process and pause on a breakpoint) is also available as two +separate functions: + +* :func:`numba.gdb_init` this function injects code at the call site to launch + and attach ``gdb`` to the executing process but does not pause execution. +* :func:`numba.gdb_breakpoint` this function injects code at the call site that + will call the special ``numba_gdb_breakpoint`` function that is registered as + a breakpoint in Numba's ``gdb`` support. This is demonstrated in the next + section. + +This functionality enables more complex debugging capabilities. Again, motivated +by example, debugging a 'segfault' (memory access violation signalling +``SIGSEGV``): + +.. code-block:: python + :linenos: + + from numba import njit, gdb_init + import numpy as np + + # NOTE debug=True switches bounds-checking on, but for the purposes of this + # example it is explicitly turned off so that the out of bounds index is + # not caught! 
+ @njit(debug=True, boundscheck=False) + def foo(a, index): + gdb_init() # instruct Numba to attach gdb at this location, but not to pause execution + b = a + 1 + c = a * 2.34 + d = c[index] # access an address that is a) invalid b) out of the page + print(a, b, c, d) + + bad_index = int(1e9) # this index is invalid + z = np.arange(10) + r = foo(z, bad_index) + print(r) + +In the terminal (``...`` on a line by itself indicates output that is not +presented for brevity): + +.. code-block:: none + :emphasize-lines: 1, 6, 8, 10, 12 + + $ NUMBA_OPT=0 python demo_gdb_segfault.py + ... + Program received signal SIGSEGV, Segmentation fault. + 0x00007f5a4ca655eb in __main__::foo_241[abi:c8tJTC_2fWgEeGLSgydRTQUgiqKEZ6gEoDvQJmaQIA](Array, long long) (a=..., index=1000000000) at demo_gdb_segfault.py:12 + 12 d = c[index] # access an address that is a) invalid b) out of the page + (gdb) p index + $1 = 1000000000 + (gdb) p c + $2 = {meminfo = 0x5586cfb95830 "\001", parent = 0x0, nitems = 10, itemsize = 8, data = 0x5586cfb95860, shape = {10}, strides = {8}} + (gdb) whatis c + type = array(float64, 1d, C) ({i8*, i8*, i64, i64, double*, [1 x i64], [1 x i64]}) + (gdb) p c.nitems + $3 = 10 + +In the ``gdb`` output it can be noted that a ``SIGSEGV`` signal was caught, and +the line in which the access violation occurred is printed. + +Continuing the example as a debugging session demonstration, first ``index`` +can be printed, and it is evidently 1e9. Printing ``c`` shows that it is a +structure, so the type needs looking up and it can be seen that it is an +``array(float64, 1d, C)`` type. Given the segfault came from an invalid access +it would be informative to check the number of items in the array and compare +that to the index requested. Inspecting the ``nitems`` member of the structure +``c`` shows 10 items. It's therefore clear that the segfault comes from an +invalid access of index ``1000000000`` in an array containing ``10`` items.
+ +Adding breakpoints to code +-------------------------- + +The next example demonstrates using multiple breakpoints that are defined +through the invocation of the :func:`numba.gdb_breakpoint` function: + +.. code-block:: python + :linenos: + + from numba import njit, gdb_init, gdb_breakpoint + + @njit(debug=True) + def foo(a): + gdb_init() # instruct Numba to attach gdb at this location + b = a + 1 + gdb_breakpoint() # instruct gdb to break at this location + c = a * 2.34 + d = (a, b, c) + gdb_breakpoint() # and to break again at this location + print(a, b, c, d) + + r= foo(123) + print(r) + +In the terminal (``...`` on a line by itself indicates output that is not +presented for brevity): + +.. code-block:: none + :emphasize-lines: 1, 4, 9, 20, 22, 24, 29, 31 + + $ NUMBA_OPT=0 python demo_gdb_breakpoints.py + ... + Breakpoint 1, 0x00007fb65bb4c830 in numba_gdb_breakpoint () from numba/_helperlib.cpython-39-x86_64-linux-gnu.so + (gdb) step + Single stepping until exit from function numba_gdb_breakpoint, + which has no line number information. + __main__::foo_241[abi:c8tJTC_2fWgEeGLSgydRTQUgiqKEZ6gEoDvQJmaQIA](long long) (a=123) at demo_gdb_breakpoints.py:8 + 8 c = a * 2.34 + (gdb) l + 3 @njit(debug=True) + 4 def foo(a): + 5 gdb_init() # instruct Numba to attach gdb at this location + 6 b = a + 1 + 7 gdb_breakpoint() # instruct gdb to break at this location + 8 c = a * 2.34 + 9 d = (a, b, c) + 10 gdb_breakpoint() # and to break again at this location + 11 print(a, b, c, d) + 12 + (gdb) p b + $1 = 124 + (gdb) p c + $2 = 0 + (gdb) c + Continuing. + + Breakpoint 1, 0x00007fb65bb4c830 in numba_gdb_breakpoint () + from numba/_helperlib.cpython-39-x86_64-linux-gnu.so + (gdb) step + 11 print(a, b, c, d) + (gdb) p c + $3 = 287.81999999999999 + +From the ``gdb`` output it can be seen that execution paused at line 8 as a +breakpoint was hit, and after a ``continue`` was issued, it broke again at line +11 where the next breakpoint was hit. 
+ +Debugging in parallel regions +----------------------------- + +The following example is quite involved: it executes with ``gdb`` instrumentation +from the outset as per the example above, but it also uses threads and makes use +of the breakpoint functionality. Further, the last iteration of the parallel +section calls the function ``work``, which is actually just a binding to +``glibc``'s ``free(3)`` in this case, but could equally be some involved +function that is presenting a segfault for unknown reasons. + +.. code-block:: python + :linenos: + + from numba import njit, prange, gdb_init, gdb_breakpoint + import ctypes + + def get_free(): + lib = ctypes.cdll.LoadLibrary('libc.so.6') + free_binding = lib.free + free_binding.argtypes = [ctypes.c_void_p,] + free_binding.restype = None + return free_binding + + work = get_free() + + @njit(debug=True, parallel=True) + def foo(): + gdb_init() # instruct Numba to attach gdb at this location, but not to pause execution + counter = 0 + n = 9 + for i in prange(n): + if i > 3 and i < 8: # iterations 4, 5, 6, 7 will break here + gdb_breakpoint() + + if i == 8: # last iteration segfaults + work(0xBADADD) + + counter += 1 + return counter + + r = foo() + print(r) + +In the terminal (``...`` on a line by itself indicates output that is not +presented for brevity), note the setting of ``NUMBA_NUM_THREADS`` to 4 to ensure +that there are 4 threads running in the parallel section: + +.. code-block:: none + :emphasize-lines: 1, 19, 29, 44, 50, 56, 62, 69 + + $ NUMBA_NUM_THREADS=4 NUMBA_OPT=0 python demo_gdb_threads.py + Attaching to PID: 21462 + ... + Attaching to process 21462 + [New LWP 21467] + [New LWP 21468] + [New LWP 21469] + [New LWP 21470] + [Thread debugging using libthread_db enabled] + Using host libthread_db library "/lib64/libthread_db.so.1".
+ 0x00007f59ec31756d in nanosleep () at ../sysdeps/unix/syscall-template.S:81 + 81 T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS) + Breakpoint 1 at 0x7f59d631e8f0: file numba/_helperlib.c, line 1090. + Continuing. + [Switching to Thread 0x7f59d1fd1700 (LWP 21470)] + + Thread 5 "python" hit Breakpoint 1, numba_gdb_breakpoint () at numba/_helperlib.c:1090 + 1090 } + (gdb) info threads + Id Target Id Frame + 1 Thread 0x7f59eca2f740 (LWP 21462) "python" pthread_cond_wait@@GLIBC_2.3.2 () + at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 + 2 Thread 0x7f59d37d4700 (LWP 21467) "python" pthread_cond_wait@@GLIBC_2.3.2 () + at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 + 3 Thread 0x7f59d2fd3700 (LWP 21468) "python" pthread_cond_wait@@GLIBC_2.3.2 () + at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 + 4 Thread 0x7f59d27d2700 (LWP 21469) "python" numba_gdb_breakpoint () at numba/_helperlib.c:1090 + * 5 Thread 0x7f59d1fd1700 (LWP 21470) "python" numba_gdb_breakpoint () at numba/_helperlib.c:1090 + (gdb) thread apply 2-5 info locals + + Thread 2 (Thread 0x7f59d37d4700 (LWP 21467)): + No locals. + + Thread 3 (Thread 0x7f59d2fd3700 (LWP 21468)): + No locals. + + Thread 4 (Thread 0x7f59d27d2700 (LWP 21469)): + No locals. + + Thread 5 (Thread 0x7f59d1fd1700 (LWP 21470)): + sched$35 = '\000' + counter__arr = '\000' , "\001\000\000\000\000\000\000\000\b\000\000\000\000\000\000\000\370B]\"hU\000\000\001", '\000' + counter = 0 + (gdb) continue + Continuing. + [Switching to Thread 0x7f59d27d2700 (LWP 21469)] + + Thread 4 "python" hit Breakpoint 1, numba_gdb_breakpoint () at numba/_helperlib.c:1090 + 1090 } + (gdb) continue + Continuing. + [Switching to Thread 0x7f59d1fd1700 (LWP 21470)] + + Thread 5 "python" hit Breakpoint 1, numba_gdb_breakpoint () at numba/_helperlib.c:1090 + 1090 } + (gdb) continue + Continuing. 
+ [Switching to Thread 0x7f59d27d2700 (LWP 21469)] + + Thread 4 "python" hit Breakpoint 1, numba_gdb_breakpoint () at numba/_helperlib.c:1090 + 1090 } + (gdb) continue + Continuing. + + Thread 5 "python" received signal SIGSEGV, Segmentation fault. + [Switching to Thread 0x7f59d1fd1700 (LWP 21470)] + __GI___libc_free (mem=0xbadadd) at malloc.c:2935 + 2935 if (chunk_is_mmapped(p)) /* release mmapped memory. */ + (gdb) bt + #0 __GI___libc_free (mem=0xbadadd) at malloc.c:2935 + #1 0x00007f59d37ded84 in $3cdynamic$3e::__numba_parfor_gufunc__0x7ffff80a61ae3e31$244(Array, Array) () at :24 + #2 0x00007f59d17ce326 in __gufunc__._ZN13$3cdynamic$3e45__numba_parfor_gufunc__0x7ffff80a61ae3e31$244E5ArrayIyLi1E1C7mutable7alignedE5ArrayIxLi1E1C7mutable7alignedE () + #3 0x00007f59d37d7320 in thread_worker () + from /numba/numba/npyufunc/workqueue.cpython-37m-x86_64-linux-gnu.so + #4 0x00007f59ec626e25 in start_thread (arg=0x7f59d1fd1700) at pthread_create.c:308 + #5 0x00007f59ec350bad in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:113 + +In the output it can be seen that there are 4 threads launched and that they all +break at the breakpoint, further that ``Thread 5`` receives a signal ``SIGSEGV`` +and that back tracing shows that it came from ``__GI___libc_free`` with the +invalid address in ``mem``, as expected. + +Using the ``gdb`` command language +---------------------------------- +Both the :func:`numba.gdb` and :func:`numba.gdb_init` functions accept unlimited +string arguments which will be passed directly to ``gdb`` as command line +arguments when it initializes, this makes it easy to set breakpoints on other +functions and perform repeated debugging tasks without having to manually type +them every time. For example, this code runs with ``gdb`` attached and sets a +breakpoint on ``dgesdd_`` (say for example the arguments passed to LAPACK's +double precision divide and conquer SVD function need debugging). + +..
code-block:: python + :linenos: + + from numba import njit, gdb + import numpy as np + + @njit(debug=True) + def foo(a): + # instruct Numba to attach gdb at this location and on launch, switch + # breakpoint pending on , and then set a breakpoint on the function + # _dgesdd, continue execution, and once the breakpoint is hit, backtrace + gdb('-ex', 'set breakpoint pending on', + '-ex', 'b dgesdd_', + '-ex','c', + '-ex','bt') + b = a + 10 + u, s, vh = np.linalg.svd(b) + return s # just return singular values + + z = np.arange(70.).reshape(10, 7) + r = foo(z) + print(r) + +In the terminal (``...`` on a line by itself indicates output that is not +presented for brevity), note that no interaction is required to break and +backtrace: + +.. code-block:: none + :emphasize-lines: 1 + + $ NUMBA_OPT=0 python demo_gdb_args.py + Attaching to PID: 22300 + GNU gdb (GDB) Red Hat Enterprise Linux 8.0.1-36.el7 + ... + Attaching to process 22300 + Reading symbols from /bin/python3.7...done. + 0x00007f652305a550 in __nanosleep_nocancel () at ../sysdeps/unix/syscall-template.S:81 + 81 T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS) + Breakpoint 1 at 0x7f650d0618f0: file numba/_helperlib.c, line 1090. + Continuing. + + Breakpoint 1, numba_gdb_breakpoint () at numba/_helperlib.c:1090 + 1090 } + Breakpoint 2 at 0x7f65102322e0 (2 locations) + Continuing. 
+ + Breakpoint 2, 0x00007f65182be5f0 in mkl_lapack.dgesdd_ () + from /lib/python3.7/site-packages/numpy/core/../../../../libmkl_rt.so + #0 0x00007f65182be5f0 in mkl_lapack.dgesdd_ () + from /lib/python3.7/site-packages/numpy/core/../../../../libmkl_rt.so + #1 0x00007f650d065b71 in numba_raw_rgesdd (kind=kind@entry=100 'd', jobz=, jobz@entry=65 'A', m=m@entry=10, + n=n@entry=7, a=a@entry=0x561c6fbb20c0, lda=lda@entry=10, s=0x561c6facf3a0, u=0x561c6fb680e0, ldu=10, vt=0x561c6fd375c0, + ldvt=7, work=0x7fff4c926c30, lwork=-1, iwork=0x7fff4c926c40, info=0x7fff4c926c20) at numba/_lapack.c:1277 + #2 0x00007f650d06768f in numba_ez_rgesdd (ldvt=7, vt=0x561c6fd375c0, ldu=10, u=0x561c6fb680e0, s=0x561c6facf3a0, lda=10, + a=0x561c6fbb20c0, n=7, m=10, jobz=65 'A', kind=) at numba/_lapack.c:1307 + #3 numba_ez_gesdd (kind=, jobz=, m=10, n=7, a=0x561c6fbb20c0, lda=10, s=0x561c6facf3a0, + u=0x561c6fb680e0, ldu=10, vt=0x561c6fd375c0, ldvt=7) at numba/_lapack.c:1477 + #4 0x00007f650a3147a3 in numba::targets::linalg::svd_impl::$3clocals$3e::svd_impl$243(Array, omitted$28default$3d1$29) () + #5 0x00007f650a1c0489 in __main__::foo$241(Array) () at demo_gdb_args.py:15 + #6 0x00007f650a1c2110 in cpython::__main__::foo$241(Array) () + #7 0x00007f650cd096a4 in call_cfunc () + from /numba/numba/_dispatcher.cpython-37m-x86_64-linux-gnu.so + ... + + +How does the ``gdb`` binding work? +---------------------------------- +For advanced users and debuggers of Numba applications it's important to know +some of the internal implementation details of the outlined ``gdb`` bindings. +The :func:`numba.gdb` and :func:`numba.gdb_init` functions work by injecting the +following into the function's LLVM IR: + +* At the call site of the function first inject a call to ``getpid(3)`` to get + the PID of the executing process and store this for use later, then inject a + ``fork(3)`` call: + + * In the parent: + + * Inject a call ``sleep(3)`` (hence the pause whilst ``gdb`` loads). 
+ * Inject a call to the ``numba_gdb_breakpoint`` function (only + :func:`numba.gdb` does this). + + * In the child: + + * Inject a call to ``execl(3)`` with the arguments + ``numba.config.GDB_BINARY``, the ``attach`` command and the PID recorded + earlier. Numba has a special ``gdb`` command file that contains + instructions to break on the symbol ``numba_gdb_breakpoint`` and then + ``finish``, this is to make sure that the program stops on the + breakpoint but the frame it stops in is the compiled Python frame (or + one ``step`` away from, depending on optimisation). This command file is + also added to the arguments and finally any user-specified arguments + are added. + +At the call site of a :func:`numba.gdb_breakpoint` a call is injected to the +special ``numba_gdb_breakpoint`` symbol, which is already registered and +instrumented as a place to break and ``finish`` immediately. + +As a result of this, a call to e.g. :func:`numba.gdb` will cause a fork in the +program, the parent will sleep whilst the child launches ``gdb`` and attaches it +to the parent and tells the parent to continue. The launched ``gdb`` has the +``numba_gdb_breakpoint`` symbol registered as a breakpoint and when the parent +continues and stops sleeping it will immediately call ``numba_gdb_breakpoint`` +on which the child will break. Additional :func:`numba.gdb_breakpoint` calls +create calls to the registered breakpoint hence the program will also break at +these locations. + +.. _debugging-cuda-python-code: + +Debugging CUDA Python code +========================== + +Using the simulator +------------------- + +CUDA Python code can be run in the Python interpreter using the CUDA Simulator, +allowing it to be debugged with the Python debugger or with print statements. To +enable the CUDA simulator, set the environment variable +:envvar:`NUMBA_ENABLE_CUDASIM` to 1. For more information on the CUDA Simulator, +see :ref:`the CUDA Simulator documentation <simulator>`. 
+ + +Debug Info +---------- + +By setting the ``debug`` argument to ``cuda.jit`` to ``True`` +(``@cuda.jit(debug=True)``), Numba will emit source location information in the compiled +CUDA code. Unlike the CPU target, only filename and line information are +available, but no variable type information is emitted. The information +is sufficient to debug memory errors with +`cuda-memcheck <https://docs.nvidia.com/cuda/cuda-memcheck/index.html>`_. + +For example, given the following CUDA Python code: + +.. code-block:: python + :linenos: + + import numpy as np + from numba import cuda + + @cuda.jit(debug=True) + def foo(arr): + arr[cuda.threadIdx.x] = 1 + + arr = np.arange(30) + foo[1, 32](arr) # more threads than array elements + +We can use ``cuda-memcheck`` to find the memory error: + +.. code-block:: none + + $ cuda-memcheck python chk_cuda_debug.py + ========= CUDA-MEMCHECK + ========= Invalid __global__ write of size 8 + ========= at 0x00000148 in /home/user/chk_cuda_debug.py:6:cudapy::__main__::foo$241(Array<__int64, int=1, C, mutable, aligned>) + ========= by thread (31,0,0) in block (0,0,0) + ========= Address 0x500a600f8 is out of bounds + ... + ========= + ========= Invalid __global__ write of size 8 + ========= at 0x00000148 in /home/user/chk_cuda_debug.py:6:cudapy::__main__::foo$241(Array<__int64, int=1, C, mutable, aligned>) + ========= by thread (30,0,0) in block (0,0,0) + ========= Address 0x500a600f0 is out of bounds + ... 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/vectorize.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/vectorize.rst new file mode 100644 index 0000000000000000000000000000000000000000..dc15cda1bc02e50032c44e242e4be8d644c96300 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/vectorize.rst @@ -0,0 +1,423 @@ +================================== +Creating NumPy universal functions +================================== + +There are two types of universal functions: + +* Those which operate on scalars, these are "universal functions" or *ufuncs* + (see ``@vectorize`` below). +* Those which operate on higher dimensional arrays and scalars, these are + "generalized universal functions" or *gufuncs* (``@guvectorize`` below). + +.. _vectorize: + +The ``@vectorize`` decorator +============================ + +Numba's vectorize allows Python functions taking scalar input arguments to +be used as NumPy `ufuncs`_. Creating a traditional NumPy ufunc is +not the most straightforward process and involves writing some C code. +Numba makes this easy. Using the :func:`~numba.vectorize` decorator, Numba +can compile a pure Python function into a ufunc that operates over NumPy +arrays as fast as traditional ufuncs written in C. + +.. _ufuncs: http://docs.scipy.org/doc/numpy/reference/ufuncs.html + +Using :func:`~numba.vectorize`, you write your function as operating over +input scalars, rather than arrays. Numba will generate the surrounding +loop (or *kernel*) allowing efficient iteration over the actual inputs. + +The :func:`~numba.vectorize` decorator has two modes of operation: + +* Eager, or decoration-time, compilation: If you pass one or more type + signatures to the decorator, you will be building a NumPy universal + function (ufunc). The rest of this subsection describes building + ufuncs using decoration-time compilation. 
+ +* Lazy, or call-time, compilation: When not given any signatures, the + decorator will give you a Numba dynamic universal function + (:class:`~numba.DUFunc`) that dynamically compiles a new kernel when + called with a previously unsupported input type. A later + subsection, ":ref:`dynamic-universal-functions`", describes this mode in + more depth. + +As described above, if you pass a list of signatures to the +:func:`~numba.vectorize` decorator, your function will be compiled +into a NumPy ufunc. In the basic case, only one signature will be +passed:: + + from numba import vectorize, float64 + + @vectorize([float64(float64, float64)]) + def f(x, y): + return x + y + +If you pass several signatures, beware that you have to pass most specific +signatures before least specific ones (e.g., single-precision floats +before double-precision floats), otherwise type-based dispatching will not work +as expected:: + + @vectorize([int32(int32, int32), + int64(int64, int64), + float32(float32, float32), + float64(float64, float64)]) + def f(x, y): + return x + y + +The function will work as expected over the specified array types:: + + >>> a = np.arange(6) + >>> f(a, a) + array([ 0, 2, 4, 6, 8, 10]) + >>> a = np.linspace(0, 1, 6) + >>> f(a, a) + array([ 0. , 0.4, 0.8, 1.2, 1.6, 2. ]) + +but it will fail working on other types:: + + >>> a = np.linspace(0, 1+1j, 6) + >>> f(a, a) + Traceback (most recent call last): + File "", line 1, in + TypeError: ufunc 'ufunc' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'' + + +You might ask yourself, "why would I go through this instead of compiling +a simple iteration loop using the :ref:`@jit ` decorator?". The +answer is that NumPy ufuncs automatically get other features such as +reduction, accumulation or broadcasting. 
Using the example above:: + + >>> a = np.arange(12).reshape(3, 4) + >>> a + array([[ 0, 1, 2, 3], + [ 4, 5, 6, 7], + [ 8, 9, 10, 11]]) + >>> f.reduce(a, axis=0) + array([12, 15, 18, 21]) + >>> f.reduce(a, axis=1) + array([ 6, 22, 38]) + >>> f.accumulate(a) + array([[ 0, 1, 2, 3], + [ 4, 6, 8, 10], + [12, 15, 18, 21]]) + >>> f.accumulate(a, axis=1) + array([[ 0, 1, 3, 6], + [ 4, 9, 15, 22], + [ 8, 17, 27, 38]]) + +.. seealso:: + `Standard features of ufuncs `_ (NumPy documentation). + +.. note:: + Only the broadcasting features of ufuncs are supported in compiled code. + +The :func:`~numba.vectorize` decorator supports multiple ufunc targets: + +================= =============================================================== +Target Description +================= =============================================================== +cpu Single-threaded CPU + + +parallel Multi-core CPU + + +cuda CUDA GPU + + .. NOTE:: This creates an *ufunc-like* object. + See `documentation for CUDA ufunc <../cuda/ufunc.html>`_ for detail. +================= =============================================================== + +A general guideline is to choose different targets for different data sizes +and algorithms. +The "cpu" target works well for small data sizes (approx. less than 1KB) and low +compute intensity algorithms. It has the least amount of overhead. +The "parallel" target works well for medium data sizes (approx. less than 1MB). +Threading adds a small delay. +The "cuda" target works well for big data sizes (approx. greater than 1MB) and +high compute intensity algorithms. Transferring memory to and from the GPU adds +significant overhead. + + +.. 
_guvectorize: + +The ``@guvectorize`` decorator +============================== + +While :func:`~numba.vectorize` allows you to write ufuncs that work on one +element at a time, the :func:`~numba.guvectorize` decorator takes the concept +one step further and allows you to write ufuncs that will work on an +arbitrary number of elements of input arrays, and take and return arrays of +differing dimensions. The typical example is a running median or a +convolution filter. + +Contrary to :func:`~numba.vectorize` functions, :func:`~numba.guvectorize` +functions don't return their result value: they take it as an array +argument, which must be filled in by the function. This is because the +array is actually allocated by NumPy's dispatch mechanism, which calls into +the Numba-generated code. + +Similar to :func:`~numba.vectorize` decorator, :func:`~numba.guvectorize` +also has two modes of operation: Eager, or decoration-time compilation and +lazy, or call-time compilation. + + +Here is a very simple example:: + + @guvectorize([(int64[:], int64, int64[:])], '(n),()->(n)') + def g(x, y, res): + for i in range(x.shape[0]): + res[i] = x[i] + y + +The underlying Python function simply adds a given scalar (``y``) to all +elements of a 1-dimension array. What's more interesting is the declaration. +There are two things there: + +* the declaration of input and output *layouts*, in symbolic form: + ``(n),()->(n)`` tells NumPy that the function takes a *n*-element one-dimension + array, a scalar (symbolically denoted by the empty tuple ``()``) and + returns a *n*-element one-dimension array; + +* the list of supported concrete *signatures* as per ``@vectorize``; here, + as in the above example, we demonstrate ``int64`` arrays. + +.. note:: + 1D array type can also receive scalar arguments (those with shape ``()``). + In the above example, the second argument also could be declared as + ``int64[:]``. In that case, the value must be read by ``y[0]``. 
+ +We can now check what the compiled ufunc does, over a simple example:: + + >>> a = np.arange(5) + >>> a + array([0, 1, 2, 3, 4]) + >>> g(a, 2) + array([2, 3, 4, 5, 6]) + +The nice thing is that NumPy will automatically dispatch over more +complicated inputs, depending on their shapes:: + + >>> a = np.arange(6).reshape(2, 3) + >>> a + array([[0, 1, 2], + [3, 4, 5]]) + >>> g(a, 10) + array([[10, 11, 12], + [13, 14, 15]]) + >>> g(a, np.array([10, 20])) + array([[10, 11, 12], + [23, 24, 25]]) + + +.. note:: + Both :func:`~numba.vectorize` and :func:`~numba.guvectorize` support + passing ``nopython=True`` :ref:`as in the @jit decorator `. + Use it to ensure the generated code does not fallback to + :term:`object mode`. + +.. _overwriting-input-values: + +Overwriting input values +------------------------ + +In most cases, writing to inputs may also appear to work - however, this +behaviour cannot be relied on. Consider the following example function:: + + @guvectorize([(float64[:], float64[:])], '()->()') + def init_values(invals, outvals): + invals[0] = 6.5 + outvals[0] = 4.2 + +Calling the `init_values` function with an array of `float64` type results in +visible changes to the input:: + + >>> invals = np.zeros(shape=(3, 3), dtype=np.float64) + >>> outvals = init_values(invals) + >>> invals + array([[6.5, 6.5, 6.5], + [6.5, 6.5, 6.5], + [6.5, 6.5, 6.5]]) + >>> outvals + array([[4.2, 4.2, 4.2], + [4.2, 4.2, 4.2], + [4.2, 4.2, 4.2]]) + +This works because NumPy can pass the input data directly into the `init_values` +function as the data `dtype` matches that of the declared argument. However, it +may also create and pass in a temporary array, in which case changes to the +input are lost. For example, this can occur when casting is required. 
To +demonstrate, we can use an array of `float32` with the `init_values` function:: + + >>> invals = np.zeros(shape=(3, 3), dtype=np.float32) + >>> outvals = init_values(invals) + >>> invals + array([[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]], dtype=float32) + +In this case, there is no change to the `invals` array because the temporary +casted array was mutated instead. + +.. _dynamic-universal-functions: + +Dynamic universal functions +=========================== + +As described above, if you do not pass any signatures to the +:func:`~numba.vectorize` decorator, your Python function will be used +to build a dynamic universal function, or :class:`~numba.DUFunc`. For +example:: + + from numba import vectorize + + @vectorize + def f(x, y): + return x * y + +The resulting :func:`f` is a :class:`~numba.DUFunc` instance that +starts with no supported input types. As you make calls to :func:`f`, +Numba generates new kernels whenever you pass a previously unsupported +input type. Given the example above, the following set of interpreter +interactions illustrate how dynamic compilation works:: + + >>> f + + >>> f.ufunc + + >>> f.ufunc.types + [] + +The example above shows that :class:`~numba.DUFunc` instances are not +ufuncs. Rather than subclass ufunc's, :class:`~numba.DUFunc` +instances work by keeping a :attr:`~numba.DUFunc.ufunc` member, and +then delegating ufunc property reads and method calls to this member +(also known as type aggregation). When we look at the initial types +supported by the ufunc, we can verify there are none. + +Let's try to make a call to :func:`f`:: + + >>> f(3,4) + 12 + >>> f.types # shorthand for f.ufunc.types + ['ll->l'] + +If this was a normal NumPy ufunc, we would have seen an exception +complaining that the ufunc couldn't handle the input types. When we +call :func:`f` with integer arguments, not only do we receive an +answer, but we can verify that Numba created a loop supporting C +:code:`long` integers. 
+ +We can add additional loops by calling :func:`f` with different inputs:: + + >>> f(1.,2.) + 2.0 + >>> f.types + ['ll->l', 'dd->d'] + +We can now verify that Numba added a second loop for dealing with +floating-point inputs, :code:`"dd->d"`. + +If we mix input types to :func:`f`, we can verify that `NumPy ufunc +casting rules`_ are still in effect:: + + >>> f(1,2.) + 2.0 + >>> f.types + ['ll->l', 'dd->d'] + +.. _`NumPy ufunc casting rules`: http://docs.scipy.org/doc/numpy/reference/ufuncs.html#casting-rules + +This example demonstrates that calling :func:`f` with mixed types +caused NumPy to select the floating-point loop, and cast the integer +argument to a floating-point value. Thus, Numba did not create a +special :code:`"dl->d"` kernel. + +This :class:`~numba.DUFunc` behavior leads us to a point similar to +the warning given above in "`The @vectorize decorator`_" subsection, +but instead of signature declaration order in the decorator, call +order matters. If we had passed in floating-point arguments first, +any calls with integer arguments would be cast to double-precision +floating-point values. For example:: + + >>> @vectorize + ... def g(a, b): return a / b + ... + >>> g(2.,3.) + 0.66666666666666663 + >>> g(2,3) + 0.66666666666666663 + >>> g.types + ['dd->d'] + +If you require precise support for various type signatures, you should +specify them in the :func:`~numba.vectorize` decorator, and not rely +on dynamic compilation. + +Dynamic generalized universal functions +======================================= + +Similar to a dynamic universal function, if you do not specify any types to +the :func:`~numba.guvectorize` decorator, your Python function will be used +to build a dynamic generalized universal function, or :class:`~numba.GUFunc`. 
+For example:: + + from numba import guvectorize + + @guvectorize('(n),()->(n)') + def g(x, y, res): + for i in range(x.shape[0]): + res[i] = x[i] + y + +We can verify the resulting function :func:`g` is a :class:`~numba.GUFunc` +instance that starts with no supported input types. For instance:: + + >>> g + + >>> g.ufunc + + >>> g.ufunc.types + [] + +Similar to a :class:`~numba.DUFunc`, as one makes calls to :func:`g()`, +Numba generates new kernels for previously unsupported input types. The +following set of interpreter interactions will illustrate how dynamic +compilation works for a :class:`~numba.GUFunc`:: + + >>> x = np.arange(5, dtype=np.int64) + >>> y = 10 + >>> res = np.zeros_like(x) + >>> g(x, y, res) + >>> res + array([10, 11, 12, 13, 14]) + >>> g.types + ['ll->l'] + +If this was a normal :func:`guvectorize` function, we would have seen an +exception complaining that the ufunc could not handle the given input types. +When we call :func:`g()` with the input arguments, Numba creates a new loop +for the input types. + +We can add additional loops by calling :func:`g` with new arguments:: + + >>> x = np.arange(5, dtype=np.double) + >>> y = 2.2 + >>> res = np.zeros_like(x) + >>> g(x, y, res) + +We can now verify that Numba added a second loop for dealing with +floating-point inputs, :code:`"dd->d"`:: + + >>> g.types # shorthand for g.ufunc.types + ['ll->l', 'dd->d'] + +One can also verify that NumPy ufunc casting rules are working as expected:: + + >>> x = np.arange(5, dtype=np.int64) + >>> y = 2.2 + >>> res = np.zeros_like(x) + >>> g(x, y, res) + >>> res + +If you need precise support for various type signatures, you should not rely on dynamic +compilation and should instead specify the types as the first +argument in the :func:`~numba.guvectorize` decorator. 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/withobjmode.rst b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/withobjmode.rst new file mode 100644 index 0000000000000000000000000000000000000000..e94237e8bf3f494c8ffee37c64e6a8d17a28e7ba --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/docs/source/user/withobjmode.rst @@ -0,0 +1,34 @@ +============================================================ +Callback into the Python Interpreter from within JIT'ed code +============================================================ + +There are rare but real cases when a nopython-mode function needs to call back +into the Python interpreter to invoke code that cannot be compiled by Numba. +Such cases include: + +- logging progress for long running JIT'ed functions; +- using data structures that are not currently supported by Numba; +- debugging inside JIT'ed code using the Python debugger. + +When Numba calls back into the Python interpreter, the following has to happen: + +- acquire the GIL; +- convert values in native representation back into Python objects; +- call back into the Python interpreter; +- convert returned values from the Python-code into native representation; +- release the GIL. + +These steps can be expensive. Users **should not** rely on the feature +described here on performance-critical paths. + + +.. _with_objmode: + +The ``objmode`` context-manager +=============================== + +.. warning:: This feature can be easily misused. Users should first consider + alternative approaches to achieve their intended goal before using + this feature. + +.. 
autofunction:: numba.objmode diff --git a/cv/3d_detection/centerpoint/pytorch/numba/install_numba.sh b/cv/3d_detection/centerpoint/pytorch/numba/install_numba.sh new file mode 100644 index 0000000000000000000000000000000000000000..207cbc9c3c426960a333e23c5772a05a3197d9c0 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/install_numba.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +clang_version=`clang --version | grep "clang version 16."` +if [[ "${clang_version}" != "" ]]; then + echo "Not support LLVM16 now!" + exit 0 +fi + +TARGET_DIR=${TARGET_DIR:-} +PYTHON_PATH=$(which python3) +PYTHON_DIST_PATH=${TARGET_DIR}/lib/python3/dist-packages + +PKG_DIR="build_pip" +PKG_NAME="numba" + +if [[ ! -d ${PKG_DIR} ]]; then + echo "ERROR: Package directory ${PKG_DIR} doesn't exist" + exit 1 +fi + +latest_pkg="$(ls -t ${PKG_DIR} | grep ${PKG_NAME} | head -1)" +if [[ "${latest_pkg}" == "" ]]; then + echo "ERROR: Cannot find latest ${PKG_NAME} package" + exit 1 +else + echo "INFO: Found latest package ${latest_pkg} in directory ${PKG_DIR}" +fi + +if [[ "${TARGET_DIR}" != "" ]]; then + mkdir tmp + cp -R ${PYTHON_DIST_PATH}/bin ./tmp/ + ${PYTHON_PATH} -m pip install --upgrade -t ${PYTHON_DIST_PATH} ${PKG_DIR}/${latest_pkg} || exit + cp -n ./tmp/bin/* ${PYTHON_DIST_PATH}/bin + rm -rf ./tmp + echo "${PKG_NAME} installed in ${PYTHON_DIST_PATH}; please add it to your PYTHONPATH." 
+else + ${PYTHON_PATH} -m pip uninstall ${PKG_NAME} -y + ${PYTHON_PATH} -m pip install ${PKG_DIR}/${latest_pkg} || exit +fi + +# finish installing successfully +exit 0 diff --git a/cv/3d_detection/centerpoint/pytorch/numba/mypy.ini b/cv/3d_detection/centerpoint/pytorch/numba/mypy.ini new file mode 100644 index 0000000000000000000000000000000000000000..0b790befd0ede3b575fa2ba205605a4a5acf2540 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/mypy.ini @@ -0,0 +1,52 @@ +# Global options: + +[mypy] +warn_unused_configs = True +follow_imports = silent +show_error_context = True +files = **/numba/core/types/*.py, **/numba/core/datamodel/*.py, **/numba/core/rewrites/*.py, **/numba/core/unsafe/*.py + +# Per-module options: +# To classify a given module as Level 1, 2 or 3 it must be added both in files (variable above) and in the lists below. +# Level 1 - modules checked on the strictest settings. +;[mypy-] +;warn_return_any = True +;disallow_any_expr = True +;disallow_any_explicit = True +;disallow_any_generics = True +;disallow_subclassing_any = True +;disallow_untyped_calls = True +;disallow_untyped_defs = True +;disallow_incomplete_defs = True +;check_untyped_defs = True +;disallow_untyped_decorators = True +;warn_unused_ignores = True +;follow_imports = normal +;warn_unreachable = True +;strict_equality = True + +# Level 2 - module that pass reasonably strict settings. +# No untyped functions allowed. Imports must be typed or explicitly ignored. +;[mypy-] +;warn_return_any = True +;disallow_untyped_defs = True +;disallow_incomplete_defs = True +;follow_imports = normal + +# Level 3 - modules that pass mypy default settings (only those in `files` global setting and not in previous levels) +# Function/variables are annotated to avoid mypy errors, but annotations are not complete. 
+[mypy-numba.core.*] +warn_return_any = True + +# Level 4 - modules that do not pass mypy check: they are excluded from "files" setting in global section + +# External packages that lack annotations +[mypy-llvmlite.*] +ignore_missing_imports = True + +[mypy-numpy.*] +ignore_missing_imports = True + +[mypy-winreg.*] +# this can be removed after Mypy 0.78 is out with the latest typeshed +ignore_missing_imports = True diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/__init__.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0e437a6005dcee4934f884694a883dda13a3a53b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/__init__.py @@ -0,0 +1,224 @@ +""" +Expose top-level symbols that are safe for import * +""" + +import platform +import re +import sys +import warnings + +from ._version import get_versions +from numba.misc.init_utils import generate_version_info + +__version__ = get_versions()['version'] +version_info = generate_version_info(__version__) +del get_versions +del generate_version_info + + +from numba.core import config +from numba.core import types, errors + +# Re-export typeof +from numba.misc.special import ( + typeof, prange, pndindex, gdb, gdb_breakpoint, gdb_init, + literally, literal_unroll, +) + +# Re-export error classes +from numba.core.errors import * + +# Re-export types itself +import numba.core.types as types + +# Re-export all type names +from numba.core.types import * + +# Re-export decorators +from numba.core.decorators import (cfunc, generated_jit, jit, njit, stencil, + jit_module) + +# Re-export vectorize decorators and the thread layer querying function +from numba.np.ufunc import (vectorize, guvectorize, threading_layer, + get_num_threads, set_num_threads, + set_parallel_chunksize, get_parallel_chunksize, + get_thread_id) + +# Re-export Numpy helpers +from numba.np.numpy_support import carray, farray, from_dtype + +# Re-export 
# Re-export experimental
from numba import experimental

# Initialize withcontexts
import numba.core.withcontexts
from numba.core.withcontexts import objmode_context as objmode
from numba.core.withcontexts import parallel_chunksize

# Initialize target extensions
import numba.core.target_extension

# Initialize typed containers
import numba.typed


# Keep this for backward compatibility.
def test(argv, **kwds):
    """
    Run the Numba test runner.

    *argv* and any keyword arguments are forwarded unchanged to
    ``numba.testing._runtests.main`` and its result is returned.
    """
    # To speed up the import time, avoid importing `unittest` and other test
    # dependencies unless the user is actually trying to run tests.
    from numba.testing import _runtests as runtests
    return runtests.main(argv, **kwds)


__all__ = """
    cfunc
    from_dtype
    guvectorize
    jit
    experimental
    njit
    stencil
    jit_module
    typeof
    prange
    gdb
    gdb_breakpoint
    gdb_init
    vectorize
    objmode
    literal_unroll
    get_num_threads
    set_num_threads
    set_parallel_chunksize
    get_parallel_chunksize
    parallel_chunksize
    """.split() + types.__all__ + errors.__all__


_min_llvmlite_version = (0, 39, 0)
_min_llvm_version = (11, 0, 0)


def _ensure_llvm():
    """
    Make sure llvmlite is operational.

    Raises ImportError if the installed llvmlite is older than
    ``_min_llvmlite_version`` or was built against an LLVM older than
    ``_min_llvm_version``. Emits a warning (and carries on) if the llvmlite
    version string cannot be parsed. Finally asks llvmlite to check that JIT
    execution works.
    """
    import llvmlite

    # Only look at the major, minor and bugfix version numbers.
    # Ignore anything that follows them. Both separators must be literal
    # dots, hence the escapes.
    regex = re.compile(r'(\d+)\.(\d+)\.(\d+)')
    m = regex.match(llvmlite.__version__)
    if m:
        ver = tuple(map(int, m.groups()))
        if ver < _min_llvmlite_version:
            msg = ("Numba requires at least version %d.%d.%d of llvmlite.\n"
                   "Installed version is %s.\n"
                   "Please update llvmlite." %
                   (_min_llvmlite_version + (llvmlite.__version__,)))
            raise ImportError(msg)
    else:
        # Not matching?
        warnings.warn("llvmlite version format not recognized!")

    from llvmlite.binding import llvm_version_info, check_jit_execution

    if llvm_version_info < _min_llvm_version:
        msg = ("Numba requires at least version %d.%d.%d of LLVM.\n"
               "Installed llvmlite is built against version %d.%d.%d.\n"
               "Please update llvmlite." %
               (_min_llvm_version + llvm_version_info))
        raise ImportError(msg)

    check_jit_execution()


def _ensure_critical_deps():
    """
    Make sure Python, NumPy and SciPy have supported versions.

    Raises ImportError if Python is older than 3.7, if NumPy is older than
    1.18 or newer than 1.23, or if SciPy is installed and older than 1.0.
    SciPy is optional: when it cannot be imported the check is skipped.
    """
    from numba.np.numpy_support import numpy_version
    from numba.core.utils import PYVERSION

    if PYVERSION < (3, 7):
        raise ImportError("Numba needs Python 3.7 or greater")

    if numpy_version < (1, 18):
        raise ImportError("Numba needs NumPy 1.18 or greater")
    elif numpy_version > (1, 23):
        raise ImportError("Numba needs NumPy 1.23 or less")

    try:
        import scipy
    except ImportError:
        pass
    else:
        # Compare on (major, minor) only.
        sp_version = tuple(map(int, scipy.__version__.split('.')[:2]))
        if sp_version < (1, 0):
            raise ImportError("Numba requires SciPy version 1.0 or greater")


def _try_enable_svml():
    """
    Tries to enable SVML if configuration permits use and the library is found.

    Returns True if the SVML library was loaded and llvmlite reports that LLVM
    was built with SVML support (in which case the ``-vector-library=SVML``
    option is set); returns False in every other case, including when SVML is
    disabled through ``config.DISABLE_INTEL_SVML``.
    """
    if config.DISABLE_INTEL_SVML:
        # Explicit False rather than an implicit None, so that
        # config.USING_SVML is always a bool.
        return False

    try:
        if sys.platform.startswith('linux'):
            llvmlite.binding.load_library_permanently("libsvml.so")
        elif sys.platform.startswith('darwin'):
            llvmlite.binding.load_library_permanently("libsvml.dylib")
        elif sys.platform.startswith('win'):
            llvmlite.binding.load_library_permanently("svml_dispmd")
        else:
            return False
        # The SVML library is loaded, therefore SVML *could* be supported.
        # Now see if LLVM has been compiled with the SVML support patch.
        # If llvmlite has the checking function `has_svml` and it returns
        # True, then LLVM was compiled with SVML support and the setup
        # for SVML can proceed. We err on the side of caution and if the
        # checking function is missing, regardless of that being fine for
        # most 0.23.{0,1} llvmlite instances (i.e. conda or pip installed),
        # we assume that SVML was not compiled in. llvmlite 0.23.2 is a
        # bugfix release with the checking function present that will always
        # produce correct behaviour. For context see: #3006.
        try:
            if not llvmlite.binding.targets.has_svml():
                # has detection function, but no svml compiled in, therefore
                # disable SVML
                return False
        except AttributeError:
            if platform.machine() == 'x86_64' and config.DEBUG:
                msg = ("SVML was found but llvmlite >= 0.23.2 is "
                       "needed to support it.")
                warnings.warn(msg)
            # does not have detection function, cannot detect reliably,
            # disable SVML.
            return False

        # All is well, detection function present and reports SVML is
        # compiled in, set the vector library to SVML.
        llvmlite.binding.set_option('SVML', '-vector-library=SVML')
        return True
    except Exception:
        # Best effort: any failure to load/configure SVML just means SVML is
        # not used. KeyboardInterrupt/SystemExit are deliberately not caught.
        if platform.machine() == 'x86_64' and config.DEBUG:
            warnings.warn("SVML was not found/could not be loaded.")
        return False


_ensure_llvm()
_ensure_critical_deps()

# we know llvmlite is working as the above tests passed, import it now as SVML
# needs to mutate runtime options (sets the `-vector-library`).
import llvmlite

# Is set to True if Intel SVML is in use.
config.USING_SVML = _try_enable_svml()


# ---------------------- WARNING WARNING WARNING ----------------------------
# The following imports occur below here (SVML init) because somewhere in their
# import sequence they have a `@njit` wrapped function. This triggers too early
# a bind to the underlying LLVM libraries which then irretrievably sets the LLVM
# SVML state to "no SVML". See https://github.com/numba/numba/issues/4689 for
# context.
+# ---------------------- WARNING WARNING WARNING ---------------------------- diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/__main__.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..4e85bf372d86cb67d41d024ff5495236469dcae4 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/__main__.py @@ -0,0 +1,6 @@ +"""Expose Numba command via ``python -m numba``.""" +import sys +from numba.misc.numba_entry import main + +if __name__ == '__main__': + sys.exit(main()) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_arraystruct.h b/cv/3d_detection/centerpoint/pytorch/numba/numba/_arraystruct.h new file mode 100644 index 0000000000000000000000000000000000000000..dcb866e2baca2602d112202e70d2e318b38e5f98 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_arraystruct.h @@ -0,0 +1,21 @@ +#ifndef NUMBA_ARYSTRUCT_H_ +#define NUMBA_ARYSTRUCT_H_ +/* + * Fill in the *arystruct* with information from the Numpy array *obj*. + * *arystruct*'s layout is defined in numba.targets.arrayobj (look + * for the ArrayTemplate class). + */ + +typedef struct { + void *meminfo; /* see _nrt_python.c and nrt.h in numba/core/runtime */ + PyObject *parent; + npy_intp nitems; + npy_intp itemsize; + void *data; + + npy_intp shape_and_strides[]; +} arystruct_t; + + +#endif /* NUMBA_ARYSTRUCT_H_ */ + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_devicearray.cpp b/cv/3d_detection/centerpoint/pytorch/numba/numba/_devicearray.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3d40bee8bb2ffeaaf33123ce3e438653590f49dd --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_devicearray.cpp @@ -0,0 +1,142 @@ +/* This file contains the base class implementation for all device arrays. The + * base class is implemented in C so that computing typecodes for device arrays + * can be implemented efficiently. 
*/ + +#include "_pymodule.h" + + +/* Include _devicearray.h, but make sure we don't get the definitions intended + * for consumers of the Device Array API. + */ +#define NUMBA_IN_DEVICEARRAY_CPP_ +#include "_devicearray.h" + +/* DeviceArray PyObject implementation. Note that adding more members here is + * presently prohibited because mapped and managed arrays derive from both + * DeviceArray and NumPy's ndarray, which is also a C extension class - the + * layout of the object cannot be resolved if this class also has members beyond + * PyObject_HEAD. */ +class DeviceArray { + PyObject_HEAD +}; + +/* Trivial traversal - DeviceArray instances own nothing. */ +static int +DeviceArray_traverse(DeviceArray *self, visitproc visit, void *arg) +{ + return 0; +} + +/* Trivial clear of all references - DeviceArray instances own nothing. */ +static int +DeviceArray_clear(DeviceArray *self) +{ + return 0; +} + +/* The _devicearray.DeviceArray type */ +PyTypeObject DeviceArrayType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_devicearray.DeviceArray", /* tp_name */ + sizeof(DeviceArray), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call*/ + 0, /* tp_str*/ + 0, /* tp_getattro*/ + 0, /* tp_setattro*/ + 0, /* tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, + /* tp_flags*/ + "DeviceArray object", /* tp_doc */ + (traverseproc) DeviceArray_traverse, /* tp_traverse */ + (inquiry) DeviceArray_clear, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new
*/ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ +#if PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION == 8 + 0, /* tp_vectorcall */ + 0, /* tp_print */ +#endif +}; + +/* CUDA device array C API */ +static void *_DeviceArray_API[1] = { + (void*)&DeviceArrayType +}; + +MOD_INIT(_devicearray) { + PyObject *m = nullptr; + PyObject *d = nullptr; + PyObject *c_api = nullptr; + int error = 0; + + MOD_DEF(m, "_devicearray", "No docs", NULL) + if (m == NULL) + goto error_occurred; + + c_api = PyCapsule_New((void *)_DeviceArray_API, "numba._devicearray._DEVICEARRAY_API", NULL); + if (c_api == NULL) + goto error_occurred; + + DeviceArrayType.tp_new = PyType_GenericNew; + if (PyType_Ready(&DeviceArrayType) < 0) + goto error_occurred; + + Py_INCREF(&DeviceArrayType); + error = PyModule_AddObject(m, "DeviceArray", (PyObject*)(&DeviceArrayType)); + if (error) + goto error_occurred; + + d = PyModule_GetDict(m); + if (d == NULL) + goto error_occurred; + + error = PyDict_SetItemString(d, "_DEVICEARRAY_API", c_api); + /* Decref and set c_api to NULL, Py_XDECREF in error_occurred will have no + * effect. 
*/ + Py_CLEAR(c_api); + + if (error) + goto error_occurred; + + return MOD_SUCCESS_VAL(m); + +error_occurred: + Py_XDECREF(m); + Py_XDECREF(c_api); + Py_XDECREF((PyObject*)&DeviceArrayType); + + return MOD_ERROR_VAL; +} diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_devicearray.h b/cv/3d_detection/centerpoint/pytorch/numba/numba/_devicearray.h new file mode 100644 index 0000000000000000000000000000000000000000..5b276eacf9ce0aba248c49c2e158c8faab74879c --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_devicearray.h @@ -0,0 +1,25 @@ +#ifndef NUMBA_DEVICEARRAY_H_ +#define NUMBA_DEVICEARRAY_H_ + +#ifdef __cplusplus + extern "C" { +#endif + +/* These definitions should only be used by consumers of the Device Array API. + * Consumers access the API through the opaque pointer stored in + * _devicearray._DEVICEARRAY_API. We don't want these definitions in + * _devicearray.cpp itself because they would conflict with the actual + * implementations there. + */ +#ifndef NUMBA_IN_DEVICEARRAY_CPP_ + + extern void **DeviceArray_API; + #define DeviceArrayType (*(PyTypeObject*)DeviceArray_API[0]) + +#endif /* ndef NUMBA_IN_DEVICEARRAY_CPP */ + +#ifdef __cplusplus + } +#endif + +#endif /* NUMBA_DEVICEARRAY_H_ */ diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_dispatcher.cpp b/cv/3d_detection/centerpoint/pytorch/numba/numba/_dispatcher.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5ffefb2459f4cb75685bc0417c8dcc4b8452668c --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_dispatcher.cpp @@ -0,0 +1,1223 @@ +#include "_pymodule.h" + +#include +#include +#include +#include + +#include "_typeof.h" +#include "frameobject.h" +#include "core/typeconv/typeconv.hpp" +#include "_devicearray.h" + +/* + * Notes on the C_TRACE macro: + * + * The original C_TRACE macro (from ceval.c) would call + * PyTrace_C_CALL et al., for which the frame argument wouldn't + * be usable. 
Since we explicitly synthesize a frame using the + * original Python code object, we call PyTrace_CALL instead so + * the profiler can report the correct source location. + * + * Likewise, while ceval.c would call PyTrace_C_EXCEPTION in case + * of error, the profiler would simply expect a RETURN in case of + * a Python function, so we generate that here (making sure the + * exception state is preserved correctly). + * + */ + +/* + * NOTE: There is a version split for tracing code. Python 3.10 introduced a + * trace_info structure to help make tracing more robust. See: + * https://github.com/python/cpython/pull/24726 + */ +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 10) + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L36-L40 + */ +typedef struct { + PyCodeObject *code; // The code object for the bounds. May be NULL. + PyCodeAddressRange bounds; // Only valid if code != NULL. + CFrame cframe; +} PyTraceInfo; + + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Objects/codeobject.c#L1257-L1266 + * NOTE: The function is renamed. + */ +static void +_nb_PyLineTable_InitAddressRange(const char *linetable, Py_ssize_t length, int firstlineno, PyCodeAddressRange *range) +{ + range->opaque.lo_next = linetable; + range->opaque.limit = range->opaque.lo_next + length; + range->ar_start = -1; + range->ar_end = 0; + range->opaque.computed_line = firstlineno; + range->ar_line = -1; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Objects/codeobject.c#L1269-L1275 + * NOTE: The function is renamed. 
+ */ +static int +_nb_PyCode_InitAddressRange(PyCodeObject* co, PyCodeAddressRange *bounds) +{ + const char *linetable = PyBytes_AS_STRING(co->co_linetable); + Py_ssize_t length = PyBytes_GET_SIZE(co->co_linetable); + _nb_PyLineTable_InitAddressRange(linetable, length, co->co_firstlineno, bounds); + return bounds->ar_line; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5468-L5475 + * NOTE: The call to _PyCode_InitAddressRange is renamed. + */ +static void +initialize_trace_info(PyTraceInfo *trace_info, PyFrameObject *frame) +{ + if (trace_info->code != frame->f_code) { + trace_info->code = frame->f_code; + _nb_PyCode_InitAddressRange(frame->f_code, &trace_info->bounds); + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5477-L5501 + */ +static int +call_trace(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + PyTraceInfo *trace_info, + int what, PyObject *arg) +{ + int result; + if (tstate->tracing) + return 0; + tstate->tracing++; + tstate->cframe->use_tracing = 0; + if (frame->f_lasti < 0) { + frame->f_lineno = frame->f_code->co_firstlineno; + } + else { + initialize_trace_info(trace_info, frame); + frame->f_lineno = _PyCode_CheckLineNumber(frame->f_lasti*sizeof(_Py_CODEUNIT), &trace_info->bounds); + } + result = func(obj, frame, what, arg); + frame->f_lineno = 0; + tstate->cframe->use_tracing = ((tstate->c_tracefunc != NULL) + || (tstate->c_profilefunc != NULL)); + tstate->tracing--; + return result; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5445-L5466 + */ +static int +call_trace_protected(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + PyTraceInfo *trace_info, + int what, PyObject *arg) +{ + PyObject *type, *value, *traceback; + 
int err; + PyErr_Fetch(&type, &value, &traceback); + err = call_trace(func, obj, tstate, frame, trace_info, what, arg); + if (err == 0) + { + PyErr_Restore(type, value, traceback); + return 0; + } + else + { + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + return -1; + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5810-L5839 + * NOTE: The state test https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5811 + * has been removed, it's dealt with in call_cfunc. + */ +#define C_TRACE(x, call) \ +if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \ + tstate, tstate->frame, &trace_info, PyTrace_CALL,\ + cfunc)) \ + x = NULL; \ +else \ +{ \ + x = call; \ + if (tstate->c_profilefunc != NULL) \ + { \ + if (x == NULL) \ + { \ + call_trace_protected(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, tstate->frame, \ + &trace_info, \ + PyTrace_RETURN, cfunc); \ + /* XXX should pass (type, value, tb) */ \ + } \ + else \ + { \ + if (call_trace(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, tstate->frame, \ + &trace_info, \ + PyTrace_RETURN, cfunc)) \ + { \ + Py_DECREF(x); \ + x = NULL; \ + } \ + } \ + } \ +} + +#else + +/* + * Code originally from: + * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4242-L4257 + */ +static int +call_trace(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + int result; + if (tstate->tracing) + return 0; + tstate->tracing++; + tstate->use_tracing = 0; + result = func(obj, frame, what, arg); + tstate->use_tracing = ((tstate->c_tracefunc != NULL) + || (tstate->c_profilefunc != NULL)); + tstate->tracing--; + return result; +} + +/* + * Code originally from: + * 
https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4220-L4240 + */ +static int +call_trace_protected(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + PyObject *type, *value, *traceback; + int err; + PyErr_Fetch(&type, &value, &traceback); + err = call_trace(func, obj, tstate, frame, what, arg); + if (err == 0) + { + PyErr_Restore(type, value, traceback); + return 0; + } + else + { + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + return -1; + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4520-L4549 + * NOTE: The state test https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4521 + * has been removed, it's dealt with in call_cfunc. + */ +#define C_TRACE(x, call) \ +if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \ + tstate, tstate->frame, PyTrace_CALL, cfunc)) \ + x = NULL; \ +else \ +{ \ + x = call; \ + if (tstate->c_profilefunc != NULL) \ + { \ + if (x == NULL) \ + { \ + call_trace_protected(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, tstate->frame, \ + PyTrace_RETURN, cfunc); \ + /* XXX should pass (type, value, tb) */ \ + } \ + else \ + { \ + if (call_trace(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, tstate->frame, \ + PyTrace_RETURN, cfunc)) \ + { \ + Py_DECREF(x); \ + x = NULL; \ + } \ + } \ + } \ +} + + +#endif + +typedef std::vector TypeTable; +typedef std::vector Functions; + +/* The Dispatcher class is the base class of all dispatchers in the CPU and + CUDA targets. Its main responsibilities are: + + - Resolving the best overload to call for a given set of arguments, and + - Calling the resolved overload. 
+ + This logic is implemented within this class for efficiency (lookup of the + appropriate overload needs to be fast) and ease of implementation (calling + directly into a compiled function using a function pointer is easier within + the C++ code where the overload has been resolved). */ +class Dispatcher { +public: + PyObject_HEAD + /* Whether compilation of new overloads is permitted */ + char can_compile; + /* Whether fallback to object mode is permitted */ + char can_fallback; + /* Whether types must match exactly when resolving overloads. + If not, conversions (e.g. float32 -> float64) are permitted when + searching for a match. */ + char exact_match_required; + /* Borrowed reference */ + PyObject *fallbackdef; + /* Whether to fold named arguments and default values + (false for lifted loops) */ + int fold_args; + /* Whether the last positional argument is a stararg */ + int has_stararg; + /* Tuple of argument names */ + PyObject *argnames; + /* Tuple of default values */ + PyObject *defargs; + /* Number of arguments to function */ + int argct; + /* Used for selecting overloaded function implementations */ + TypeManager *tm; + /* An array of overloads */ + Functions functions; + /* A flattened array of argument types to all overloads + * (invariant: sizeof(overloads) == argct * sizeof(functions)) */ + TypeTable overloads; + + /* Add a new overload. Parameters: + + - args: An array of Type objects, one for each parameter + - callable: The callable implementing this overload. 
*/ + void addDefinition(Type args[], PyObject *callable) { + overloads.reserve(argct + overloads.size()); + for (int i=0; iselectOverload(sig, &overloads[0], selected, argct, + ovct, allow_unsafe, + exact_match_required); + } + if (matches == 1) { + return functions[selected]; + } + return NULL; + } + + /* Remove all overloads */ + void clear() { + functions.clear(); + overloads.clear(); + } + +}; + + +static int +Dispatcher_traverse(Dispatcher *self, visitproc visit, void *arg) +{ + Py_VISIT(self->defargs); + return 0; +} + +static void +Dispatcher_dealloc(Dispatcher *self) +{ + Py_XDECREF(self->argnames); + Py_XDECREF(self->defargs); + self->clear(); + Py_TYPE(self)->tp_free((PyObject*)self); +} + + +static int +Dispatcher_init(Dispatcher *self, PyObject *args, PyObject *kwds) +{ + PyObject *tmaddrobj; + void *tmaddr; + int argct; + int can_fallback; + int has_stararg = 0; + int exact_match_required = 0; + + if (!PyArg_ParseTuple(args, "OiiO!O!i|ii", &tmaddrobj, &argct, + &self->fold_args, + &PyTuple_Type, &self->argnames, + &PyTuple_Type, &self->defargs, + &can_fallback, + &has_stararg, + &exact_match_required + )) { + return -1; + } + Py_INCREF(self->argnames); + Py_INCREF(self->defargs); + tmaddr = PyLong_AsVoidPtr(tmaddrobj); + self->tm = static_cast(tmaddr); + self->argct = argct; + self->can_compile = 1; + self->can_fallback = can_fallback; + self->fallbackdef = NULL; + self->has_stararg = has_stararg; + self->exact_match_required = exact_match_required; + return 0; +} + +static PyObject * +Dispatcher_clear(Dispatcher *self, PyObject *args) +{ + self->clear(); + Py_RETURN_NONE; +} + +static +PyObject* +Dispatcher_Insert(Dispatcher *self, PyObject *args, PyObject *kwds) +{ + /* The cuda kwarg is a temporary addition until CUDA overloads are compiled + * functions. Once they are compiled functions, kwargs can be removed from + * this function. 
*/ + static char *keywords[] = { + (char*)"sig", + (char*)"func", + (char*)"objectmode", + (char*)"cuda", + NULL + }; + + PyObject *sigtup, *cfunc; + int i, sigsz; + int *sig; + int objectmode = 0; + int cuda = 0; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|ip", keywords, &sigtup, + &cfunc, &objectmode, &cuda)) { + return NULL; + } + + if (!cuda && !PyObject_TypeCheck(cfunc, &PyCFunction_Type) ) { + PyErr_SetString(PyExc_TypeError, "must be builtin_function_or_method"); + return NULL; + } + + sigsz = PySequence_Fast_GET_SIZE(sigtup); + sig = new int[sigsz]; + + for (i = 0; i < sigsz; ++i) { + sig[i] = PyLong_AsLong(PySequence_Fast_GET_ITEM(sigtup, i)); + } + + /* The reference to cfunc is borrowed; this only works because the + derived Python class also stores an (owned) reference to cfunc. */ + self->addDefinition(sig, cfunc); + + /* Add pure python fallback */ + if (!self->fallbackdef && objectmode){ + self->fallbackdef = cfunc; + } + + delete[] sig; + + Py_RETURN_NONE; +} + +static +void explain_issue(PyObject *dispatcher, PyObject *args, PyObject *kws, + const char *method_name, const char *default_msg) +{ + PyObject *callback, *result; + callback = PyObject_GetAttrString(dispatcher, method_name); + if (!callback) { + PyErr_SetString(PyExc_TypeError, default_msg); + return; + } + result = PyObject_Call(callback, args, kws); + Py_DECREF(callback); + if (result != NULL) { + PyErr_Format(PyExc_RuntimeError, "%s must raise an exception", + method_name); + Py_DECREF(result); + } +} + +static +void explain_ambiguous(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + explain_issue(dispatcher, args, kws, "_explain_ambiguous", + "Ambiguous overloading"); +} + +static +void explain_matching_error(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + explain_issue(dispatcher, args, kws, "_explain_matching_error", + "No matching definition"); +} + +static +int search_new_conversions(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + PyObject 
*callback, *result; + int res; + + callback = PyObject_GetAttrString(dispatcher, + "_search_new_conversions"); + if (!callback) { + return -1; + } + result = PyObject_Call(callback, args, kws); + Py_DECREF(callback); + if (result == NULL) { + return -1; + } + if (!PyBool_Check(result)) { + Py_DECREF(result); + PyErr_SetString(PyExc_TypeError, + "_search_new_conversions() should return a boolean"); + return -1; + } + res = (result == Py_True) ? 1 : 0; + Py_DECREF(result); + return res; +} + + +/* A custom, fast, inlinable version of PyCFunction_Call() */ +static PyObject * +call_cfunc(Dispatcher *self, PyObject *cfunc, PyObject *args, PyObject *kws, PyObject *locals) +{ + PyCFunctionWithKeywords fn; + PyThreadState *tstate; + + assert(PyCFunction_Check(cfunc)); + assert(PyCFunction_GET_FLAGS(cfunc) == (METH_VARARGS | METH_KEYWORDS)); + fn = (PyCFunctionWithKeywords) PyCFunction_GET_FUNCTION(cfunc); + tstate = PyThreadState_GET(); + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 10) + /* + * On Python 3.10+ trace_info comes from somewhere up in PyFrameEval et al, + * Numba doesn't have access to that so creates an equivalent struct and + * wires it up against the cframes. This is passed into the tracing + * functions. + * + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L1611-L1622 + */ + PyTraceInfo trace_info; + trace_info.code = NULL; // not initialized + CFrame *prev_cframe = tstate->cframe; + trace_info.cframe.use_tracing = prev_cframe->use_tracing; + trace_info.cframe.previous = prev_cframe; + + if (trace_info.cframe.use_tracing && tstate->c_profilefunc) +#else + /* + * On Python prior to 3.10, tracing state is a member of the threadstate + */ + if (tstate->use_tracing && tstate->c_profilefunc) +#endif + { + /* + * The following code requires some explaining: + * + * We want the jit-compiled function to be visible to the profiler, so we + * need to synthesize a frame for it. 
+ * The PyFrame_New() constructor doesn't do anything with the 'locals' value if the 'code's + * 'CO_NEWLOCALS' flag is set (which is always the case nowadays). + * So, to get local variables into the frame, we have to manually set the 'f_locals' + * member, then call `PyFrame_LocalsToFast`, where a subsequent call to the `frame.f_locals` + * property (by virtue of the `frame_getlocals` function in frameobject.c) will find them. + */ + PyCodeObject *code = (PyCodeObject*)PyObject_GetAttrString((PyObject*)self, "__code__"); + PyObject *globals = PyDict_New(); + PyObject *builtins = PyEval_GetBuiltins(); + PyFrameObject *frame = NULL; + PyObject *result = NULL; + + if (!code) { + PyErr_Format(PyExc_RuntimeError, "No __code__ attribute found."); + goto error; + } + /* Populate builtins, which is required by some JITted functions */ + if (PyDict_SetItemString(globals, "__builtins__", builtins)) { + goto error; + } + + /* unset the CO_OPTIMIZED flag, make the frame get a new locals dict */ + code->co_flags &= 0xFFFE; + + frame = PyFrame_New(tstate, code, globals, locals); + if (frame == NULL) { + goto error; + } + /* Populate the 'fast locals' in `frame` */ + PyFrame_LocalsToFast(frame, 0); + tstate->frame = frame; + C_TRACE(result, fn(PyCFunction_GET_SELF(cfunc), args, kws)); + /* write changes back to locals? */ + PyFrame_FastToLocals(frame); + tstate->frame = frame->f_back; + + error: + Py_XDECREF(frame); + Py_XDECREF(globals); + Py_XDECREF(code); + return result; + } + else + { + return fn(PyCFunction_GET_SELF(cfunc), args, kws); + } +} + +static +PyObject* +compile_and_invoke(Dispatcher *self, PyObject *args, PyObject *kws, PyObject *locals) +{ + /* Compile a new one */ + PyObject *cfa, *cfunc, *retval; + cfa = PyObject_GetAttrString((PyObject*)self, "_compile_for_args"); + if (cfa == NULL) + return NULL; + + /* NOTE: we call the compiled function ourselves instead of + letting the Python derived class do it. 
This is for proper + behaviour of globals() in jitted functions (issue #476). */ + cfunc = PyObject_Call(cfa, args, kws); + Py_DECREF(cfa); + + if (cfunc == NULL) + return NULL; + + if (PyObject_TypeCheck(cfunc, &PyCFunction_Type)) { + retval = call_cfunc(self, cfunc, args, kws, locals); + } else { + /* Re-enter interpreter */ + retval = PyObject_Call(cfunc, args, kws); + } + Py_DECREF(cfunc); + + return retval; +} + +/* A copy of compile_and_invoke, that only compiles. This is needed for CUDA + * kernels, because its overloads are Python instances of the _Kernel class, + * rather than compiled functions. Once CUDA overloads are compiled functions, + * cuda_compile_only can be removed. */ +static +PyObject* +cuda_compile_only(Dispatcher *self, PyObject *args, PyObject *kws, PyObject *locals) +{ + /* Compile a new one */ + PyObject *cfa, *cfunc; + cfa = PyObject_GetAttrString((PyObject*)self, "_compile_for_args"); + if (cfa == NULL) + return NULL; + + cfunc = PyObject_Call(cfa, args, kws); + Py_DECREF(cfa); + + return cfunc; +} + +static int +find_named_args(Dispatcher *self, PyObject **pargs, PyObject **pkws) +{ + PyObject *oldargs = *pargs, *newargs; + PyObject *kws = *pkws; + Py_ssize_t pos_args = PyTuple_GET_SIZE(oldargs); + Py_ssize_t named_args, total_args, i; + Py_ssize_t func_args = PyTuple_GET_SIZE(self->argnames); + Py_ssize_t defaults = PyTuple_GET_SIZE(self->defargs); + /* Last parameter with a default value */ + Py_ssize_t last_def = (self->has_stararg) + ? 
func_args - 2 + : func_args - 1; + /* First parameter with a default value */ + Py_ssize_t first_def = last_def - defaults + 1; + /* Minimum number of required arguments */ + Py_ssize_t minargs = first_def; + + if (kws != NULL) + named_args = PyDict_Size(kws); + else + named_args = 0; + total_args = pos_args + named_args; + if (!self->has_stararg && total_args > func_args) { + PyErr_Format(PyExc_TypeError, + "too many arguments: expected %d, got %d", + (int) func_args, (int) total_args); + return -1; + } + else if (total_args < minargs) { + if (minargs == func_args) + PyErr_Format(PyExc_TypeError, + "not enough arguments: expected %d, got %d", + (int) minargs, (int) total_args); + else + PyErr_Format(PyExc_TypeError, + "not enough arguments: expected at least %d, got %d", + (int) minargs, (int) total_args); + return -1; + } + newargs = PyTuple_New(func_args); + if (!newargs) + return -1; + /* First pack the stararg */ + if (self->has_stararg) { + Py_ssize_t stararg_size = Py_MAX(0, pos_args - func_args + 1); + PyObject *stararg = PyTuple_New(stararg_size); + if (!stararg) { + Py_DECREF(newargs); + return -1; + } + for (i = 0; i < stararg_size; i++) { + PyObject *value = PyTuple_GET_ITEM(oldargs, func_args - 1 + i); + Py_INCREF(value); + PyTuple_SET_ITEM(stararg, i, value); + } + /* Put it in last position */ + PyTuple_SET_ITEM(newargs, func_args - 1, stararg); + + } + for (i = 0; i < pos_args; i++) { + PyObject *value = PyTuple_GET_ITEM(oldargs, i); + if (self->has_stararg && i >= func_args - 1) { + /* Skip stararg */ + break; + } + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + } + + /* Iterate over missing positional arguments, try to find them in + named arguments or default values. */ + for (i = pos_args; i < func_args; i++) { + PyObject *name = PyTuple_GET_ITEM(self->argnames, i); + if (self->has_stararg && i >= func_args - 1) { + /* Skip stararg */ + break; + } + if (kws != NULL) { + /* Named argument? 
*/ + PyObject *value = PyDict_GetItem(kws, name); + if (value != NULL) { + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + named_args--; + continue; + } + } + if (i >= first_def && i <= last_def) { + /* Argument has a default value? */ + PyObject *value = PyTuple_GET_ITEM(self->defargs, i - first_def); + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + continue; + } + else if (i < func_args - 1 || !self->has_stararg) { + PyErr_Format(PyExc_TypeError, + "missing argument '%s'", + PyString_AsString(name)); + Py_DECREF(newargs); + return -1; + } + } + if (named_args) { + PyErr_Format(PyExc_TypeError, + "some keyword arguments unexpected"); + Py_DECREF(newargs); + return -1; + } + *pargs = newargs; + *pkws = NULL; + return 0; +} + + +/* + * Management of thread-local + */ + +#ifdef _MSC_VER +#define THREAD_LOCAL(ty) __declspec(thread) ty +#else +/* Non-standard C99 extension that's understood by gcc and clang */ +#define THREAD_LOCAL(ty) __thread ty +#endif + +static THREAD_LOCAL(bool) use_tls_target_stack; + + +struct raii_use_tls_target_stack { + bool old_setting; + + raii_use_tls_target_stack(bool new_setting) + : old_setting(use_tls_target_stack) + { + use_tls_target_stack = new_setting; + } + + ~raii_use_tls_target_stack() { + use_tls_target_stack = old_setting; + } +}; + +static PyObject* +Dispatcher_call(Dispatcher *self, PyObject *args, PyObject *kws) +{ + PyObject *tmptype, *retval = NULL; + int *tys = NULL; + int argct; + int i; + int prealloc[24]; + int matches; + PyObject *cfunc; + PyThreadState *ts = PyThreadState_Get(); + PyObject *locals = NULL; + + // Check TLS target stack + if (use_tls_target_stack) { + raii_use_tls_target_stack turn_off(false); + PyObject * meth_call_tls_target; + meth_call_tls_target = PyObject_GetAttrString((PyObject*)self, + "_call_tls_target"); + if (!meth_call_tls_target) return NULL; + // Transfer control to self._call_tls_target + retval = PyObject_Call(meth_call_tls_target, args, kws); + 
Py_DECREF(meth_call_tls_target); + return retval; + } + + /* If compilation is enabled, ensure that an exact match is found and if + * not compile one */ + int exact_match_required = self->can_compile ? 1 : self->exact_match_required; + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 10) + if (ts->tracing && ts->c_profilefunc) { +#else + if (ts->use_tracing && ts->c_profilefunc) { +#endif + locals = PyEval_GetLocals(); + if (locals == NULL) { + return NULL; /* args is still a borrowed reference here; CLEANUP would over-DECREF it */ + } + } + if (self->fold_args) { + if (find_named_args(self, &args, &kws)) + return NULL; + } + else + Py_INCREF(args); + /* Now we own a reference to args */ + + argct = PySequence_Fast_GET_SIZE(args); + + if (argct < (Py_ssize_t) (sizeof(prealloc) / sizeof(int))) + tys = prealloc; + else + tys = new int[argct]; + + for (i = 0; i < argct; ++i) { + tmptype = PySequence_Fast_GET_ITEM(args, i); + tys[i] = typeof_typecode((PyObject *) self, tmptype); + if (tys[i] == -1) { + if (self->can_fallback){ + /* We will clear the exception if fallback is allowed. */ + PyErr_Clear(); + } else { + goto CLEANUP; + } + } + } + + /* We only allow unsafe conversions if compilation of new specializations + has been disabled. + + Note that the number of matches is returned in matches by resolve, which + accepts it as a reference. */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + + if (matches == 0 && !self->can_compile) { + /* + * If we can't compile a new specialization, look for + * matching signatures for which conversions haven't been + * registered on the C++ TypeManager.
+ */ + int res = search_new_conversions((PyObject *) self, args, kws); + if (res < 0) { + retval = NULL; + goto CLEANUP; + } + if (res > 0) { + /* Retry with the newly registered conversions */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + } + } + if (matches == 1) { + /* Definition is found */ + retval = call_cfunc(self, cfunc, args, kws, locals); + } else if (matches == 0) { + /* No matching definition */ + if (self->can_compile) { + retval = compile_and_invoke(self, args, kws, locals); + } else if (self->fallbackdef) { + /* Have object fallback */ + retval = call_cfunc(self, self->fallbackdef, args, kws, locals); + } else { + /* Raise TypeError */ + explain_matching_error((PyObject *) self, args, kws); + retval = NULL; + } + } else if (self->can_compile) { + /* Ambiguous, but are allowed to compile */ + retval = compile_and_invoke(self, args, kws, locals); + } else { + /* Ambiguous */ + explain_ambiguous((PyObject *) self, args, kws); + retval = NULL; + } + +CLEANUP: + if (tys != prealloc) + delete[] tys; + Py_DECREF(args); + + return retval; +} + +/* Based on Dispatcher_call above, with the following differences: + 1. It does not invoke the definition of the function. + 2. It returns the definition, instead of a value returned by the function. + + This is because CUDA functions are, at present, _Kernel objects rather than + compiled functions. */ +static PyObject* +Dispatcher_cuda_call(Dispatcher *self, PyObject *args, PyObject *kws) +{ + PyObject *tmptype, *retval = NULL; + int *tys = NULL; + int argct; + int i; + int prealloc[24]; + int matches; + PyObject *cfunc; + PyThreadState *ts = PyThreadState_Get(); + PyObject *locals = NULL; + + /* If compilation is enabled, ensure that an exact match is found and if + * not compile one */ + int exact_match_required = self->can_compile ? 
1 : self->exact_match_required; + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 10) + if (ts->tracing && ts->c_profilefunc) { +#else + if (ts->use_tracing && ts->c_profilefunc) { +#endif + locals = PyEval_GetLocals(); + if (locals == NULL) { + return NULL; /* args is still a borrowed reference and tys is unallocated here, so bypass CLEANUP and its Py_DECREF(args) */ + } + } + if (self->fold_args) { + if (find_named_args(self, &args, &kws)) + return NULL; + } + else + Py_INCREF(args); + /* Now we own a reference to args */ + + argct = PySequence_Fast_GET_SIZE(args); + + if (argct < (Py_ssize_t) (sizeof(prealloc) / sizeof(int))) + tys = prealloc; + else + tys = new int[argct]; + + for (i = 0; i < argct; ++i) { + tmptype = PySequence_Fast_GET_ITEM(args, i); + tys[i] = typeof_typecode((PyObject *) self, tmptype); + if (tys[i] == -1) { + if (self->can_fallback){ + /* We will clear the exception if fallback is allowed. */ + PyErr_Clear(); + } else { + goto CLEANUP; + } + } + } + + /* We only allow unsafe conversions if compilation of new specializations + has been disabled. */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + + if (matches == 0 && !self->can_compile) { + /* + * If we can't compile a new specialization, look for + * matching signatures for which conversions haven't been + * registered on the C++ TypeManager.
+ */ + int res = search_new_conversions((PyObject *) self, args, kws); + if (res < 0) { + retval = NULL; + goto CLEANUP; + } + if (res > 0) { + /* Retry with the newly registered conversions */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + } + } + + if (matches == 1) { + /* Definition is found */ + retval = cfunc; + Py_INCREF(retval); + } else if (matches == 0) { + /* No matching definition */ + if (self->can_compile) { + retval = cuda_compile_only(self, args, kws, locals); + } else if (self->fallbackdef) { + /* Have object fallback */ + retval = call_cfunc(self, self->fallbackdef, args, kws, locals); + } else { + /* Raise TypeError */ + explain_matching_error((PyObject *) self, args, kws); + retval = NULL; + } + } else if (self->can_compile) { + /* Ambiguous, but are allowed to compile */ + retval = cuda_compile_only(self, args, kws, locals); + } else { + /* Ambiguous */ + explain_ambiguous((PyObject *) self, args, kws); + retval = NULL; + } + +CLEANUP: + if (tys != prealloc) + delete[] tys; + Py_DECREF(args); + + return retval; +} + +static int +import_devicearray(void) +{ + PyObject *devicearray = PyImport_ImportModule("numba._devicearray"); + if (devicearray == NULL) { + return -1; + } + Py_DECREF(devicearray); + + DeviceArray_API = (void**)PyCapsule_Import("numba._devicearray._DEVICEARRAY_API", 0); + if (DeviceArray_API == NULL) { + return -1; + } + + return 0; +} + +static PyMethodDef Dispatcher_methods[] = { + { "_clear", (PyCFunction)Dispatcher_clear, METH_NOARGS, NULL }, + { "_insert", (PyCFunction)Dispatcher_Insert, METH_VARARGS | METH_KEYWORDS, + "insert new definition"}, + { "_cuda_call", (PyCFunction)Dispatcher_cuda_call, + METH_VARARGS | METH_KEYWORDS, "CUDA call resolution" }, + { NULL }, +}; + +static PyMemberDef Dispatcher_members[] = { + {(char*)"_can_compile", T_BOOL, offsetof(Dispatcher, can_compile), 0, NULL }, + {NULL} /* Sentinel */ +}; + + +static PyTypeObject DispatcherType = { + 
PyVarObject_HEAD_INIT(NULL, 0) + "_dispatcher.Dispatcher", /* tp_name */ + sizeof(Dispatcher), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)Dispatcher_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + (PyCFunctionWithKeywords)Dispatcher_call, /* tp_call*/ + 0, /* tp_str*/ + 0, /* tp_getattro*/ + 0, /* tp_setattro*/ + 0, /* tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags*/ + "Dispatcher object", /* tp_doc */ + (traverseproc) Dispatcher_traverse, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + Dispatcher_methods, /* tp_methods */ + Dispatcher_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)Dispatcher_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ +#if PY_MAJOR_VERSION == 3 +/* Python 3.8 has two slots, 3.9 has one. 
*/ +#if PY_MINOR_VERSION > 7 + 0, /* tp_vectorcall */ +#if PY_MINOR_VERSION == 8 + 0, /* tp_print */ +#endif +#endif +#endif +}; + + +static PyObject *compute_fingerprint(PyObject *self, PyObject *args) +{ + PyObject *val; + if (!PyArg_ParseTuple(args, "O:compute_fingerprint", &val)) + return NULL; + return typeof_compute_fingerprint(val); +} + +static PyObject *set_use_tls_target_stack(PyObject *self, PyObject *args) +{ + int val; + if (!PyArg_ParseTuple(args, "p", &val)) + return NULL; + bool old = use_tls_target_stack; + use_tls_target_stack = val; + // return the old value + if (old) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } +} + +static PyMethodDef ext_methods[] = { +#define declmethod(func) { #func , ( PyCFunction )func , METH_VARARGS , NULL } + declmethod(typeof_init), + declmethod(compute_fingerprint), + declmethod(set_use_tls_target_stack), + { NULL }, +#undef declmethod +}; + + +MOD_INIT(_dispatcher) { + if (import_devicearray() < 0) { + PyErr_Print(); + PyErr_SetString(PyExc_ImportError, "numba._devicearray failed to import"); + return MOD_ERROR_VAL; + } + + PyObject *m; + MOD_DEF(m, "_dispatcher", "No docs", ext_methods) + if (m == NULL) + return MOD_ERROR_VAL; + + DispatcherType.tp_new = PyType_GenericNew; + if (PyType_Ready(&DispatcherType) < 0) { + return MOD_ERROR_VAL; + } + Py_INCREF(&DispatcherType); + PyModule_AddObject(m, "Dispatcher", (PyObject*)(&DispatcherType)); + + return MOD_SUCCESS_VAL(m); +} diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_dynfunc.c b/cv/3d_detection/centerpoint/pytorch/numba/numba/_dynfunc.c new file mode 100644 index 0000000000000000000000000000000000000000..7c228a83006ef0f245b019b038ee01b9f2ed7fb9 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_dynfunc.c @@ -0,0 +1,507 @@ +/* + * Definition of Environment and Closure objects. + * This module is included by _dynfuncmod.c and by pycc-compiled modules. 
+ */ + +#include "_pymodule.h" + +#include <string.h> /* strlen, strcpy, memcpy */ + +/* NOTE: EnvironmentObject and ClosureObject must be kept in sync with + * the definitions in numba/targets/base.py (EnvBody and ClosureBody). + */ + +/* + * EnvironmentObject hosts data needed for execution of compiled functions. + */ +typedef struct { + PyObject_HEAD + PyObject *globals; + /* Assorted "constants" that are needed at runtime to execute + the compiled function. This can include frozen closure variables, + lifted loops, etc. */ + PyObject *consts; +} EnvironmentObject; + + +static PyMemberDef env_members[] = { + {"globals", T_OBJECT, offsetof(EnvironmentObject, globals), READONLY, NULL}, + {"consts", T_OBJECT, offsetof(EnvironmentObject, consts), READONLY, NULL}, + {NULL} /* Sentinel */ +}; + +static int +env_traverse(EnvironmentObject *env, visitproc visit, void *arg) +{ + Py_VISIT(env->globals); + Py_VISIT(env->consts); + return 0; +} + +static int +env_clear(EnvironmentObject *env) +{ + Py_CLEAR(env->globals); + Py_CLEAR(env->consts); + return 0; +} + +static void +env_dealloc(EnvironmentObject *env) +{ + PyObject_GC_UnTrack((PyObject *) env); + env_clear(env); + Py_TYPE(env)->tp_free((PyObject *) env); +} + +static EnvironmentObject * +env_new_empty(PyTypeObject* type) +{ + return (EnvironmentObject *) PyType_GenericNew(type, NULL, NULL); +} + +static PyObject * +env_new(PyTypeObject* type, PyObject* args, PyObject* kwds) +{ + PyObject *globals; + EnvironmentObject *env; + static char *kwlist[] = {"globals", 0}; + + if (!PyArg_ParseTupleAndKeywords( + args, kwds, "O!:function", kwlist, + &PyDict_Type, &globals)) + return NULL; + + env = env_new_empty(type); + if (env == NULL) + return NULL; + Py_INCREF(globals); + env->globals = globals; + env->consts = PyList_New(0); + if (!env->consts) { + Py_DECREF(env); + return NULL; + } + return (PyObject *) env; +} + + +static PyTypeObject EnvironmentType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_dynfunc.Environment", /*tp_name*/ + sizeof(EnvironmentObject),
/*tp_basicsize*/ + 0, /*tp_itemsize*/ + (destructor) env_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /*tp_flags*/ + 0, /* tp_doc */ + (traverseproc) env_traverse, /* tp_traverse */ + (inquiry) env_clear, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + env_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + env_new, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ +#if PY_MAJOR_VERSION == 3 +/* Python 3.8 has two slots, 3.9 has one. */ +#if PY_MINOR_VERSION > 7 + 0, /* tp_vectorcall */ +#if PY_MINOR_VERSION == 8 + 0, /* tp_print */ +#endif +#endif +#endif +}; + +/* A closure object is created for each call to make_function(), and stored + as the resulting PyCFunction object's "self" pointer. It points to an + EnvironmentObject which is constructed during compilation. This allows + for two things: + - lifetime management of dependent data (e.g. lifted loop dispatchers) + - access to the execution environment by the compiled function + (for example the globals module) + */ + +/* Closure is a variable-sized object for binary compatibility with + Generator (see below). 
*/ +#define CLOSURE_HEAD \ + PyObject_VAR_HEAD \ + EnvironmentObject *env; + +typedef struct { + CLOSURE_HEAD + /* The dynamically-filled method definition for the PyCFunction object + using this closure. */ + PyMethodDef def; + /* Arbitrary object to keep alive during the closure's lifetime. + (put a tuple to put several objects alive). + In practice, this helps keep the LLVM module and its generated + code alive. */ + PyObject *keepalive; + PyObject *weakreflist; +} ClosureObject; + + +static int +closure_traverse(ClosureObject *clo, visitproc visit, void *arg) +{ + Py_VISIT(clo->env); + Py_VISIT(clo->keepalive); + return 0; +} + +static void +closure_dealloc(ClosureObject *clo) +{ + PyObject_GC_UnTrack((PyObject *) clo); + if (clo->weakreflist != NULL) + PyObject_ClearWeakRefs((PyObject *) clo); + PyObject_Free((void *) clo->def.ml_name); + PyObject_Free((void *) clo->def.ml_doc); + Py_XDECREF(clo->env); + Py_XDECREF(clo->keepalive); + Py_TYPE(clo)->tp_free((PyObject *) clo); +} + +static PyTypeObject ClosureType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_dynfunc._Closure", /*tp_name*/ + sizeof(ClosureObject), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + (destructor) closure_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC, /*tp_flags*/ + 0, /* tp_doc */ + (traverseproc) closure_traverse, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + offsetof(ClosureObject, weakreflist), /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* 
tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ +#if PY_MAJOR_VERSION == 3 +/* Python 3.8 has two slots, 3.9 has one. */ +#if PY_MINOR_VERSION > 7 + 0, /* tp_vectorcall */ +#if PY_MINOR_VERSION == 8 + 0, /* tp_print */ +#endif +#endif +#endif +}; + + +/* Return an owned piece of character data duplicating a Python string + object's value. */ +static char * +dup_string(PyObject *strobj) +{ + const char *tmp = NULL; + char *str; + tmp = PyString_AsString(strobj); + if (tmp == NULL) + return NULL; + /* Using PyObject_Malloc allows this memory to be tracked for + leaks. */ + str = PyObject_Malloc(strlen(tmp) + 1); + if (str == NULL) { + PyErr_NoMemory(); + return NULL; + } + strcpy(str, tmp); + return str; +} + +/* Create and initialize a new Closure object */ +static ClosureObject * +closure_new(PyObject *name, PyObject *doc, PyCFunction fnaddr, + EnvironmentObject *env, PyObject *keepalive) +{ + ClosureObject *clo = (ClosureObject *) PyType_GenericAlloc(&ClosureType, 0); + if (clo == NULL) + return NULL; + + clo->def.ml_name = dup_string(name); + if (!clo->def.ml_name) { + Py_DECREF(clo); + return NULL; + } + clo->def.ml_meth = fnaddr; + clo->def.ml_flags = METH_VARARGS | METH_KEYWORDS; + clo->def.ml_doc = dup_string(doc); + if (!clo->def.ml_doc) { + Py_DECREF(clo); + return NULL; + } + Py_INCREF(env); + clo->env = env; + Py_XINCREF(keepalive); + clo->keepalive = keepalive; + return clo; +} + +/* Create a new PyCFunction object wrapping a closure defined by + the given arguments. 
*/ +static PyObject * +pycfunction_new(PyObject *module, PyObject *name, PyObject *doc, + PyCFunction fnaddr, EnvironmentObject *env, PyObject *keepalive) +{ + PyObject *funcobj; + PyObject *modname = NULL; + ClosureObject *closure = NULL; + + closure = closure_new(name, doc, fnaddr, env, keepalive); + if (closure == NULL) goto FAIL; + + modname = PyObject_GetAttrString(module, "__name__"); + if (modname == NULL) goto FAIL; + + funcobj = PyCFunction_NewEx(&closure->def, (PyObject *) closure, modname); + Py_DECREF(closure); + Py_DECREF(modname); + + return funcobj; + +FAIL: + Py_XDECREF(closure); + Py_XDECREF(modname); + return NULL; +} + +/* + * Python-facing wrapper for Numba-compiled generator. + * Note the Environment's offset inside the struct is the same as in the + * Closure object. This is required to simplify generation of Python wrappers. + */ + +typedef void (*gen_finalizer_t)(void *); + +typedef struct { + CLOSURE_HEAD + PyCFunctionWithKeywords nextfunc; + gen_finalizer_t finalizer; + PyObject *weakreflist; + union { + double dummy; /* Force alignment */ + char state[0]; + }; +} GeneratorObject; + +static int +generator_traverse(GeneratorObject *gen, visitproc visit, void *arg) +{ + /* XXX this doesn't traverse the state, which can own references to + PyObjects */ + Py_VISIT(gen->env); + return 0; +} + +static int +generator_clear(GeneratorObject *gen) +{ + if (gen->finalizer != NULL) { + gen->finalizer(gen->state); + gen->finalizer = NULL; + } + Py_CLEAR(gen->env); + gen->nextfunc = NULL; + return 0; +} + +static void +generator_dealloc(GeneratorObject *gen) +{ + PyObject_GC_UnTrack((PyObject *) gen); + if (gen->weakreflist != NULL) + PyObject_ClearWeakRefs((PyObject *) gen); + /* XXX The finalizer may be called after the LLVM module has been + destroyed (typically at interpreter shutdown) */ +#if PY_MAJOR_VERSION >= 3 +#if PY_MINOR_VERSION >= 7 + if (!_Py_IsFinalizing()) +#else + if (!_Py_Finalizing) +#endif +#endif + if (gen->finalizer != NULL) + 
gen->finalizer(gen->state); + Py_XDECREF(gen->env); + Py_TYPE(gen)->tp_free((PyObject *) gen); +} + +static PyObject * +generator_iternext(GeneratorObject *gen) +{ + PyObject *res, *args; + if (gen->nextfunc == NULL) { + PyErr_SetString(PyExc_RuntimeError, + "cannot call next() on finalized generator"); + return NULL; + } + args = PyTuple_Pack(1, (PyObject *) gen); + if (args == NULL) + return NULL; + res = (*gen->nextfunc)((PyObject *) gen, args, NULL); + Py_DECREF(args); + return res; +} + +static PyTypeObject GeneratorType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_dynfunc._Generator", /* tp_name*/ + offsetof(GeneratorObject, state), /* tp_basicsize*/ + 1, /* tp_itemsize*/ + (destructor) generator_dealloc, /* tp_dealloc*/ + 0, /* tp_print*/ + 0, /* tp_getattr*/ + 0, /* tp_setattr*/ + 0, /* tp_compare*/ + 0, /* tp_repr*/ + 0, /* tp_as_number*/ + 0, /* tp_as_sequence*/ + 0, /* tp_as_mapping*/ + 0, /* tp_hash */ + 0, /* tp_call*/ + 0, /* tp_str*/ + 0, /* tp_getattro*/ + 0, /* tp_setattro*/ + 0, /* tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC + | Py_TPFLAGS_BASETYPE, /* tp_flags*/ + 0, /* tp_doc */ + (traverseproc) generator_traverse, /* tp_traverse */ + (inquiry) generator_clear, /* tp_clear */ + 0, /* tp_richcompare */ + offsetof(GeneratorObject, weakreflist), /* tp_weaklistoffset */ + PyObject_SelfIter, /* tp_iter */ + (iternextfunc) generator_iternext, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ +#if PY_MAJOR_VERSION == 3 +/* Python 3.8 has two slots, 3.9 has one. 
*/ +#if PY_MINOR_VERSION > 7 + 0, /* tp_vectorcall */ +#if PY_MINOR_VERSION == 8 + 0, /* tp_print */ +#endif +#endif +#endif +}; + +/* Dynamically create a new generator object */ +static PyObject * +Numba_make_generator(Py_ssize_t gen_state_size, + void *initial_state, + PyCFunctionWithKeywords nextfunc, + gen_finalizer_t finalizer, + EnvironmentObject *env) +{ + GeneratorObject *gen; + gen = (GeneratorObject *) PyType_GenericAlloc(&GeneratorType, gen_state_size); + if (gen == NULL) + return NULL; + memcpy(gen->state, initial_state, gen_state_size); + gen->nextfunc = nextfunc; + Py_XINCREF(env); + gen->env = env; + gen->finalizer = finalizer; + return (PyObject *) gen; +} + +/* Initialization subroutine for use by modules including this */ +static int +init_dynfunc_module(PyObject *module) +{ + if (PyType_Ready(&ClosureType)) + return -1; + if (PyType_Ready(&EnvironmentType)) + return -1; + if (PyType_Ready(&GeneratorType)) + return -1; + return 0; +} diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_dynfuncmod.c b/cv/3d_detection/centerpoint/pytorch/numba/numba/_dynfuncmod.c new file mode 100644 index 0000000000000000000000000000000000000000..5d80529c05ce85175e0ee2327dda33b25b683555 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_dynfuncmod.c @@ -0,0 +1,93 @@ +#include "_dynfunc.c" + +/* Python-facing function to dynamically create a new C function object: make_function(module, name, doc, fnaddr, env[, keepalive]); returns the result of pycfunction_new(), or NULL with an exception set on bad arguments */ +static PyObject* +make_function(PyObject *self, PyObject *args) +{ + PyObject *module, *fname, *fdoc, *fnaddrobj; + void *fnaddr; + EnvironmentObject *env; + PyObject *keepalive = NULL; /* optional "|O" argument: PyArg_ParseTuple leaves it untouched when omitted, so it must start as NULL */ + + if (!PyArg_ParseTuple(args, "OOOOO!|O", + &module, &fname, &fdoc, &fnaddrobj, &EnvironmentType, &env, + &keepalive)) { + return NULL; + } + + fnaddr = PyLong_AsVoidPtr(fnaddrobj); + if (fnaddr == NULL && PyErr_Occurred()) + return NULL; + + return pycfunction_new(module, fname, fdoc, fnaddr, env, keepalive); +} + +static PyMethodDef ext_methods[] = { +#define declmethod(func) { #func , (
PyCFunction )func , METH_VARARGS , NULL } + declmethod(make_function), + { NULL }, +#undef declmethod +}; + + +static PyObject * +build_c_helpers_dict(void) +{ + PyObject *dct = PyDict_New(); + if (dct == NULL) + goto error; + +#define _declpointer(name, value) do { \ + PyObject *o = PyLong_FromVoidPtr(value); \ + if (o == NULL) goto error; \ + if (PyDict_SetItemString(dct, name, o)) { \ + Py_DECREF(o); \ + goto error; \ + } \ + Py_DECREF(o); \ +} while (0) + +#define declmethod(func) _declpointer(#func, &Numba_##func) + +#define declpointer(ptr) _declpointer(#ptr, &ptr) + + declmethod(make_generator); + +#undef declmethod + return dct; +error: + Py_XDECREF(dct); + return NULL; +} + +MOD_INIT(_dynfunc) { + PyObject *m, *impl_info; + + MOD_DEF(m, "_dynfunc", "No docs", ext_methods) + if (m == NULL) + return MOD_ERROR_VAL; + + if (init_dynfunc_module(m)) + return MOD_ERROR_VAL; + + impl_info = Py_BuildValue( + "{snsnsn}", + "offsetof_closure_body", offsetof(ClosureObject, env), + "offsetof_env_body", offsetof(EnvironmentObject, globals), + "offsetof_generator_state", offsetof(GeneratorObject, state) + ); + if (impl_info == NULL) + return MOD_ERROR_VAL; + PyModule_AddObject(m, "_impl_info", impl_info); + + Py_INCREF(&ClosureType); + PyModule_AddObject(m, "_Closure", (PyObject *) (&ClosureType)); + Py_INCREF(&EnvironmentType); + PyModule_AddObject(m, "Environment", (PyObject *) (&EnvironmentType)); + Py_INCREF(&GeneratorType); + PyModule_AddObject(m, "_Generator", (PyObject *) (&GeneratorType)); + + PyModule_AddObject(m, "c_helpers", build_c_helpers_dict()); + + return MOD_SUCCESS_VAL(m); +} diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_hashtable.c b/cv/3d_detection/centerpoint/pytorch/numba/numba/_hashtable.c new file mode 100644 index 0000000000000000000000000000000000000000..76392f79ebb20f251a7ac478d042ed215803f30b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_hashtable.c @@ -0,0 +1,530 @@ +/* + * This file and _hashtable.h are 
from CPython 3.5. The symbols have been + * renamed from _Py_hashxxx to _Numba_hashxxx to avoid name clashes with + * the CPython definitions (including at runtime through dynamic linking). + * Those CPython APIs are private and can change in incompatible ways at + * any time. + * + * Command line used for renaming: + * $ sed -i -r 's/\b_Py_(has[h]table)/_Numba_\1/ig' numba/_hashtable.h numba/_hashtable.c + */ + +/* The implementation of the hash table (_Numba_hashtable_t) is based on the cfuhash + project: + http://sourceforge.net/projects/libcfu/ + + Copyright of cfuhash: + ---------------------------------- + Creation date: 2005-06-24 21:22:40 + Authors: Don + Change log: + + Copyright (c) 2005 Don Owens + All rights reserved. + + This code is released under the BSD license: + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the author nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + OF THE POSSIBILITY OF SUCH DAMAGE. + ---------------------------------- +*/ + +#include "_pymodule.h" +#include "_hashtable.h" + +#define HASHTABLE_MIN_SIZE 16 +#define HASHTABLE_HIGH 0.50 +#define HASHTABLE_LOW 0.10 +#define HASHTABLE_REHASH_FACTOR 2.0 / (HASHTABLE_LOW + HASHTABLE_HIGH) + +#define BUCKETS_HEAD(SLIST) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_HEAD(&(SLIST))) +#define TABLE_HEAD(HT, BUCKET) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_HEAD(&(HT)->buckets[BUCKET])) +#define ENTRY_NEXT(ENTRY) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_ITEM_NEXT(ENTRY)) +#define HASHTABLE_ITEM_SIZE(HT) \ + (sizeof(_Numba_hashtable_entry_t) + (HT)->data_size) + +/* Forward declaration */ +static void hashtable_rehash(_Numba_hashtable_t *ht); + +static void +_Py_slist_init(_Py_slist_t *list) +{ + list->head = NULL; +} + +static void +_Py_slist_prepend(_Py_slist_t *list, _Py_slist_item_t *item) +{ + item->next = list->head; + list->head = item; +} + +static void +_Py_slist_remove(_Py_slist_t *list, _Py_slist_item_t *previous, + _Py_slist_item_t *item) +{ + if (previous != NULL) + previous->next = item->next; + else + list->head = item->next; +} + +Py_uhash_t +_Numba_hashtable_hash_int(const void *key) +{ + return (Py_uhash_t)key; +} + +Py_uhash_t +_Numba_hashtable_hash_ptr(const void *key) +{ + return (Py_uhash_t)_Py_HashPointer((void *)key); +} + +int +_Numba_hashtable_compare_direct(const void *key, const _Numba_hashtable_entry_t *entry) +{ + return entry->key == key; +} + +/* makes 
sure the real size of the buckets array is a power of 2 */ +static size_t +round_size(size_t s) +{ + size_t i; + if (s < HASHTABLE_MIN_SIZE) + return HASHTABLE_MIN_SIZE; + i = 1; + while (i < s) + i <<= 1; + return i; +} + +_Numba_hashtable_t * +_Numba_hashtable_new_full(size_t data_size, size_t init_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func, + _Numba_hashtable_copy_data_func copy_data_func, + _Numba_hashtable_free_data_func free_data_func, + _Numba_hashtable_get_data_size_func get_data_size_func, + _Numba_hashtable_allocator_t *allocator) +{ + _Numba_hashtable_t *ht; + size_t buckets_size; + _Numba_hashtable_allocator_t alloc; + + if (allocator == NULL) { + alloc.malloc = PyMem_RawMalloc; + alloc.free = PyMem_RawFree; + } + else + alloc = *allocator; + + ht = (_Numba_hashtable_t *)alloc.malloc(sizeof(_Numba_hashtable_t)); + if (ht == NULL) + return ht; + + ht->num_buckets = round_size(init_size); + ht->entries = 0; + ht->data_size = data_size; + + buckets_size = ht->num_buckets * sizeof(ht->buckets[0]); + ht->buckets = alloc.malloc(buckets_size); + if (ht->buckets == NULL) { + alloc.free(ht); + return NULL; + } + memset(ht->buckets, 0, buckets_size); + + ht->hash_func = hash_func; + ht->compare_func = compare_func; + ht->copy_data_func = copy_data_func; + ht->free_data_func = free_data_func; + ht->get_data_size_func = get_data_size_func; + ht->alloc = alloc; + return ht; +} + +_Numba_hashtable_t * +_Numba_hashtable_new(size_t data_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func) +{ + return _Numba_hashtable_new_full(data_size, HASHTABLE_MIN_SIZE, + hash_func, compare_func, + NULL, NULL, NULL, NULL); +} + +size_t +_Numba_hashtable_size(_Numba_hashtable_t *ht) +{ + size_t size; + size_t hv; + + size = sizeof(_Numba_hashtable_t); + + /* buckets */ + size += ht->num_buckets * sizeof(_Numba_hashtable_entry_t *); + + /* entries */ + size += ht->entries * HASHTABLE_ITEM_SIZE(ht); 
+ + /* data linked from entries */ + if (ht->get_data_size_func) { + for (hv = 0; hv < ht->num_buckets; hv++) { + _Numba_hashtable_entry_t *entry; + + for (entry = TABLE_HEAD(ht, hv); entry; entry = ENTRY_NEXT(entry)) { + void *data; + + data = _Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry); + size += ht->get_data_size_func(data); + } + } + } + return size; +} + +#ifdef Py_DEBUG +void +_Numba_hashtable_print_stats(_Numba_hashtable_t *ht) +{ + size_t size; + size_t chain_len, max_chain_len, total_chain_len, nchains; + _Numba_hashtable_entry_t *entry; + size_t hv; + double load; + + size = _Numba_hashtable_size(ht); + + load = (double)ht->entries / ht->num_buckets; + + max_chain_len = 0; + total_chain_len = 0; + nchains = 0; + for (hv = 0; hv < ht->num_buckets; hv++) { + entry = TABLE_HEAD(ht, hv); + if (entry != NULL) { + chain_len = 0; + for (; entry; entry = ENTRY_NEXT(entry)) { + chain_len++; + } + if (chain_len > max_chain_len) + max_chain_len = chain_len; + total_chain_len += chain_len; + nchains++; + } + } + printf("hash table %p: entries=%" + PY_FORMAT_SIZE_T "u/%" PY_FORMAT_SIZE_T "u (%.0f%%), ", + ht, ht->entries, ht->num_buckets, load * 100.0); + if (nchains) + printf("avg_chain_len=%.1f, ", (double)total_chain_len / nchains); + printf("max_chain_len=%" PY_FORMAT_SIZE_T "u, %" PY_FORMAT_SIZE_T "u kB\n", + max_chain_len, size / 1024); +} +#endif + +/* Get an entry. Return NULL if the key does not exist. 
*/ +_Numba_hashtable_entry_t * +_Numba_hashtable_get_entry(_Numba_hashtable_t *ht, const void *key) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry; + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + for (entry = TABLE_HEAD(ht, index); entry != NULL; entry = ENTRY_NEXT(entry)) { + if (entry->key_hash == key_hash && ht->compare_func(key, entry)) + break; + } + + return entry; +} + +static int +_hashtable_pop_entry(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry, *previous; + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + previous = NULL; + for (entry = TABLE_HEAD(ht, index); entry != NULL; entry = ENTRY_NEXT(entry)) { + if (entry->key_hash == key_hash && ht->compare_func(key, entry)) + break; + previous = entry; + } + + if (entry == NULL) + return 0; + + _Py_slist_remove(&ht->buckets[index], (_Py_slist_item_t *)previous, + (_Py_slist_item_t *)entry); + ht->entries--; + + if (data != NULL) + _Numba_HASHTABLE_ENTRY_READ_DATA(ht, data, data_size, entry); + ht->alloc.free(entry); + + if ((float)ht->entries / (float)ht->num_buckets < HASHTABLE_LOW) + hashtable_rehash(ht); + return 1; +} + +/* Add a new entry to the hash. The key must not be present in the hash table. + Return 0 on success, -1 on memory error. */ +int +_Numba_hashtable_set(_Numba_hashtable_t *ht, const void *key, + void *data, size_t data_size) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry; + + assert(data != NULL || data_size == 0); +#ifndef NDEBUG + /* Don't write the assertion on a single line because it is interesting + to know the duplicated entry if the assertion failed. The entry can + be read using a debugger. 
*/ + entry = _Numba_hashtable_get_entry(ht, key); + assert(entry == NULL); +#endif + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + entry = ht->alloc.malloc(HASHTABLE_ITEM_SIZE(ht)); + if (entry == NULL) { + /* memory allocation failed */ + return -1; + } + + entry->key = (void *)key; + entry->key_hash = key_hash; + + assert(data_size == ht->data_size); + memcpy(_Numba_HASHTABLE_ENTRY_DATA(entry), data, data_size); + + _Py_slist_prepend(&ht->buckets[index], (_Py_slist_item_t*)entry); + ht->entries++; + + if ((float)ht->entries / (float)ht->num_buckets > HASHTABLE_HIGH) + hashtable_rehash(ht); + return 0; +} + +/* Get data from an entry. Copy entry data into data and return 1 if the entry + exists, return 0 if the entry does not exist. */ +int +_Numba_hashtable_get(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + _Numba_hashtable_entry_t *entry; + + assert(data != NULL); + + entry = _Numba_hashtable_get_entry(ht, key); + if (entry == NULL) + return 0; + _Numba_HASHTABLE_ENTRY_READ_DATA(ht, data, data_size, entry); + return 1; +} + +int +_Numba_hashtable_pop(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + assert(data != NULL); + assert(ht->free_data_func == NULL); + return _hashtable_pop_entry(ht, key, data, data_size); +} + +/* Delete an entry. The entry must exist. */ +void +_Numba_hashtable_delete(_Numba_hashtable_t *ht, const void *key) +{ +#ifndef NDEBUG + int found = _hashtable_pop_entry(ht, key, NULL, 0); + assert(found); +#else + (void)_hashtable_pop_entry(ht, key, NULL, 0); +#endif +} + +/* Prototype for a pointer to a function to be called foreach + key/value pair in the hash by hashtable_foreach(). Iteration + stops if a non-zero value is returned. 
*/ +int +_Numba_hashtable_foreach(_Numba_hashtable_t *ht, + int (*func) (_Numba_hashtable_entry_t *entry, void *arg), + void *arg) +{ + _Numba_hashtable_entry_t *entry; + size_t hv; + + for (hv = 0; hv < ht->num_buckets; hv++) { + for (entry = TABLE_HEAD(ht, hv); entry; entry = ENTRY_NEXT(entry)) { + int res = func(entry, arg); + if (res) + return res; + } + } + return 0; +} + +static void +hashtable_rehash(_Numba_hashtable_t *ht) +{ + size_t buckets_size, new_size, bucket; + _Py_slist_t *old_buckets = NULL; + size_t old_num_buckets; + + new_size = round_size((size_t)(ht->entries * HASHTABLE_REHASH_FACTOR)); + if (new_size == ht->num_buckets) + return; + + old_num_buckets = ht->num_buckets; + + buckets_size = new_size * sizeof(ht->buckets[0]); + old_buckets = ht->buckets; + ht->buckets = ht->alloc.malloc(buckets_size); + if (ht->buckets == NULL) { + /* cancel rehash on memory allocation failure */ + ht->buckets = old_buckets ; + /* memory allocation failed */ + return; + } + memset(ht->buckets, 0, buckets_size); + + ht->num_buckets = new_size; + + for (bucket = 0; bucket < old_num_buckets; bucket++) { + _Numba_hashtable_entry_t *entry, *next; + for (entry = BUCKETS_HEAD(old_buckets[bucket]); entry != NULL; entry = next) { + size_t entry_index; + + assert(ht->hash_func(entry->key) == entry->key_hash); + next = ENTRY_NEXT(entry); + entry_index = entry->key_hash & (new_size - 1); + + _Py_slist_prepend(&ht->buckets[entry_index], (_Py_slist_item_t*)entry); + } + } + + ht->alloc.free(old_buckets); +} + +void +_Numba_hashtable_clear(_Numba_hashtable_t *ht) +{ + _Numba_hashtable_entry_t *entry, *next; + size_t i; + + for (i=0; i < ht->num_buckets; i++) { + for (entry = TABLE_HEAD(ht, i); entry != NULL; entry = next) { + next = ENTRY_NEXT(entry); + if (ht->free_data_func) + ht->free_data_func(_Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry)); + ht->alloc.free(entry); + } + _Py_slist_init(&ht->buckets[i]); + } + ht->entries = 0; + hashtable_rehash(ht); +} + +void 
+_Numba_hashtable_destroy(_Numba_hashtable_t *ht) +{ + size_t i; + + for (i = 0; i < ht->num_buckets; i++) { + _Py_slist_item_t *entry = ht->buckets[i].head; + while (entry) { + _Py_slist_item_t *entry_next = entry->next; + if (ht->free_data_func) + ht->free_data_func(_Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry)); + ht->alloc.free(entry); + entry = entry_next; + } + } + + ht->alloc.free(ht->buckets); + ht->alloc.free(ht); +} + +/* Return a copy of the hash table */ +_Numba_hashtable_t * +_Numba_hashtable_copy(_Numba_hashtable_t *src) +{ + _Numba_hashtable_t *dst; + _Numba_hashtable_entry_t *entry; + size_t bucket; + int err; + void *data, *new_data; + + dst = _Numba_hashtable_new_full(src->data_size, src->num_buckets, + src->hash_func, src->compare_func, + src->copy_data_func, src->free_data_func, + src->get_data_size_func, &src->alloc); + if (dst == NULL) + return NULL; + + for (bucket=0; bucket < src->num_buckets; bucket++) { + entry = TABLE_HEAD(src, bucket); + for (; entry; entry = ENTRY_NEXT(entry)) { + if (src->copy_data_func) { + data = _Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry); + new_data = src->copy_data_func(data); + if (new_data != NULL) + err = _Numba_hashtable_set(dst, entry->key, + &new_data, src->data_size); + else + err = 1; + } + else { + data = _Numba_HASHTABLE_ENTRY_DATA(entry); + err = _Numba_hashtable_set(dst, entry->key, data, src->data_size); + } + if (err) { + _Numba_hashtable_destroy(dst); + return NULL; + } + } + } + return dst; +} + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_hashtable.h b/cv/3d_detection/centerpoint/pytorch/numba/numba/_hashtable.h new file mode 100644 index 0000000000000000000000000000000000000000..37430429d8730bd1c5c15784a472fb1e16b96967 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_hashtable.h @@ -0,0 +1,132 @@ +/* + * See _hashtable.c for more information about this file. 
+ */
+
+#ifndef Py_HASHTABLE_H
+#define Py_HASHTABLE_H
+
+/* The whole API is private */
+#ifndef Py_LIMITED_API
+
+/* Link node of a singly linked list. */
+typedef struct _Py_slist_item_s {
+    struct _Py_slist_item_s *next;
+} _Py_slist_item_t;
+
+/* Singly linked list, identified by its head node. */
+typedef struct {
+    _Py_slist_item_t *head;
+} _Py_slist_t;
+
+#define _Py_SLIST_ITEM_NEXT(ITEM) (((_Py_slist_item_t *)ITEM)->next)
+
+#define _Py_SLIST_HEAD(SLIST) (((_Py_slist_t *)SLIST)->head)
+
+/* Header of one table entry; the payload bytes are stored right after it. */
+typedef struct {
+    /* used by _Numba_hashtable_t.buckets to link entries */
+    _Py_slist_item_t _Py_slist_item;
+
+    const void *key;
+    Py_uhash_t key_hash;
+
+    /* data follows */
+} _Numba_hashtable_entry_t;
+
+/* Address of the payload stored immediately after the entry header. */
+#define _Numba_HASHTABLE_ENTRY_DATA(ENTRY) \
+        ((char *)(ENTRY) + sizeof(_Numba_hashtable_entry_t))
+
+/* Reads the payload as a single `void *` value. */
+#define _Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(ENTRY) \
+        (*(void **)_Numba_HASHTABLE_ENTRY_DATA(ENTRY))
+
+/* Copies DATA_SIZE payload bytes from ENTRY into DATA; asserts that
+   DATA_SIZE equals the table's data_size. */
+#define _Numba_HASHTABLE_ENTRY_READ_DATA(TABLE, DATA, DATA_SIZE, ENTRY) \
+    do { \
+        assert((DATA_SIZE) == (TABLE)->data_size); \
+        memcpy(DATA, _Numba_HASHTABLE_ENTRY_DATA(ENTRY), DATA_SIZE); \
+    } while (0)
+
+typedef Py_uhash_t (*_Numba_hashtable_hash_func) (const void *key);
+typedef int (*_Numba_hashtable_compare_func) (const void *key, const _Numba_hashtable_entry_t *he);
+typedef void* (*_Numba_hashtable_copy_data_func)(void *data);
+typedef void (*_Numba_hashtable_free_data_func)(void *data);
+typedef size_t (*_Numba_hashtable_get_data_size_func)(void *data);
+
+typedef struct {
+    /* allocate a memory block */
+    void* (*malloc) (size_t size);
+
+    /* release a memory block */
+    void (*free) (void *ptr);
+} _Numba_hashtable_allocator_t;
+
+/* The hash table: an array of `num_buckets` singly linked chains plus the
+   callbacks and allocator used to manage keys and payloads. */
+typedef struct {
+    size_t num_buckets;
+    size_t entries; /* Total number of entries in the table. */
+    _Py_slist_t *buckets;
+    size_t data_size;
+
+    _Numba_hashtable_hash_func hash_func;
+    _Numba_hashtable_compare_func compare_func;
+    _Numba_hashtable_copy_data_func copy_data_func;
+    _Numba_hashtable_free_data_func free_data_func;
+    _Numba_hashtable_get_data_size_func get_data_size_func;
+    _Numba_hashtable_allocator_t alloc;
+} _Numba_hashtable_t;
+
+/* hash and compare functions for integers and pointers */
+PyAPI_FUNC(Py_uhash_t) _Numba_hashtable_hash_ptr(const void *key);
+PyAPI_FUNC(Py_uhash_t) _Numba_hashtable_hash_int(const void *key);
+PyAPI_FUNC(int) _Numba_hashtable_compare_direct(const void *key, const _Numba_hashtable_entry_t *entry);
+
+PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_new(
+    size_t data_size,
+    _Numba_hashtable_hash_func hash_func,
+    _Numba_hashtable_compare_func compare_func);
+PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_new_full(
+    size_t data_size,
+    size_t init_size,
+    _Numba_hashtable_hash_func hash_func,
+    _Numba_hashtable_compare_func compare_func,
+    _Numba_hashtable_copy_data_func copy_data_func,
+    _Numba_hashtable_free_data_func free_data_func,
+    _Numba_hashtable_get_data_size_func get_data_size_func,
+    _Numba_hashtable_allocator_t *allocator);
+PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_copy(_Numba_hashtable_t *src);
+PyAPI_FUNC(void) _Numba_hashtable_clear(_Numba_hashtable_t *ht);
+PyAPI_FUNC(void) _Numba_hashtable_destroy(_Numba_hashtable_t *ht);
+
+typedef int (*_Numba_hashtable_foreach_func) (_Numba_hashtable_entry_t *entry, void *arg);
+
+PyAPI_FUNC(int) _Numba_hashtable_foreach(
+    _Numba_hashtable_t *ht,
+    _Numba_hashtable_foreach_func func, void *arg);
+PyAPI_FUNC(size_t) _Numba_hashtable_size(_Numba_hashtable_t *ht);
+
+PyAPI_FUNC(_Numba_hashtable_entry_t*) _Numba_hashtable_get_entry(
+    _Numba_hashtable_t *ht,
+    const void *key);
+PyAPI_FUNC(int) _Numba_hashtable_set(
+    _Numba_hashtable_t *ht,
+    const void *key,
+    void *data,
+    size_t data_size);
+PyAPI_FUNC(int) _Numba_hashtable_get(
+    _Numba_hashtable_t *ht,
+    const void *key,
+    void *data,
+    size_t data_size);
+PyAPI_FUNC(int) _Numba_hashtable_pop(
+    _Numba_hashtable_t *ht,
+    const void *key,
+    void *data,
+    size_t data_size);
+PyAPI_FUNC(void) _Numba_hashtable_delete(
+    _Numba_hashtable_t *ht,
+    const void *key);
+
+/* Convenience wrappers that pass the address and size of the DATA lvalue. */
+#define _Numba_HASHTABLE_SET(TABLE, KEY, DATA) \
+    _Numba_hashtable_set(TABLE, KEY, &(DATA), sizeof(DATA))
+
+#define _Numba_HASHTABLE_GET(TABLE, KEY, DATA) \
+    _Numba_hashtable_get(TABLE, KEY, &(DATA), sizeof(DATA))
+
+#endif   /* Py_LIMITED_API */
+
+#endif
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_helperlib.c b/cv/3d_detection/centerpoint/pytorch/numba/numba/_helperlib.c
new file mode 100644
index 0000000000000000000000000000000000000000..c1da22477b22064fab598a5d281e76b9986f3b9b
--- /dev/null
+++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_helperlib.c
@@ -0,0 +1,1186 @@
+/*
+ * Helper functions used by Numba at runtime.
+ * This C file is meant to be included after defining the
+ * NUMBA_EXPORT_FUNC() and NUMBA_EXPORT_DATA() macros.
+ */
+
+#include "_pymodule.h"
+#include
+#include
+#include
+#ifdef _MSC_VER
+    #define int64_t signed __int64
+    #define uint64_t unsigned __int64
+    #define uint32_t unsigned __int32
+#else
+    #include
+#endif
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#include
+#include
+#include
+
+#include "_arraystruct.h"
+
+/*
+ * Other helpers.
+ */
+
+
+/* Fix fmod() and fmodf() for windows x64 VC 9.0 (VS 2008)
+https://support.microsoft.com/en-us/kb/982107
+*/
+/* Callback installed by numba_set_fnclex(); NULL until then. */
+static void (*fnclex)(void) = NULL;
+
+/* fmod() preceded by the installed fnclex callback. The callback is skipped
+   when none has been installed, instead of calling through a NULL pointer. */
+NUMBA_EXPORT_FUNC(double)
+numba_fixed_fmod(double x, double y){
+    if (fnclex != NULL)
+        fnclex();  /* no inline asm in x64 =( */
+    return fmod(x, y);
+}
+
+/* Single-precision counterpart of numba_fixed_fmod(). */
+NUMBA_EXPORT_FUNC(float)
+numba_fixed_fmodf(float x, float y) {
+    if (fnclex != NULL)
+        fnclex();  /* no inline asm in x64 =( */
+    return fmodf(x, y);
+}
+
+/* Install the callback run before fmod()/fmodf() in the wrappers above. */
+NUMBA_EXPORT_FUNC(void)
+numba_set_fnclex(void *fn){
+    fnclex = fn;
+}
+
+/* provide 64-bit division function to 32-bit platforms */
+NUMBA_EXPORT_FUNC(int64_t)
+numba_sdiv(int64_t a, int64_t b) {
+    return a / b;
+}
+
+NUMBA_EXPORT_FUNC(uint64_t)
+numba_udiv(uint64_t a, uint64_t b) {
+    return a / b;
+}
+
+/* provide 64-bit remainder function to 32-bit platforms */
+NUMBA_EXPORT_FUNC(int64_t)
+numba_srem(int64_t a, int64_t b) {
+    return a % b;
+}
+
+NUMBA_EXPORT_FUNC(uint64_t)
+numba_urem(uint64_t a, uint64_t b) {
+    return a % b;
+}
+
+/* provide frexp and ldexp; these wrappers deal with special cases
+ * (zero, nan, infinity) directly, to sidestep platform differences.
+ */
+NUMBA_EXPORT_FUNC(double)
+numba_frexp(double x, int *exp)
+{
+    /* Non-finite and zero inputs are returned unchanged with *exp = 0. */
+    if (!Py_IS_FINITE(x) || !x)
+        *exp = 0;
+    else
+        x = frexp(x, exp);
+    return x;
+}
+
+NUMBA_EXPORT_FUNC(float)
+numba_frexpf(float x, int *exp)
+{
+    if (Py_IS_NAN(x) || Py_IS_INFINITY(x) || !x)
+        *exp = 0;
+    else
+        x = frexpf(x, exp);
+    return x;
+}
+
+NUMBA_EXPORT_FUNC(double)
+numba_ldexp(double x, int exp)
+{
+    /* Only call ldexp() for finite, non-zero x and a non-zero exponent. */
+    if (Py_IS_FINITE(x) && x && exp)
+        x = ldexp(x, exp);
+    return x;
+}
+
+NUMBA_EXPORT_FUNC(float)
+numba_ldexpf(float x, int exp)
+{
+    if (Py_IS_FINITE(x) && x && exp)
+        x = ldexpf(x, exp);
+    return x;
+}
+
+/* provide complex power */
+NUMBA_EXPORT_FUNC(void)
+numba_cpow(Py_complex *a, Py_complex *b, Py_complex *out) {
+    errno = 0;
+    *out = _Py_c_pow(*a, *b);
+    if (errno == EDOM) {
+        /* _Py_c_pow() doesn't bother returning the right value
+           in this case, as Python raises ZeroDivisionError */
+        out->real = out->imag = Py_NAN;
+    }
+}
+
+/* Single-precision complex power: widens to Py_complex, reuses numba_cpow()
+   and narrows the result back to float. */
+NUMBA_EXPORT_FUNC(void)
+numba_cpowf(npy_cfloat *a, npy_cfloat *b, npy_cfloat *out) {
+    Py_complex _a, _b, _out;
+    _a.real = npy_crealf(*a);
+    _a.imag = npy_cimagf(*a);
+    _b.real = npy_crealf(*b);
+    _b.imag = npy_cimagf(*b);
+    numba_cpow(&_a, &_b, &_out);
+    *out = npy_cpackf((float) _out.real, (float) _out.imag);
+}
+
+/* C99 math functions: redirect to system implementations */
+
+NUMBA_EXPORT_FUNC(double)
+numba_gamma(double x)
+{
+    return tgamma(x);
+}
+
+NUMBA_EXPORT_FUNC(float)
+numba_gammaf(float x)
+{
+    return tgammaf(x);
+}
+
+NUMBA_EXPORT_FUNC(double)
+numba_lgamma(double x)
+{
+    return lgamma(x);
+}
+
+NUMBA_EXPORT_FUNC(float)
+numba_lgammaf(float x)
+{
+    return lgammaf(x);
+}
+
+NUMBA_EXPORT_FUNC(double)
+numba_erf(double x)
+{
+    return erf(x);
+}
+
+NUMBA_EXPORT_FUNC(float)
+numba_erff(float x)
+{
+    return erff(x);
+}
+
+NUMBA_EXPORT_FUNC(double)
+numba_erfc(double x)
+{
+    return erfc(x);
+}
+
+NUMBA_EXPORT_FUNC(float)
+numba_erfcf(float x)
+{
+    return erfcf(x);
+}
+
+/* Note npy_signbit() is actually a polymorphic macro */
+NUMBA_EXPORT_FUNC(int)
+numba_signbitf(float a)
+{
+    return npy_signbit(a);
+}
+
+NUMBA_EXPORT_FUNC(int)
+numba_signbit(npy_double a)
+{
+    return npy_signbit(a);
+}
+
+/* Unpack any Python complex-like object into a Py_complex structure */
+/* Returns 1 on success and 0 on failure. Objects that are neither Python
+   complex nor NumPy complex scalars are converted through PyNumber_Float()
+   and get a zero imaginary part. */
+NUMBA_EXPORT_FUNC(int)
+numba_complex_adaptor(PyObject* obj, Py_complex *out) {
+    PyObject* fobj;
+    PyArray_Descr *dtype;
+    double val[2];
+
+    // Convert from python complex or numpy complex128
+    if (PyComplex_Check(obj)) {
+        out->real = PyComplex_RealAsDouble(obj);
+        out->imag = PyComplex_ImagAsDouble(obj);
+    }
+    // Convert from numpy complex64
+    else if (PyArray_IsScalar(obj, ComplexFloating)) {
+        dtype = PyArray_DescrFromScalar(obj);
+        if (dtype == NULL) {
+            return 0;
+        }
+        /* Cast the scalar to complex double; val receives (real, imag). */
+        if (PyArray_CastScalarDirect(obj, dtype, &val[0], NPY_CDOUBLE) < 0) {
+            Py_DECREF(dtype);
+            return 0;
+        }
+        out->real = val[0];
+        out->imag = val[1];
+        Py_DECREF(dtype);
+    } else {
+        fobj = PyNumber_Float(obj);
+        if (!fobj) return 0;
+        out->real = PyFloat_AsDouble(fobj);
+        out->imag = 0.;
+        Py_DECREF(fobj);
+    }
+    return 1;
+}
+
+/* Minimum PyBufferObject structure to hack inside it */
+typedef struct {
+    PyObject_HEAD
+    PyObject *b_base;
+    void *b_ptr;
+    Py_ssize_t b_size;
+    Py_ssize_t b_offset;
+} PyBufferObject_Hack;
+
+/*
+Get data address of record data buffer
+*/
+/* Fetches recordobj.data, fills *pbuf through the buffer protocol and returns
+   pbuf->buf, or NULL on error. On success *pbuf is left acquired --
+   presumably the caller releases it (see numba_release_buffer); confirm. */
+NUMBA_EXPORT_FUNC(void *)
+numba_extract_record_data(PyObject *recordobj, Py_buffer *pbuf) {
+    PyObject *attrdata;
+    void *ptr;
+
+    attrdata = PyObject_GetAttrString(recordobj, "data");
+    if (!attrdata) return NULL;
+
+    if (-1 == PyObject_GetBuffer(attrdata, pbuf, 0)){
+        Py_DECREF(attrdata);
+        return NULL;
+    } else {
+        ptr = pbuf->buf;
+    }
+    Py_DECREF(attrdata);
+    return ptr;
+}
+
+/*
+ * Return a record instance with dtype as the record type, and backed
+ * by a copy of the memory area pointed to by (pdata, size).
+ */
+NUMBA_EXPORT_FUNC(PyObject *)
+numba_recreate_record(void *pdata, int size, PyObject *dtype) {
+    PyObject *numpy = NULL;
+    PyObject *numpy_record = NULL;
+    PyObject *aryobj = NULL;
+    PyObject *dtypearg = NULL;
+    PyObject *record = NULL;
+    PyArray_Descr *descr = NULL;
+
+    if (dtype == NULL) {
+        PyErr_Format(PyExc_RuntimeError,
+                     "In 'numba_recreate_record', 'dtype' is NULL");
+        return NULL;
+    }
+
+    numpy = PyImport_ImportModuleNoBlock("numpy");
+    if (!numpy) goto CLEANUP;
+
+    numpy_record = PyObject_GetAttrString(numpy, "record");
+    if (!numpy_record) goto CLEANUP;
+
+    /* Build the descriptor from the (numpy.record, dtype) pair. */
+    dtypearg = PyTuple_Pack(2, numpy_record, dtype);
+    if (!dtypearg || !PyArray_DescrConverter(dtypearg, &descr))
+        goto CLEANUP;
+
+    /* This steals a reference to descr, so we don't have to DECREF it */
+    aryobj = PyArray_FromString(pdata, size, descr, 1, NULL);
+    if (!aryobj) goto CLEANUP;
+
+    /* The record is element 0 of the one-element array built above. */
+    record = PySequence_GetItem(aryobj, 0);
+
+CLEANUP:
+    Py_XDECREF(numpy);
+    Py_XDECREF(numpy_record);
+    Py_XDECREF(aryobj);
+    Py_XDECREF(dtypearg);
+
+    return record;
+}
+
+/* Fill *arystruct from a NumPy array: data pointer, item count, item size,
+   parent object, then ndim shape values followed by ndim stride values.
+   Returns 0 on success, -1 when obj is not an ndarray. No reference to obj
+   is taken here. */
+NUMBA_EXPORT_FUNC(int)
+numba_adapt_ndarray(PyObject *obj, arystruct_t* arystruct) {
+    PyArrayObject *ndary;
+    int i, ndim;
+    npy_intp *p;
+
+    if (!PyArray_Check(obj)) {
+        return -1;
+    }
+
+    ndary = (PyArrayObject*)obj;
+    ndim = PyArray_NDIM(ndary);
+
+    arystruct->data = PyArray_DATA(ndary);
+    arystruct->nitems = PyArray_SIZE(ndary);
+    arystruct->itemsize = PyArray_ITEMSIZE(ndary);
+    arystruct->parent = obj;
+    p = arystruct->shape_and_strides;
+    for (i = 0; i < ndim; i++, p++) {
+        *p = PyArray_DIM(ndary, i);
+    }
+    for (i = 0; i < ndim; i++, p++) {
+        *p = PyArray_STRIDE(ndary, i);
+    }
+    arystruct->meminfo = NULL;
+    return 0;
+}
+
+NUMBA_EXPORT_FUNC(int)
+numba_get_buffer(PyObject *obj, Py_buffer *buf)
+{
+    /* Ask for shape and strides, but no suboffsets */
+    return PyObject_GetBuffer(obj, buf, PyBUF_RECORDS_RO);
+}
+
+/* Fill *arystruct from an acquired Py_buffer; nitems is the product of the
+   buffer's shape entries (1 for a zero-dimensional buffer). */
+NUMBA_EXPORT_FUNC(void)
+numba_adapt_buffer(Py_buffer *buf, arystruct_t *arystruct)
+{
+    int i;
+    npy_intp *p;
+
+    arystruct->data = buf->buf;
+    arystruct->itemsize = buf->itemsize;
+    arystruct->parent = buf->obj;
+    arystruct->nitems = 1;
+    p = arystruct->shape_and_strides;
+    for (i = 0; i < buf->ndim; i++, p++) {
+        *p = buf->shape[i];
+        arystruct->nitems *= buf->shape[i];
+    }
+    for (i = 0; i < buf->ndim; i++, p++) {
+        *p = buf->strides[i];
+    }
+    arystruct->meminfo = NULL;
+}
+
+NUMBA_EXPORT_FUNC(void)
+numba_release_buffer(Py_buffer *buf)
+{
+    PyBuffer_Release(buf);
+}
+
+/* Wrap existing memory in a new ndarray with the given shape, strides and
+   type number. The `itemsize` argument is accepted but PyArray_New() is
+   called with an item size of 0. */
+NUMBA_EXPORT_FUNC(PyObject *)
+numba_ndarray_new(int nd,
+                  npy_intp *dims,   /* shape */
+                  npy_intp *strides,
+                  void* data,
+                  int type_num,
+                  int itemsize)
+{
+    PyObject *ndary;
+    int flags = NPY_ARRAY_BEHAVED;
+    ndary = PyArray_New((PyTypeObject*)&PyArray_Type, nd, dims, type_num,
+                        strides, data, 0, flags, NULL);
+    return ndary;
+}
+
+
+/*
+ * Handle reshaping of zero-sized array.
+ * See numba_attempt_nocopy_reshape() below.
+ */
+static int
+nocopy_empty_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
+                     npy_intp newnd, const npy_intp *newdims,
+                     npy_intp *newstrides, npy_intp itemsize,
+                     int is_f_order)
+{
+    int i;
+    /* Just make the strides vaguely reasonable
+     * (they can have any value in theory).
+     */
+    for (i = 0; i < newnd; i++)
+        newstrides[i] = itemsize;
+    return 1;  /* reshape successful */
+}
+
+/*
+ * Straight from Numpy's _attempt_nocopy_reshape()
+ * (np/core/src/multiarray/shape.c).
+ * Attempt to reshape an array without copying data
+ *
+ * This function should correctly handle all reshapes, including
+ * axes of length 1. Zero strides should work but are untested.
+ *
+ * If a copy is needed, returns 0
+ * If no copy is needed, returns 1 and fills `npy_intp *newstrides`
+ *     with appropriate strides
+ */
+
+NUMBA_EXPORT_FUNC(int)
+numba_attempt_nocopy_reshape(npy_intp nd, const npy_intp *dims, const npy_intp *strides,
+                             npy_intp newnd, const npy_intp *newdims,
+                             npy_intp *newstrides, npy_intp itemsize,
+                             int is_f_order)
+{
+    int oldnd;
+    npy_intp olddims[NPY_MAXDIMS];
+    npy_intp oldstrides[NPY_MAXDIMS];
+    npy_intp np, op, last_stride;
+    int oi, oj, ok, ni, nj, nk;
+
+    oldnd = 0;
+    /*
+     * Remove axes with dimension 1 from the old array. They have no effect
+     * but would need special cases since their strides do not matter.
+     */
+    for (oi = 0; oi < nd; oi++) {
+        if (dims[oi]!= 1) {
+            olddims[oldnd] = dims[oi];
+            oldstrides[oldnd] = strides[oi];
+            oldnd++;
+        }
+    }
+
+    np = 1;
+    for (ni = 0; ni < newnd; ni++) {
+        np *= newdims[ni];
+    }
+    op = 1;
+    for (oi = 0; oi < oldnd; oi++) {
+        op *= olddims[oi];
+    }
+    if (np != op) {
+        /* different total sizes; no hope */
+        return 0;
+    }
+
+    if (np == 0) {
+        /* the Numpy code does not handle 0-sized arrays */
+        return nocopy_empty_reshape(nd, dims, strides,
+                                    newnd, newdims, newstrides,
+                                    itemsize, is_f_order);
+    }
+
+    /* oi to oj and ni to nj give the axis ranges currently worked with */
+    oi = 0;
+    oj = 1;
+    ni = 0;
+    nj = 1;
+    while (ni < newnd && oi < oldnd) {
+        np = newdims[ni];
+        op = olddims[oi];
+
+        while (np != op) {
+            if (np < op) {
+                /* Misses trailing 1s, these are handled later */
+                np *= newdims[nj++];
+            } else {
+                op *= olddims[oj++];
+            }
+        }
+
+        /* Check whether the original axes can be combined */
+        for (ok = oi; ok < oj - 1; ok++) {
+            if (is_f_order) {
+                if (oldstrides[ok+1] != olddims[ok]*oldstrides[ok]) {
+                     /* not contiguous enough */
+                    return 0;
+                }
+            }
+            else {
+                /* C order */
+                if (oldstrides[ok] != olddims[ok+1]*oldstrides[ok+1]) {
+                    /* not contiguous enough */
+                    return 0;
+                }
+            }
+        }
+
+        /* Calculate new strides for all axes currently worked with */
+        if (is_f_order) {
+            newstrides[ni] = oldstrides[oi];
+            for (nk = ni + 1; nk < nj; nk++) {
+                newstrides[nk] = newstrides[nk - 1]*newdims[nk - 1];
+            }
+        }
+        else {
+            /* C order */
+            newstrides[nj - 1] = oldstrides[oj - 1];
+            for (nk = nj - 1; nk > ni; nk--) {
+                newstrides[nk - 1] = newstrides[nk]*newdims[nk];
+            }
+        }
+        ni = nj++;
+        oi = oj++;
+    }
+
+    /*
+     * Set strides corresponding to trailing 1s of the new shape.
+     */
+    if (ni >= 1) {
+        last_stride = newstrides[ni - 1];
+    }
+    else {
+        last_stride = itemsize;
+    }
+    /* ni == 0 means no axis group was processed (the old array had only
+     * length-1 axes, or the new shape is 0-d); newdims[ni - 1] would then
+     * read index -1, so only scale when a previous axis exists.
+     */
+    if (is_f_order && ni >= 1) {
+        last_stride *= newdims[ni - 1];
+    }
+    for (nk = ni; nk < newnd; nk++) {
+        newstrides[nk] = last_stride;
+    }
+
+    return 1;
+}
+
+/*
+ * Cython utilities.
+ */
+
+/* Fetch the address of the given function, as exposed by
+   a cython module */
+/* Returns NULL on failure. When the function is missing from __pyx_capi__
+   the lookup error is replaced by a ValueError naming it. */
+static void *
+import_cython_function(const char *module_name, const char *function_name)
+{
+    PyObject *module, *capi, *cobj;
+    void *res = NULL;
+    const char *capsule_name;
+
+    module = PyImport_ImportModule(module_name);
+    if (module == NULL)
+        return NULL;
+    capi = PyObject_GetAttrString(module, "__pyx_capi__");
+    Py_DECREF(module);
+    if (capi == NULL)
+        return NULL;
+    cobj = PyMapping_GetItemString(capi, (char *)function_name);
+    Py_DECREF(capi);
+    if (cobj == NULL) {
+        PyErr_Clear();
+        PyErr_Format(PyExc_ValueError,
+                     "No function '%s' found in __pyx_capi__ of '%s'",
+                     function_name, module_name);
+        return NULL;
+    }
+    /* 2.7+ => Cython exports a PyCapsule */
+    capsule_name = PyCapsule_GetName(cobj);
+    if (capsule_name != NULL) {
+        res = PyCapsule_GetPointer(cobj, capsule_name);
+    }
+    Py_DECREF(cobj);
+    return res;
+}
+
+NUMBA_EXPORT_FUNC(PyObject *)
+_numba_import_cython_function(PyObject *self, PyObject *args)
+{
+    const char *module_name;
+    const char *function_name;
+    void *p = NULL;
+    PyObject *res;
+
+    if (!PyArg_ParseTuple(args, "ss", &module_name, &function_name)) {
+        return NULL;
+    }
+    p = import_cython_function(module_name, function_name);
+    if (p == NULL) {
+        return NULL;
+    }
+    res =
PyLong_FromVoidPtr(p); + if (res == NULL) { + PyErr_SetString(PyExc_RuntimeError, + "Could not convert function address to int"); + return NULL; + } + return res; +} + +/* We use separate functions for datetime64 and timedelta64, to ensure + * proper type checking. + */ +NUMBA_EXPORT_FUNC(npy_int64) +numba_extract_np_datetime(PyObject *td) +{ + if (!PyArray_IsScalar(td, Datetime)) { + PyErr_SetString(PyExc_TypeError, + "expected a numpy.datetime64 object"); + return -1; + } + return PyArrayScalar_VAL(td, Timedelta); +} + +NUMBA_EXPORT_FUNC(npy_int64) +numba_extract_np_timedelta(PyObject *td) +{ + if (!PyArray_IsScalar(td, Timedelta)) { + PyErr_SetString(PyExc_TypeError, + "expected a numpy.timedelta64 object"); + return -1; + } + return PyArrayScalar_VAL(td, Timedelta); +} + +NUMBA_EXPORT_FUNC(PyObject *) +numba_create_np_datetime(npy_int64 value, int unit_code) +{ + PyDatetimeScalarObject *obj = (PyDatetimeScalarObject *) + PyArrayScalar_New(Datetime); + if (obj != NULL) { + obj->obval = value; + obj->obmeta.base = unit_code; + obj->obmeta.num = 1; + } + return (PyObject *) obj; +} + +NUMBA_EXPORT_FUNC(PyObject *) +numba_create_np_timedelta(npy_int64 value, int unit_code) +{ + PyTimedeltaScalarObject *obj = (PyTimedeltaScalarObject *) + PyArrayScalar_New(Timedelta); + if (obj != NULL) { + obj->obval = value; + obj->obmeta.base = unit_code; + obj->obmeta.num = 1; + } + return (PyObject *) obj; +} + +NUMBA_EXPORT_FUNC(uint64_t) +numba_fptoui(double x) { + /* First cast to signed int of the full width to make sure sign extension + happens (this can make a difference on some platforms...). 
*/ + return (uint64_t) (int64_t) x; +} + +NUMBA_EXPORT_FUNC(uint64_t) +numba_fptouif(float x) { + return (uint64_t) (int64_t) x; +} + +NUMBA_EXPORT_FUNC(void) +numba_gil_ensure(PyGILState_STATE *state) { + *state = PyGILState_Ensure(); +} + +NUMBA_EXPORT_FUNC(void) +numba_gil_release(PyGILState_STATE *state) { + PyGILState_Release(*state); +} + +NUMBA_EXPORT_FUNC(PyObject *) +numba_py_type(PyObject *obj) { + return (PyObject *) Py_TYPE(obj); +} + + +/* + * Functions for tagging an arbitrary Python object with an arbitrary pointer. + * These functions make strong lifetime assumptions, see below. + */ + +static PyObject *private_data_dict = NULL; + +static PyObject * +_get_private_data_dict(void) +{ + if (private_data_dict == NULL) + private_data_dict = PyDict_New(); + return private_data_dict; +} + +NUMBA_EXPORT_FUNC(void) +numba_set_pyobject_private_data(PyObject *obj, void *ptr) +{ + PyObject *dct = _get_private_data_dict(); + /* This assumes the reference to setobj is kept alive until the + call to numba_reset_set_private_data()! 
*/ + PyObject *key = PyLong_FromVoidPtr((void *) obj); + PyObject *value = PyLong_FromVoidPtr(ptr); + + if (!dct || !value || !key) + goto error; + if (PyDict_SetItem(dct, key, value)) + goto error; + Py_DECREF(key); + Py_DECREF(value); + return; + +error: + Py_FatalError("unable to set private data"); +} + +NUMBA_EXPORT_FUNC(void *) +numba_get_pyobject_private_data(PyObject *obj) +{ + PyObject *dct = _get_private_data_dict(); + PyObject *value, *key = PyLong_FromVoidPtr((void *) obj); + void *ptr; + if (!dct || !key) + goto error; + + value = PyDict_GetItem(dct, key); + Py_DECREF(key); + if (!value) + return NULL; + else { + ptr = PyLong_AsVoidPtr(value); + if (ptr == NULL && PyErr_Occurred()) + goto error; + return ptr; + } + +error: + Py_FatalError("unable to get private data"); + return NULL; +} + +NUMBA_EXPORT_FUNC(void) +numba_reset_pyobject_private_data(PyObject *obj) +{ + PyObject *dct = _get_private_data_dict(); + PyObject *key = PyLong_FromVoidPtr((void *) obj); + + if (!key) + goto error; + if (PyDict_DelItem(dct, key)) + PyErr_Clear(); + Py_DECREF(key); + return; + +error: + Py_FatalError("unable to reset private data"); +} + +NUMBA_EXPORT_FUNC(int) +numba_unpack_slice(PyObject *obj, + Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t *step) +{ + PySliceObject *slice = (PySliceObject *) obj; + if (!PySlice_Check(obj)) { + PyErr_Format(PyExc_TypeError, + "Expected a slice object, got '%s'", + Py_TYPE(slice)->tp_name); + return -1; + } +#define FETCH_MEMBER(NAME, DEFAULT) \ + if (slice->NAME != Py_None) { \ + Py_ssize_t v = PyNumber_AsSsize_t(slice->NAME, \ + PyExc_OverflowError); \ + if (v == -1 && PyErr_Occurred()) \ + return -1; \ + *NAME = v; \ + } \ + else { \ + *NAME = DEFAULT; \ + } + FETCH_MEMBER(step, 1) + FETCH_MEMBER(stop, (*step > 0) ? PY_SSIZE_T_MAX : PY_SSIZE_T_MIN) + FETCH_MEMBER(start, (*step > 0) ? 
0 : PY_SSIZE_T_MAX) + return 0; + +#undef FETCH_MEMBER +} + +NUMBA_EXPORT_FUNC(int) +numba_fatal_error(void) +{ + PyGILState_Ensure(); + Py_FatalError("in Numba-compiled function"); + return 0; /* unreachable */ +} + +/* Insert a frame into the traceback for (funcname, filename, lineno). */ +/* This function is CPython's _PyTraceback_Add, renamed, see: + * https://github.com/python/cpython/blob/d545869d084e70d4838310e79b52a25a72a1ca56/Python/traceback.c#L246 + * and modified for Python 2.x based on + * https://github.com/python/cpython/blob/2e1a34025cde19bddf12a2eac8fedb6afcca8339/Modules/_ctypes/callbacks.c#L151-L174 + */ +static void traceback_add(const char *funcname, const char *filename, int lineno) +{ + PyObject *globals = NULL; + PyCodeObject *code = NULL; + PyFrameObject *frame = NULL; + PyObject *exc, *val, *tb; + + /* Save and clear the current exception. Python functions must not be + called with an exception set. Calling Python functions happens when + the codec of the filesystem encoding is implemented in pure Python. */ + PyErr_Fetch(&exc, &val, &tb); + + globals = PyDict_New(); + if (!globals) + goto error; + code = PyCode_NewEmpty(filename, funcname, lineno); + if (!code) { + goto error; + } + frame = PyFrame_New(PyThreadState_Get(), code, globals, NULL); + Py_DECREF(globals); + Py_DECREF(code); + if (!frame) + goto error; + frame->f_lineno = lineno; + + PyErr_Restore(exc, val, tb); + PyTraceBack_Here(frame); + Py_DECREF(frame); + return; + +error: + _PyErr_ChainExceptions(exc, val, tb); +} + + +/* + * Add traceback information to *loc* to the active exception. + * loc can be NULL, which causes this function to become a no-op. 
+ */
+static
+void traceback_add_loc(PyObject *loc) {
+    const char *function_name_str = NULL, *filename_str = NULL;
+    PyObject *function_name = NULL, *filename = NULL, *lineno = NULL;
+    Py_ssize_t pos;
+
+    /* instance is instantiated/internal exception is raised, if loc is present
+     * add a frame for it into the traceback */
+    if(loc && loc != Py_None && PyTuple_Check(loc))
+    {
+        /* loc is read as (function name, filename, line number). The tuple
+         * length and the string conversions are not checked here.
+         * NOTE(review): presumably loc is always a well-formed 3-tuple
+         * produced by the compiler -- confirm. */
+        pos = 0;
+        function_name = PyTuple_GET_ITEM(loc, pos);
+        function_name_str = PyString_AsString(function_name);
+        pos = 1;
+        filename = PyTuple_GET_ITEM(loc, pos);
+        filename_str = PyString_AsString(filename);
+        pos = 2;
+        lineno = PyTuple_GET_ITEM(loc, pos);
+        traceback_add(function_name_str, filename_str, \
+                      (int)PyLong_AsLong(lineno));
+    }
+}
+
+/**
+ * Re-raise the current active exception.
+ * Called internal by process_raise() when *exc* is None.
+ */
+/* Returns 1 after restoring the handled exception as the current error, or 0
+ * after setting RuntimeError when the stored exception type is None. */
+static
+int reraise_exc_is_none(void) {
+    /* Reraise */
+    PyThreadState *tstate = PyThreadState_GET();
+    PyObject *tb, *type, *value;
+    /* The exception triple is read from tstate->exc_info on 3.7+ and from
+     * the thread state itself otherwise. */
+#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 7)
+    _PyErr_StackItem *tstate_exc = tstate->exc_info;
+#else
+    PyThreadState *tstate_exc = tstate;
+#endif
+    type = tstate_exc->exc_type;
+    value = tstate_exc->exc_value;
+    tb = tstate_exc->exc_traceback;
+    if (type == Py_None) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "No active exception to reraise");
+        return 0;
+    }
+    /* incref needed because PyErr_Restore DOES NOT */
+    Py_XINCREF(type);
+    Py_XINCREF(value);
+    Py_XINCREF(tb);
+    PyErr_Restore(type, value, tb);
+    return 1;
+}
+
+/*
+ * Set exception given the Exception type and the constructor argument.
+ * Equivalent to ``raise exc(value)``.
+ * PyExceptionClass_Check(exc) must be True.
+ * value can be NULL.
+ */ +static +int process_exception_class(PyObject *exc, PyObject *value) { + PyObject *type; + /* It is a class, type used here just as a tmp var */ + type = PyObject_CallObject(exc, value); + if (type == NULL){ + return 0; + } + if (!PyExceptionInstance_Check(type)) { + PyErr_SetString(PyExc_TypeError, + "exceptions must derive from BaseException"); + Py_DECREF(type); + return 0; + } + /* all ok, set type to the exc */ + Py_DECREF(type); + type = exc; + PyErr_SetObject(type, value); + return 1; +} + +/* + * Internal routine to process exceptions. + * exc cannot be NULL. It can be a None, Exception type, or Exception instance. + * value can be NULL for absent, or any PyObject valid for the exception. + */ +static +int process_raise(PyObject *exc, PyObject *value) { + /* exc is None */ + if (exc == Py_None) { + return reraise_exc_is_none(); + } + /* exc should be an exception class */ + else if (PyExceptionClass_Check(exc)) { + return process_exception_class(exc, value); + } + /* exc is an instance of an Exception */ + else if (PyExceptionInstance_Check(exc)) { + PyObject *type = PyExceptionInstance_Class(exc); + PyErr_SetObject(type, exc); + return 0; + } + else { + /* Not something you can raise. You get an exception + anyway, just not what you specified :-) */ + PyErr_SetString(PyExc_TypeError, + "exceptions must derive from BaseException"); + return 0; + } +} + +/* Logic for raising an arbitrary object. Adapted from CPython's ceval.c. + This *consumes* a reference count to its argument. */ +NUMBA_EXPORT_FUNC(int) +numba_do_raise(PyObject *exc_packed) +{ + int status; + PyObject *exc = NULL, *value = NULL, *loc = NULL; + + /* We support the following forms of raise: + raise + raise + raise */ + + /* could be a tuple from npm (some exc like thing, args, location) */ + if (PyTuple_CheckExact(exc_packed)) { + /* Unpack a (class/inst/tuple, arguments, location) tuple. 
*/ + if (!PyArg_ParseTuple(exc_packed, "OOO", &exc, &value, &loc)) { + traceback_add_loc(loc); + return 0; + } + } else { + /* could be a reraise or an exception from objmode */ + exc = exc_packed; + /* branch exit with value = NULL and loc = NULL */ + } + /* value is either NULL or borrowed */ + status = process_raise(exc, value); + traceback_add_loc(loc); + Py_DECREF(exc_packed); + return status; +} + +#ifdef PYCC_COMPILING +/* AOT avoid the use of `numba.core.serialize` */ +NUMBA_EXPORT_FUNC(PyObject *) +numba_unpickle(const char *data, int n, const char *hashed) +{ + PyObject *buf, *obj; + static PyObject *loads; + + /* Caching the pickle.loads function shaves a couple µs here. */ + if (loads == NULL) { + PyObject *picklemod; + picklemod = PyImport_ImportModule("pickle"); + if (picklemod == NULL) + return NULL; + loads = PyObject_GetAttrString(picklemod, "loads"); + Py_DECREF(picklemod); + if (loads == NULL) + return NULL; + } + + buf = PyBytes_FromStringAndSize(data, n); + if (buf == NULL) + return NULL; + obj = PyObject_CallFunctionObjArgs(loads, buf, NULL); + Py_DECREF(buf); + return obj; +} + +#else + +NUMBA_EXPORT_FUNC(PyObject *) +numba_unpickle(const char *data, int n, const char *hashed) +{ + PyObject *buf=NULL, *obj=NULL, *addr=NULL, *hashedbuf=NULL; + static PyObject *loads=NULL; + + /* Caching the pickle.loads function shaves a couple µs here. 
*/ + if (loads == NULL) { + PyObject *picklemod; + picklemod = PyImport_ImportModule("numba.core.serialize"); + if (picklemod == NULL) + return NULL; + loads = PyObject_GetAttrString(picklemod, "_numba_unpickle"); + Py_DECREF(picklemod); + if (loads == NULL) + return NULL; + } + + buf = PyBytes_FromStringAndSize(data, n); + if (buf == NULL) + return NULL; + /* SHA1 produces 160 bit or 20 bytes */ + hashedbuf = PyBytes_FromStringAndSize(hashed, 20); + if (hashedbuf == NULL) + goto error; + addr = PyLong_FromVoidPtr((void*)data); + if (addr == NULL) + goto error; + obj = PyObject_CallFunctionObjArgs(loads, addr, buf, hashedbuf, NULL); +error: + Py_XDECREF(addr); + Py_XDECREF(hashedbuf); + Py_DECREF(buf); + return obj; +} +#endif + +/* + * Unicode helpers + */ + +/* Developer note: + * + * The hash value of unicode objects is obtained via: + * ((PyASCIIObject *)(obj))->hash; + * The use comes from this definition: + * https://github.com/python/cpython/blob/6d43f6f081023b680d9db4542d19b9e382149f0a/Objects/unicodeobject.c#L119-L120 + * and it's used extensively throughout the `cpython/Object/unicodeobject.c` + * source, not least in `unicode_hash` itself: + * https://github.com/python/cpython/blob/6d43f6f081023b680d9db4542d19b9e382149f0a/Objects/unicodeobject.c#L11662-L11679 + * + * The Unicode string struct layouts are described here: + * https://github.com/python/cpython/blob/6d43f6f081023b680d9db4542d19b9e382149f0a/Include/cpython/unicodeobject.h#L82-L161 + * essentially, all the unicode string layouts start with a `PyASCIIObject` at + * offset 0 (as of commit 6d43f6f081023b680d9db4542d19b9e382149f0a, somewhere + * in the 3.8 development cycle). + * + * For safety against future CPython internal changes, the code checks that the + * _base members of the unicode structs are what is expected in 3.7, and that + * their offset is 0. It then walks the struct to the hash location to make sure + * the offset is indeed the same as PyASCIIObject->hash. 
+ * Note: The large condition in the if should evaluate to a compile time + * constant. + */ + +#define MEMBER_SIZE(structure, member) sizeof(((structure *)0)->member) + +NUMBA_EXPORT_FUNC(void *) +numba_extract_unicode(PyObject *obj, Py_ssize_t *length, int *kind, + unsigned int *ascii, Py_ssize_t *hash) { + if (!PyUnicode_READY(obj)) { + *length = PyUnicode_GET_LENGTH(obj); + *kind = PyUnicode_KIND(obj); + /* could also use PyUnicode_IS_ASCII but it is not publicly advertised in https://docs.python.org/3/c-api/unicode.html */ + *ascii = (unsigned int)(PyUnicode_MAX_CHAR_VALUE(obj) == (0x7f)); + /* this is here as a crude check for safe casting of all unicode string + * structs to a PyASCIIObject */ + if (MEMBER_SIZE(PyCompactUnicodeObject, _base) == sizeof(PyASCIIObject) && + MEMBER_SIZE(PyUnicodeObject, _base) == sizeof(PyCompactUnicodeObject) && + offsetof(PyCompactUnicodeObject, _base) == 0 && + offsetof(PyUnicodeObject, _base) == 0 && + offsetof(PyCompactUnicodeObject, _base.hash) == offsetof(PyASCIIObject, hash) && + offsetof(PyUnicodeObject, _base._base.hash) == offsetof(PyASCIIObject, hash) + ) { + /* Grab the hash from the type object cache, do not compute it. */ + *hash = ((PyASCIIObject *)(obj))->hash; + } + else { + /* cast is not safe, fail */ + return NULL; + } + return PyUnicode_DATA(obj); + } else { + return NULL; + } +} + +/* this is late included as it #defines e.g. 
SHIFT that should not impact + * the above */ +#include "_unicodetype_db.h" + +/* This function is a modified copy of the private function gettyperecord from + * CPython's Objects/unicodectype.c + * + * See:https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L45-L59 + */ +NUMBA_EXPORT_FUNC(void) +numba_gettyperecord(Py_UCS4 code, int *upper, int *lower, int *title, + unsigned char *decimal, unsigned char *digit, + unsigned short *flags) +{ + int index; + const numba_PyUnicode_TypeRecord *rec; + + if (code >= 0x110000) + index = 0; + else + { + index = index1[(code>>SHIFT)]; + index = index2[(index<upper; + *lower = rec->lower; + *title = rec->title; + *decimal = rec->decimal; + *digit = rec->digit; + *flags = rec->flags; +} + +/* This function provides a consistent access point for the + * _PyUnicode_ExtendedCase array defined in CPython's Objects/unicodectype.c + * and now also as numba_PyUnicode_ExtendedCase in Numba's _unicodetype_db.h + */ +NUMBA_EXPORT_FUNC(Py_UCS4) +numba_get_PyUnicode_ExtendedCase(int code) +{ + return numba_PyUnicode_ExtendedCase[code]; +} + +/* from _unicodetype_db.h */ +#undef SHIFT + +/* + * defined break point for gdb + */ +NUMBA_EXPORT_FUNC(void) +numba_gdb_breakpoint(void) { + /* does nothing */ +} + +/* + * Define bridge for all math functions + */ + +#define MATH_UNARY(F, R, A) \ + NUMBA_EXPORT_FUNC(R) numba_##F(A a) { return F(a); } +#define MATH_BINARY(F, R, A, B) \ + NUMBA_EXPORT_FUNC(R) numba_##F(A a, B b) { return F(a, b); } + +#include "mathnames.h" + +#undef MATH_UNARY +#undef MATH_BINARY + +/* + * BLAS and LAPACK wrappers + */ + +#include "_lapack.c" + +/* + * PRNG support + */ + +#include "_random.c" diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_helpermod.c b/cv/3d_detection/centerpoint/pytorch/numba/numba/_helpermod.c new file mode 100644 index 0000000000000000000000000000000000000000..bbbac20bddc8894e83e64e9c3f0386cd3c5867c5 --- /dev/null +++ 
b/cv/3d_detection/centerpoint/pytorch/numba/numba/_helpermod.c @@ -0,0 +1,307 @@ +/* +Expose all functions as pointers in a dedicated C extension. +*/ +#include "cext/cext.h" +/* Import _pymodule.h first, for a recent _POSIX_C_SOURCE */ +#include "_pymodule.h" + +#include +#ifdef _MSC_VER + #define false 0 + #define true 1 + #define bool int +#else + #include +#endif + +/* +Include C-extension here +*/ +#include "cext/cext.h" + +/* Numba C helpers */ +#include "_helperlib.c" + +/* Numpy C math function exports */ +#include "_npymath_exports.c" + +static PyObject * +build_c_helpers_dict(void) +{ + PyObject *dct = PyDict_New(); + if (dct == NULL) + goto error; + +#define _declpointer(name, value) do { \ + PyObject *o = PyLong_FromVoidPtr(value); \ + if (o == NULL) goto error; \ + if (PyDict_SetItemString(dct, name, o)) { \ + Py_DECREF(o); \ + goto error; \ + } \ + Py_DECREF(o); \ +} while (0) + +#define declmethod(func) _declpointer(#func, &numba_##func) + +#define declpointer(ptr) _declpointer(#ptr, &numba_##ptr) + + declmethod(fixed_fmod); + declmethod(fixed_fmodf); + declmethod(set_fnclex); + + declmethod(sdiv); + declmethod(srem); + declmethod(udiv); + declmethod(urem); + declmethod(frexp); + declmethod(frexpf); + declmethod(ldexp); + declmethod(ldexpf); + declmethod(cpow); + declmethod(cpowf); + declmethod(erf); + declmethod(erff); + declmethod(erfc); + declmethod(erfcf); + declmethod(gamma); + declmethod(gammaf); + declmethod(lgamma); + declmethod(lgammaf); + declmethod(signbit); + declmethod(signbitf); + declmethod(complex_adaptor); + declmethod(adapt_ndarray); + declmethod(ndarray_new); + declmethod(extract_record_data); + declmethod(get_buffer); + declmethod(adapt_buffer); + declmethod(release_buffer); + declmethod(extract_np_datetime); + declmethod(create_np_datetime); + declmethod(extract_np_timedelta); + declmethod(create_np_timedelta); + declmethod(recreate_record); + declmethod(fptoui); + declmethod(fptouif); + declmethod(gil_ensure); + 
declmethod(gil_release); + declmethod(fatal_error); + declmethod(py_type); + declmethod(unpack_slice); + declmethod(do_raise); + declmethod(unpickle); + declmethod(attempt_nocopy_reshape); + declmethod(get_pyobject_private_data); + declmethod(set_pyobject_private_data); + declmethod(reset_pyobject_private_data); + + /* BLAS / LAPACK */ + declmethod(xxgemm); + declmethod(xxgemv); + declmethod(xxdot); + declmethod(xxgetrf); + declmethod(ez_xxgetri); + declmethod(xxpotrf); + declmethod(ez_rgeev); + declmethod(ez_cgeev); + declmethod(ez_xxxevd); + declmethod(ez_gesdd); + declmethod(ez_geqrf); + declmethod(ez_xxgqr); + declmethod(ez_gelsd); + declmethod(xgesv); + declmethod(xxnrm2); + + /* PRNG support */ + declmethod(get_py_random_state); + declmethod(get_np_random_state); + declmethod(get_internal_random_state); + declmethod(rnd_shuffle); + declmethod(rnd_init); + declmethod(poisson_ptrs); + + /* Unicode string support */ + declmethod(extract_unicode); + declmethod(gettyperecord); + declmethod(get_PyUnicode_ExtendedCase); + + /* for gdb breakpoint */ + declmethod(gdb_breakpoint); + + /* for dictionary support */ + declmethod(test_dict); + declmethod(dict_new_minsize); + declmethod(dict_set_method_table); + declmethod(dict_free); + declmethod(dict_length); + declmethod(dict_lookup); + declmethod(dict_insert); + declmethod(dict_insert_ez); + declmethod(dict_delitem); + declmethod(dict_popitem); + declmethod(dict_iter_sizeof); + declmethod(dict_iter); + declmethod(dict_iter_next); + declmethod(dict_dump); + + /* for list support */ + declmethod(test_list); + declmethod(list_new); + declmethod(list_set_method_table); + declmethod(list_free); + declmethod(list_base_ptr); + declmethod(list_size_address); + declmethod(list_length); + declmethod(list_allocated); + declmethod(list_is_mutable); + declmethod(list_set_is_mutable); + declmethod(list_setitem); + declmethod(list_getitem); + declmethod(list_append); + declmethod(list_delitem); + declmethod(list_delete_slice); + 
declmethod(list_iter_sizeof); + declmethod(list_iter); + declmethod(list_iter_next); + +#define MATH_UNARY(F, R, A) declmethod(F); +#define MATH_BINARY(F, R, A, B) declmethod(F); + #include "mathnames.h" +#undef MATH_UNARY +#undef MATH_BINARY + +#undef declmethod + return dct; +error: + Py_XDECREF(dct); + return NULL; +} + +static int +register_npymath_exports(PyObject *dct) +{ + size_t count = sizeof(npymath_exports) / sizeof(npymath_exports[0]); + size_t i; + + for (i = 0; i < count; ++i) { + PyObject *ptr = PyLong_FromVoidPtr(npymath_exports[i].func); + if (ptr == NULL) + return -1; + if (PyDict_SetItemString(dct, npymath_exports[i].name, ptr) < 0) { + Py_DECREF(ptr); + return -1; + } + Py_DECREF(ptr); + } + + return 0; +} + +static PyObject * +build_npymath_exports_dict(void) +{ + PyObject *dct = PyDict_New(); + if (dct != NULL) { + if (register_npymath_exports(dct) < 0) + Py_CLEAR(dct); + } + return dct; +} + + +/* + * Helper to deal with flushing stdout + */ +PyAPI_FUNC(void) _numba_flush_stdout(void) ; + +void +_numba_flush_stdout(void) { + fflush(stdout); +} + + +static PyMethodDef ext_methods[] = { + { "rnd_get_state", (PyCFunction) _numba_rnd_get_state, METH_O, NULL }, + { "rnd_get_py_state_ptr", (PyCFunction) _numba_rnd_get_py_state_ptr, METH_NOARGS, NULL }, + { "rnd_get_np_state_ptr", (PyCFunction) _numba_rnd_get_np_state_ptr, METH_NOARGS, NULL }, + { "rnd_seed", (PyCFunction) _numba_rnd_seed, METH_VARARGS, NULL }, + { "rnd_set_state", (PyCFunction) _numba_rnd_set_state, METH_VARARGS, NULL }, + { "rnd_shuffle", (PyCFunction) _numba_rnd_shuffle, METH_O, NULL }, + { "_import_cython_function", (PyCFunction) _numba_import_cython_function, METH_VARARGS, NULL }, + { NULL }, +}; + +/* + * These functions are exported by the module's DLL, to exercise ctypes / cffi + * without relying on libc availability (see https://bugs.python.org/issue23606) + */ + +PyAPI_FUNC(double) _numba_test_sin(double x); +PyAPI_FUNC(double) _numba_test_cos(double x); 
+PyAPI_FUNC(double) _numba_test_exp(double x); +PyAPI_FUNC(void) _numba_test_vsquare(int n, double *x, double *out); +PyAPI_FUNC(double) _numba_test_funcptr(double (*func)(double)); +PyAPI_FUNC(bool) _numba_test_boolean(void); + +double _numba_test_sin(double x) +{ + return sin(x); +} + +double _numba_test_cos(double x) +{ + return cos(x); +} + +double _numba_test_exp(double x) +{ + return exp(x); +} + +void _numba_test_vsquare(int n, double *x, double *out) +{ + int i; + for (i = 0; i < n; i++) + out[i] = pow(x[i], 2.0); +} + +void _numba_test_vcube(int n, double *x, double *out) +{ + int i; + for (i = 0; i < n; i++) + out[i] = pow(x[i], 3.0); +} + +double _numba_test_funcptr(double (*func)(double)) +{ + return func(1.5); +} + +bool _numba_test_boolean() +{ + return true; +} + +MOD_INIT(_helperlib) { + PyObject *m; + MOD_DEF(m, "_helperlib", "No docs", ext_methods) + if (m == NULL) + return MOD_ERROR_VAL; + + import_array(); + + PyModule_AddObject(m, "c_helpers", build_c_helpers_dict()); + PyModule_AddObject(m, "npymath_exports", build_npymath_exports_dict()); + PyModule_AddIntConstant(m, "long_min", LONG_MIN); + PyModule_AddIntConstant(m, "long_max", LONG_MAX); + PyModule_AddIntConstant(m, "py_buffer_size", sizeof(Py_buffer)); + PyModule_AddIntConstant(m, "py_gil_state_size", sizeof(PyGILState_STATE)); + PyModule_AddIntConstant(m, "py_unicode_1byte_kind", PyUnicode_1BYTE_KIND); + PyModule_AddIntConstant(m, "py_unicode_2byte_kind", PyUnicode_2BYTE_KIND); + PyModule_AddIntConstant(m, "py_unicode_4byte_kind", PyUnicode_4BYTE_KIND); + PyModule_AddIntConstant(m, "py_unicode_wchar_kind", PyUnicode_WCHAR_KIND); + numba_rnd_ensure_global_init(); + + return MOD_SUCCESS_VAL(m); +} diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_lapack.c b/cv/3d_detection/centerpoint/pytorch/numba/numba/_lapack.c new file mode 100644 index 0000000000000000000000000000000000000000..6d5d183ecc015dfdb7fa235c9f640f34cd2a55d4 --- /dev/null +++ 
b/cv/3d_detection/centerpoint/pytorch/numba/numba/_lapack.c @@ -0,0 +1,1946 @@ +/* + * This file contains wrappers of BLAS and LAPACK functions + */ +/* + * BLAS calling helpers. The helpers can be called without the GIL held. + * The caller is responsible for checking arguments (especially dimensions). + */ + +/* Fast getters caching the value of a function's address after + the first call to import_cblas_function(). */ + +#define EMIT_GET_CBLAS_FUNC(name) \ + static void *cblas_ ## name = NULL; \ + static void *get_cblas_ ## name(void) { \ + if (cblas_ ## name == NULL) { \ + PyGILState_STATE st = PyGILState_Ensure(); \ + const char *mod = "scipy.linalg.cython_blas"; \ + cblas_ ## name = import_cython_function(mod, # name); \ + PyGILState_Release(st); \ + } \ + return cblas_ ## name; \ + } + +EMIT_GET_CBLAS_FUNC(dgemm) +EMIT_GET_CBLAS_FUNC(sgemm) +EMIT_GET_CBLAS_FUNC(cgemm) +EMIT_GET_CBLAS_FUNC(zgemm) +EMIT_GET_CBLAS_FUNC(dgemv) +EMIT_GET_CBLAS_FUNC(sgemv) +EMIT_GET_CBLAS_FUNC(cgemv) +EMIT_GET_CBLAS_FUNC(zgemv) +EMIT_GET_CBLAS_FUNC(ddot) +EMIT_GET_CBLAS_FUNC(sdot) +EMIT_GET_CBLAS_FUNC(cdotu) +EMIT_GET_CBLAS_FUNC(zdotu) +EMIT_GET_CBLAS_FUNC(cdotc) +EMIT_GET_CBLAS_FUNC(zdotc) +EMIT_GET_CBLAS_FUNC(snrm2) +EMIT_GET_CBLAS_FUNC(dnrm2) +EMIT_GET_CBLAS_FUNC(scnrm2) +EMIT_GET_CBLAS_FUNC(dznrm2) + + +#undef EMIT_GET_CBLAS_FUNC + +/* + * NOTE: On return value convention. + * For LAPACK wrapper development the following conventions are followed: + * Publicly exposed wrapper functions must return:- + * STATUS_ERROR : For an unrecoverable error e.g. caught by xerbla, this is so + * a Py_FatalError can be raised. + * STATUS_SUCCESS: For successful execution + * +n : Where n is an integer for a routine specific error + * (typically derived from an `info` argument). + * + * The caller is responsible for checking and handling the error status. 
+ */ + +/* return STATUS_SUCCESS if everything went ok */ +#define STATUS_SUCCESS (0) + +/* return STATUS_ERROR if an unrecoverable error is encountered */ +#define STATUS_ERROR (-1) + +/* + * A union of all the types accepted by BLAS/LAPACK for use in cases where + * stack based allocation is needed (typically for work space query args length + * 1). + */ +typedef union all_dtypes_ +{ + float s; + double d; + npy_complex64 c; + npy_complex128 z; +} all_dtypes; + +/* + * A checked PyMem_RawMalloc, ensures that the var is either NULL + * and an exception is raised, or that the allocation was successful. + * Returns zero on success for status checking. + */ +static int checked_PyMem_RawMalloc(void** var, size_t bytes) +{ + *var = NULL; + *var = PyMem_RawMalloc(bytes); + if (!(*var)) + { + { + PyGILState_STATE st = PyGILState_Ensure(); + + PyErr_SetString(PyExc_MemoryError, + "Insufficient memory for buffer allocation\ + required by LAPACK."); + PyGILState_Release(st); + } + return 1; + } + return 0; +} + +/* + * Checks that the char kind is valid (one of [s,d,c,z]) for use in blas/lapack. + * Returns zero on success for status checking. + */ +static int check_kind(char kind) +{ + switch (kind) + { + case 's': + case 'd': + case 'c': + case 'z': + break; + default: + { + PyGILState_STATE st = PyGILState_Ensure(); + PyErr_SetString(PyExc_ValueError, + "invalid data type (kind) found"); + PyGILState_Release(st); + } + return 1; + } + return 0; +} + +/* + * Guard macro for ensuring a valid data "kind" is being used. + * Place at the top of all routines with switches on "kind" that accept + * one of [s,d,c,z]. + */ +#define ENSURE_VALID_KIND(__KIND) \ +if (check_kind( __KIND )) \ +{ \ + return STATUS_ERROR; \ +} \ + +/* + * Checks that the char kind is valid for the real domain (one of [s,d]) + * for use in blas/lapack. + * Returns zero on success for status checking. 
+ */ +static int check_real_kind(char kind) +{ + switch (kind) + { + case 's': + case 'd': + break; + default: + { + PyGILState_STATE st = PyGILState_Ensure(); + PyErr_SetString(PyExc_ValueError, + "invalid data type (kind) found"); + PyGILState_Release(st); + } + return 1; + } + return 0; +} + +/* + * Guard macro for ensuring a valid data "kind" is being used for the + * real domain routines. + * Place at the top of all routines with switches on "kind" that accept + * one of [s,d]. + */ +#define ENSURE_VALID_REAL_KIND(__KIND) \ +if (check_real_kind( __KIND )) \ +{ \ + return STATUS_ERROR; \ +} \ + + +/* + * Checks that the char kind is valid for the complex domain (one of [c,z]) + * for use in blas/lapack. + * Returns zero on success for status checking. + */ +static int check_complex_kind(char kind) +{ + switch (kind) + { + case 'c': + case 'z': + break; + default: + { + PyGILState_STATE st = PyGILState_Ensure(); + PyErr_SetString(PyExc_ValueError, + "invalid data type (kind) found"); + PyGILState_Release(st); + } + return 1; + } + return 0; +} + +/* + * Guard macro for ensuring a valid data "kind" is being used for the + * real domain routines. + * Place at the top of all routines with switches on "kind" that accept + * one of [c,z]. + */ +#define ENSURE_VALID_COMPLEX_KIND(__KIND) \ +if (check_complex_kind( __KIND )) \ +{ \ + return STATUS_ERROR; \ +} \ + + +/* + * Checks that a function is found (i.e. not null) + * Returns zero on success for status checking. + */ +static int check_func(void *func) +{ + if (func == NULL) + { + PyGILState_STATE st = PyGILState_Ensure(); + PyErr_SetString(PyExc_RuntimeError, + "Specified LAPACK function could not be found."); + PyGILState_Release(st); + return STATUS_ERROR; + } + return STATUS_SUCCESS; +} + + +/* + * Guard macro for ensuring a valid function is found. 
+ */ +#define ENSURE_VALID_FUNC(__FUNC) \ +if (check_func(__FUNC)) \ +{ \ + return STATUS_ERROR; \ +} \ + + +/* + * Define what a Fortran "int" is, some LAPACKs have 64 bit integer support + * numba presently opts for a 32 bit C int. + * This definition allows scope for later configuration time magic to adjust + * the size of int at all the call sites. + */ +#define F_INT int + + +typedef float (*sdot_t)(F_INT *n, void *dx, F_INT *incx, void *dy, F_INT *incy); +typedef double (*ddot_t)(F_INT *n, void *dx, F_INT *incx, void *dy, F_INT + *incy); +typedef npy_complex64 (*cdot_t)(F_INT *n, void *dx, F_INT *incx, void *dy, + F_INT *incy); +typedef npy_complex128 (*zdot_t)(F_INT *n, void *dx, F_INT *incx, void *dy, + F_INT *incy); + +typedef void (*xxgemv_t)(char *trans, F_INT *m, F_INT *n, + void *alpha, void *a, F_INT *lda, + void *x, F_INT *incx, void *beta, + void *y, F_INT *incy); + +typedef void (*xxgemm_t)(char *transa, char *transb, + F_INT *m, F_INT *n, F_INT *k, + void *alpha, void *a, F_INT *lda, + void *b, F_INT *ldb, void *beta, + void *c, F_INT *ldc); + +typedef float (*sxnrm2_t) (F_INT *n, void *x, F_INT *incx); +typedef double (*dxnrm2_t) (F_INT *n, void *x, F_INT *incx); + +/* Vector * vector: result = dx * dy */ +NUMBA_EXPORT_FUNC(int) +numba_xxdot(char kind, char conjugate, Py_ssize_t n, void *dx, void *dy, + void *result) +{ + void *raw_func = NULL; + F_INT _n; + F_INT inc = 1; + + ENSURE_VALID_KIND(kind) + + switch (kind) + { + case 's': + raw_func = get_cblas_sdot(); + break; + case 'd': + raw_func = get_cblas_ddot(); + break; + case 'c': + raw_func = conjugate ? get_cblas_cdotc() : get_cblas_cdotu(); + break; + case 'z': + raw_func = conjugate ? 
get_cblas_zdotc() : get_cblas_zdotu(); + break; + } + ENSURE_VALID_FUNC(raw_func) + + _n = (F_INT) n; + + switch (kind) + { + case 's': + *(float *) result = (*(sdot_t) raw_func)(&_n, dx, &inc, dy, &inc);; + break; + case 'd': + *(double *) result = (*(ddot_t) raw_func)(&_n, dx, &inc, dy, &inc);; + break; + case 'c': + *(npy_complex64 *) result = (*(cdot_t) raw_func)(&_n, dx, &inc, dy,\ + &inc);; + break; + case 'z': + *(npy_complex128 *) result = (*(zdot_t) raw_func)(&_n, dx, &inc,\ + dy, &inc);; + break; + } + + return 0; +} + +/* Matrix * vector: y = alpha * a * x + beta * y */ +NUMBA_EXPORT_FUNC(int) +numba_xxgemv(char kind, char trans, Py_ssize_t m, Py_ssize_t n, + void *alpha, void *a, Py_ssize_t lda, + void *x, void *beta, void *y) +{ + void *raw_func = NULL; + F_INT _m, _n; + F_INT _lda; + F_INT inc = 1; + + ENSURE_VALID_KIND(kind) + + switch (kind) + { + case 's': + raw_func = get_cblas_sgemv(); + break; + case 'd': + raw_func = get_cblas_dgemv(); + break; + case 'c': + raw_func = get_cblas_cgemv(); + break; + case 'z': + raw_func = get_cblas_zgemv(); + break; + } + ENSURE_VALID_FUNC(raw_func) + + _m = (F_INT) m; + _n = (F_INT) n; + _lda = (F_INT) lda; + + (*(xxgemv_t) raw_func)(&trans, &_m, &_n, alpha, a, &_lda, + x, &inc, beta, y, &inc); + return 0; +} + +/* Matrix * matrix: c = alpha * a * b + beta * c */ +NUMBA_EXPORT_FUNC(int) +numba_xxgemm(char kind, char transa, char transb, + Py_ssize_t m, Py_ssize_t n, Py_ssize_t k, + void *alpha, void *a, Py_ssize_t lda, + void *b, Py_ssize_t ldb, void *beta, + void *c, Py_ssize_t ldc) +{ + void *raw_func = NULL; + F_INT _m, _n, _k; + F_INT _lda, _ldb, _ldc; + + ENSURE_VALID_KIND(kind) + + switch (kind) + { + case 's': + raw_func = get_cblas_sgemm(); + break; + case 'd': + raw_func = get_cblas_dgemm(); + break; + case 'c': + raw_func = get_cblas_cgemm(); + break; + case 'z': + raw_func = get_cblas_zgemm(); + break; + } + ENSURE_VALID_FUNC(raw_func) + + _m = (F_INT) m; + _n = (F_INT) n; + _k = (F_INT) k; + _lda = 
(F_INT) lda; + _ldb = (F_INT) ldb; + _ldc = (F_INT) ldc; + + (*(xxgemm_t) raw_func)(&transa, &transb, &_m, &_n, &_k, alpha, a, &_lda, + b, &_ldb, beta, c, &_ldc); + return 0; +} + + +/* L2-norms */ +NUMBA_EXPORT_FUNC(F_INT) +numba_xxnrm2(char kind, Py_ssize_t n, void * x, Py_ssize_t incx, void * result) +{ + void *raw_func = NULL; + F_INT _incx; + F_INT _n; + + ENSURE_VALID_KIND(kind) + + switch (kind) + { + case 's': + raw_func = get_cblas_snrm2(); + break; + case 'd': + raw_func = get_cblas_dnrm2(); + break; + case 'c': + raw_func = get_cblas_scnrm2(); + break; + case 'z': + raw_func = get_cblas_dznrm2(); + break; + } + ENSURE_VALID_FUNC(raw_func) + + _n = (F_INT) n; + _incx = (F_INT) incx; + + switch (kind) + { + case 's': + *(float *) result = (*(sxnrm2_t) raw_func)(&_n, x, &_incx);; + break; + case 'd': + *(double *) result = (*(dxnrm2_t) raw_func)(&_n, x, &_incx);; + break; + case 'c': + *(float *) result = (*(sxnrm2_t) raw_func)(&_n, x, &_incx);; + break; + case 'z': + *(double *) result = (*(dxnrm2_t) raw_func)(&_n, x, &_incx);; + break; + } + + return 0; +} + + +/* + * LAPACK calling helpers. The helpers can be called without the GIL held. + * The caller is responsible for checking arguments (especially dimensions). + */ + +/* Fast getters caching the value of a function's address after + the first call to import_clapack_function(). */ + +#define EMIT_GET_CLAPACK_FUNC(name) \ + static void *clapack_ ## name = NULL; \ + static void *get_clapack_ ## name(void) { \ + if (clapack_ ## name == NULL) { \ + PyGILState_STATE st = PyGILState_Ensure(); \ + const char *mod = "scipy.linalg.cython_lapack"; \ + clapack_ ## name = import_cython_function(mod, # name); \ + PyGILState_Release(st); \ + } \ + return clapack_ ## name; \ + } + +/* Computes an LU factorization of a general M-by-N matrix A + * using partial pivoting with row interchanges. 
+ */ +EMIT_GET_CLAPACK_FUNC(sgetrf) +EMIT_GET_CLAPACK_FUNC(dgetrf) +EMIT_GET_CLAPACK_FUNC(cgetrf) +EMIT_GET_CLAPACK_FUNC(zgetrf) + +/* Computes the inverse of a matrix using the LU factorization + * computed by xGETRF. + */ +EMIT_GET_CLAPACK_FUNC(sgetri) +EMIT_GET_CLAPACK_FUNC(dgetri) +EMIT_GET_CLAPACK_FUNC(cgetri) +EMIT_GET_CLAPACK_FUNC(zgetri) + +/* Compute Cholesky factorizations */ +EMIT_GET_CLAPACK_FUNC(spotrf) +EMIT_GET_CLAPACK_FUNC(dpotrf) +EMIT_GET_CLAPACK_FUNC(cpotrf) +EMIT_GET_CLAPACK_FUNC(zpotrf) + +/* Computes for an N-by-N real nonsymmetric matrix A, the + * eigenvalues and, optionally, the left and/or right eigenvectors. + */ +EMIT_GET_CLAPACK_FUNC(sgeev) +EMIT_GET_CLAPACK_FUNC(dgeev) +EMIT_GET_CLAPACK_FUNC(cgeev) +EMIT_GET_CLAPACK_FUNC(zgeev) + +/* Computes for an N-by-N Hermitian matrix A, the + * eigenvalues and, optionally, the left and/or right eigenvectors. + */ +EMIT_GET_CLAPACK_FUNC(ssyevd) +EMIT_GET_CLAPACK_FUNC(dsyevd) +EMIT_GET_CLAPACK_FUNC(cheevd) +EMIT_GET_CLAPACK_FUNC(zheevd) + +/* Computes generalised SVD */ +EMIT_GET_CLAPACK_FUNC(sgesdd) +EMIT_GET_CLAPACK_FUNC(dgesdd) +EMIT_GET_CLAPACK_FUNC(cgesdd) +EMIT_GET_CLAPACK_FUNC(zgesdd) + +/* Computes QR decompositions */ +EMIT_GET_CLAPACK_FUNC(sgeqrf) +EMIT_GET_CLAPACK_FUNC(dgeqrf) +EMIT_GET_CLAPACK_FUNC(cgeqrf) +EMIT_GET_CLAPACK_FUNC(zgeqrf) + +/* Computes columns of Q from elementary reflectors produced by xgeqrf() (QR). 
+ */ +EMIT_GET_CLAPACK_FUNC(sorgqr) +EMIT_GET_CLAPACK_FUNC(dorgqr) +EMIT_GET_CLAPACK_FUNC(cungqr) +EMIT_GET_CLAPACK_FUNC(zungqr) + +/* Computes the minimum norm solution to linear least squares problems */ +EMIT_GET_CLAPACK_FUNC(sgelsd) +EMIT_GET_CLAPACK_FUNC(dgelsd) +EMIT_GET_CLAPACK_FUNC(cgelsd) +EMIT_GET_CLAPACK_FUNC(zgelsd) + +// Computes the solution to a system of linear equations +EMIT_GET_CLAPACK_FUNC(sgesv) +EMIT_GET_CLAPACK_FUNC(dgesv) +EMIT_GET_CLAPACK_FUNC(cgesv) +EMIT_GET_CLAPACK_FUNC(zgesv) + + +#undef EMIT_GET_CLAPACK_FUNC + +typedef void (*xxgetrf_t)(F_INT *m, F_INT *n, void *a, F_INT *lda, F_INT *ipiv, + F_INT *info); + +typedef void (*xxgetri_t)(F_INT *n, void *a, F_INT *lda, F_INT *ipiv, void + *work, F_INT *lwork, F_INT *info); + +typedef void (*xxpotrf_t)(char *uplo, F_INT *n, void *a, F_INT *lda, F_INT + *info); + +typedef void (*rgeev_t)(char *jobvl, char *jobvr, F_INT *n, void *a, F_INT *lda, + void *wr, void *wi, void *vl, F_INT *ldvl, void *vr, + F_INT *ldvr, void *work, F_INT *lwork, F_INT *info); + +typedef void (*cgeev_t)(char *jobvl, char *jobvr, F_INT *n, void *a, F_INT + *lda, void *w, void *vl, F_INT *ldvl, void *vr, + F_INT *ldvr, void *work, F_INT *lwork, void *rwork, + F_INT *info); + +typedef void (*rgesdd_t)(char *jobz, F_INT *m, F_INT *n, void *a, F_INT *lda, + void *s, void *u, F_INT *ldu, void *vt, F_INT *ldvt, + void *work, F_INT *lwork, F_INT *iwork, F_INT *info); + +typedef void (*cgesdd_t)(char *jobz, F_INT *m, F_INT *n, void *a, F_INT *lda, + void *s, void * u, F_INT *ldu, void * vt, F_INT *ldvt, + void *work, F_INT *lwork, void *rwork, F_INT *iwork, + F_INT *info); + +typedef void (*xsyevd_t)(char *jobz, char *uplo, F_INT *n, void *a, F_INT *lda, + void *w, void *work, F_INT *lwork, F_INT *iwork, + F_INT *liwork, F_INT *info); + +typedef void (*xheevd_t)(char *jobz, char *uplo, F_INT *n, void *a, F_INT *lda, + void *w, void *work, F_INT *lwork, void *rwork, + F_INT *lrwork, F_INT *iwork, F_INT *liwork, + F_INT *info); 
+ +typedef void (*xgeqrf_t)(F_INT *m, F_INT *n, void *a, F_INT *lda, void *tau, + void *work, F_INT *lwork, F_INT *info); + +typedef void (*xxxgqr_t)(F_INT *m, F_INT *n, F_INT *k, void *a, F_INT *lda, + void *tau, void *work, F_INT *lwork, F_INT *info); + +typedef void (*rgelsd_t)(F_INT *m, F_INT *n, F_INT *nrhs, void *a, F_INT *lda, + void *b, F_INT *ldb, void *s, void *rcond, F_INT *rank, + void *work, F_INT *lwork, F_INT *iwork, F_INT *info); + +typedef void (*cgelsd_t)(F_INT *m, F_INT *n, F_INT *nrhs, void *a, F_INT *lda, + void *b, F_INT *ldb, void *s, void *rcond, F_INT *rank, + void *work, F_INT *lwork, void *rwork, F_INT *iwork, + F_INT *info); + +typedef void (*xgesv_t)(F_INT *n, F_INT *nrhs, void *a, F_INT *lda, F_INT *ipiv, + void *b, F_INT *ldb, F_INT *info); + + + +/* + * kind_size() + * gets the data size appropriate for a specified kind. + * + * Input: + * kind - the kind, one of: + * (s, d, c, z) = (float, double, complex, double complex). + * + * Returns: + * data_size - the appropriate data size. + * + */ +static size_t kind_size(char kind) +{ + size_t data_size = 0; + switch (kind) + { + case 's': + data_size = sizeof(float); + break; + case 'd': + data_size = sizeof(double); + break; + case 'c': + data_size = sizeof(npy_complex64); + break; + case 'z': + data_size = sizeof(npy_complex128); + break; + } + return data_size; + +} + +/* + * underlying_float_kind() + * gets the underlying float kind for a given kind. + * + * Input: + * kind - the kind, one of: + * (s, d, c, z) = (float, double, complex, double complex). + * + * Returns: + * underlying_float_kind - the underlying float kind, one of: + * (s, d) = (float, double). + * + * This function essentially provides a map between the char kind + * of a type and the char kind of the underlying float used in the + * type. 
Essentially: + * --------------- + * Input -> Output + * --------------- + * s -> s + * d -> d + * c -> s + * z -> d + * --------------- + * + */ +static char underlying_float_kind(char kind) +{ + switch(kind) + { + case 's': + case 'c': + return 's'; + case 'd': + case 'z': + return 'd'; + default: + { + PyGILState_STATE st = PyGILState_Ensure(); + PyErr_SetString(PyExc_ValueError, + "invalid kind in underlying_float_kind()"); + PyGILState_Release(st); + } + } + return -1; +} + +/* + * cast_from_X() + * cast from a kind (s, d, c, z) = (float, double, complex, double complex) + * to a Fortran integer. + * + * Parameters: + * kind the kind of val + * val a pointer to the value to cast + * + * Returns: + * A Fortran int from a cast of val (in complex case, takes the real part). + * + * Struct access via non c99 (python only) cmplx types, used for compatibility. + */ +static F_INT +cast_from_X(char kind, void *val) +{ + switch(kind) + { + case 's': + return (F_INT)(*((float *) val)); + case 'd': + return (F_INT)(*((double *) val)); + case 'c': + return (F_INT)(*((npy_complex64 *)val)).real; + case 'z': + return (F_INT)(*((npy_complex128 *)val)).real; + default: + { + PyGILState_STATE st = PyGILState_Ensure(); + PyErr_SetString(PyExc_ValueError, + "invalid kind in cast"); + PyGILState_Release(st); + } + } + return -1; +} + + +#define CATCH_LAPACK_INVALID_ARG(__routine, info) \ + do { \ + if (info < 0) { \ + PyGILState_STATE st = PyGILState_Ensure(); \ + PyErr_Format(PyExc_RuntimeError, \ + "LAPACK Error: Routine " #__routine ". On input %d\n",\ + -(int) info); \ + PyGILState_Release(st); \ + return STATUS_ERROR; \ + } \ + } while(0) + +/* Compute LU decomposition of A + * NOTE: ipiv is an array of Fortran integers allocated by the caller, + * which is therefore expected to use the right dtype. 
+ */ +NUMBA_EXPORT_FUNC(int) +numba_xxgetrf(char kind, Py_ssize_t m, Py_ssize_t n, void *a, Py_ssize_t lda, + F_INT *ipiv) +{ + void *raw_func = NULL; + F_INT _m, _n, _lda, info; + + ENSURE_VALID_KIND(kind) + + switch (kind) + { + case 's': + raw_func = get_clapack_sgetrf(); + break; + case 'd': + raw_func = get_clapack_dgetrf(); + break; + case 'c': + raw_func = get_clapack_cgetrf(); + break; + case 'z': + raw_func = get_clapack_zgetrf(); + break; + } + ENSURE_VALID_FUNC(raw_func) + + _m = (F_INT) m; + _n = (F_INT) n; + _lda = (F_INT) lda; + + (*(xxgetrf_t) raw_func)(&_m, &_n, a, &_lda, ipiv, &info); + CATCH_LAPACK_INVALID_ARG("xxgetrf", info); + + return (int)info; +} + +/* Compute the inverse of a matrix given its LU decomposition + * Args are as per LAPACK. + */ +static int +numba_raw_xxgetri(char kind, F_INT n, void *a, F_INT lda, + F_INT *ipiv, void *work, F_INT *lwork, F_INT *info) +{ + void *raw_func = NULL; + + ENSURE_VALID_KIND(kind) + + switch (kind) + { + case 's': + raw_func = get_clapack_sgetri(); + break; + case 'd': + raw_func = get_clapack_dgetri(); + break; + case 'c': + raw_func = get_clapack_cgetri(); + break; + case 'z': + raw_func = get_clapack_zgetri(); + break; + } + ENSURE_VALID_FUNC(raw_func) + + (*(xxgetri_t) raw_func)(&n, a, &lda, ipiv, work, lwork, info); + + return 0; +} + +/* Compute the inverse of a matrix from the factorization provided by + * xxgetrf. (see numba_xxgetrf() about ipiv) + * Args are as per LAPACK. 
+ */
+NUMBA_EXPORT_FUNC(int)
+numba_ez_xxgetri(char kind, Py_ssize_t n, void *a, Py_ssize_t lda,
+                 F_INT *ipiv)
+{
+    F_INT _n, _lda;
+    F_INT lwork = -1; /* -1 on the first call asks only for the workspace size */
+    F_INT info = 0;
+    size_t base_size = -1;
+    void * work = NULL;
+    all_dtypes stack_slot; /* one-element scratch that receives the size answer */
+
+    ENSURE_VALID_KIND(kind)
+
+    _n = (F_INT)n;
+    _lda = (F_INT)lda;
+
+    base_size = kind_size(kind);
+
+    work = &stack_slot;
+    /* First call: workspace query. NOTE(review): the raw call's return value is not checked. */
+    numba_raw_xxgetri(kind, _n, a, _lda, ipiv, work, &lwork, &info);
+    CATCH_LAPACK_INVALID_ARG("xxgetri", info);
+
+    lwork = cast_from_X(kind, work); /* the size comes back as a value of this kind */
+
+    if (checked_PyMem_RawMalloc(&work, base_size * lwork))
+    {
+        return STATUS_ERROR;
+    }
+    /* Second call: the actual computation with the allocated workspace. */
+    numba_raw_xxgetri(kind, _n, a, _lda, ipiv, work, &lwork, &info);
+    PyMem_RawFree(work); /* released before the macro below gets a chance to return */
+    CATCH_LAPACK_INVALID_ARG("xxgetri", info);
+
+    return (int)info;
+}
+
+/* Compute the Cholesky factorization of a matrix; uplo is forwarded to the potrf routine unchanged. */
+NUMBA_EXPORT_FUNC(int)
+numba_xxpotrf(char kind, char uplo, Py_ssize_t n, void *a, Py_ssize_t lda)
+{
+    void *raw_func = NULL;
+    F_INT _n, _lda, info;
+
+    ENSURE_VALID_KIND(kind)
+
+    switch (kind) /* select the s/d/c/z variant of potrf */
+    {
+        case 's':
+            raw_func = get_clapack_spotrf();
+            break;
+        case 'd':
+            raw_func = get_clapack_dpotrf();
+            break;
+        case 'c':
+            raw_func = get_clapack_cpotrf();
+            break;
+        case 'z':
+            raw_func = get_clapack_zpotrf();
+            break;
+    }
+    ENSURE_VALID_FUNC(raw_func)
+
+    _n = (F_INT) n;
+    _lda = (F_INT) lda;
+
+    (*(xxpotrf_t) raw_func)(&uplo, &_n, a, &_lda, &info);
+    CATCH_LAPACK_INVALID_ARG("xxpotrf", info);
+    return (int)info; /* a negative info has already produced an error return above */
+}
+
+
+/* real space eigen systems info from dgeev/sgeev; returns 0 after the call, LAPACK status is left in *info */
+static int
+numba_raw_rgeev(char kind, char jobvl, char jobvr,
+                Py_ssize_t n, void *a, Py_ssize_t lda, void *wr, void *wi,
+                void *vl, Py_ssize_t ldvl, void *vr, Py_ssize_t ldvr,
+                void *work, Py_ssize_t lwork, F_INT *info)
+{
+    void *raw_func = NULL;
+    F_INT _n, _lda, _ldvl, _ldvr, _lwork;
+
+    ENSURE_VALID_REAL_KIND(kind)
+
+    switch (kind) /* only the real kinds are handled here */
+    {
+        case 's':
+            raw_func = get_clapack_sgeev();
+            break;
+        case 'd':
+            raw_func = get_clapack_dgeev();
+            break;
+    }
+    ENSURE_VALID_FUNC(raw_func)
+
+    _n = (F_INT) n; /* narrow the Py_ssize_t arguments to F_INT for the call */
+    _lda = (F_INT) lda;
+    _ldvl = (F_INT) ldvl;
+    _ldvr = (F_INT) ldvr;
+    _lwork = (F_INT) lwork;
+
+    (*(rgeev_t) raw_func)(&jobvl, &jobvr, &_n, a, &_lda, wr, wi, vl, &_ldvl, vr,
+                          &_ldvr, work, &_lwork, info);
+    return 0;
+}
+
+/* Real space eigen systems info from dgeev/sgeev
+ * as numba_raw_rgeev but the allocation and error handling is done for the user.
+ * Args are as per LAPACK.
+ */
+NUMBA_EXPORT_FUNC(int)
+numba_ez_rgeev(char kind, char jobvl, char jobvr, Py_ssize_t n, void *a,
+               Py_ssize_t lda, void *wr, void *wi, void *vl, Py_ssize_t ldvl,
+               void *vr, Py_ssize_t ldvr)
+{
+    F_INT info = 0;
+    F_INT lwork = -1; /* -1 on the first call asks only for the workspace size */
+    F_INT _n, _lda, _ldvl, _ldvr;
+    size_t base_size = -1;
+    void * work = NULL;
+    all_dtypes stack_slot; /* one-element scratch that receives the size answer */
+
+    ENSURE_VALID_REAL_KIND(kind)
+
+    _n = (F_INT) n;
+    _lda = (F_INT) lda;
+    _ldvl = (F_INT) ldvl;
+    _ldvr = (F_INT) ldvr;
+
+    base_size = kind_size(kind);
+
+    work = &stack_slot;
+    numba_raw_rgeev(kind, jobvl, jobvr, _n, a, _lda, wr, wi, vl, _ldvl,
+                    vr, _ldvr, work, lwork, &info); /* workspace query; return value not checked */
+    CATCH_LAPACK_INVALID_ARG("numba_raw_rgeev", info);
+
+    lwork = cast_from_X(kind, work); /* the size comes back as a value of this kind */
+    if (checked_PyMem_RawMalloc(&work, base_size * lwork))
+    {
+        return STATUS_ERROR;
+    }
+    numba_raw_rgeev(kind, jobvl, jobvr, _n, a, _lda, wr, wi, vl, _ldvl,
+                    vr, _ldvr, work, lwork, &info); /* the actual computation */
+    PyMem_RawFree(work); /* released before the macro below gets a chance to return */
+
+    CATCH_LAPACK_INVALID_ARG("numba_raw_rgeev", info);
+
+    return (int)info;
+}
+
+/* Complex space eigen systems info from cgeev/zgeev
+ * Args are as per LAPACK.
+ */
+static int
+numba_raw_cgeev(char kind, char jobvl, char jobvr,
+                Py_ssize_t n, void *a, Py_ssize_t lda, void *w, void *vl,
+                Py_ssize_t ldvl, void *vr, Py_ssize_t ldvr, void *work,
+                Py_ssize_t lwork, void *rwork, F_INT *info)
+{
+    void *raw_func = NULL;
+    F_INT _n, _lda, _ldvl, _ldvr, _lwork;
+
+    ENSURE_VALID_COMPLEX_KIND(kind)
+
+    _n = (F_INT) n; /* narrow the Py_ssize_t arguments to F_INT for the call */
+    _lda = (F_INT) lda;
+    _ldvl = (F_INT) ldvl;
+    _ldvr = (F_INT) ldvr;
+    _lwork = (F_INT) lwork;
+
+    switch (kind) /* only the complex kinds are handled here */
+    {
+        case 'c':
+            raw_func = get_clapack_cgeev();
+            break;
+        case 'z':
+            raw_func = get_clapack_zgeev();
+            break;
+    }
+    ENSURE_VALID_FUNC(raw_func)
+
+    (*(cgeev_t) raw_func)(&jobvl, &jobvr, &_n, a, &_lda, w, vl, &_ldvl, vr,
+                          &_ldvr, work, &_lwork, rwork, info);
+    return 0; /* the LAPACK status is left in *info */
+}
+
+
+/* Complex space eigen systems info from cgeev/zgeev
+ * as numba_raw_cgeev but the allocation and error handling is done for the user.
+ * Args are as per LAPACK.
+ */
+NUMBA_EXPORT_FUNC(int)
+numba_ez_cgeev(char kind, char jobvl, char jobvr, Py_ssize_t n, void *a,
+               Py_ssize_t lda, void *w, void *vl, Py_ssize_t ldvl, void *vr,
+               Py_ssize_t ldvr)
+{
+    F_INT info = 0;
+    F_INT lwork = -1; /* -1 on the first call asks only for the workspace size */
+    F_INT _n, _lda, _ldvl, _ldvr;
+    size_t base_size = -1;
+    all_dtypes stack_slot, wk;
+    void * work = NULL;
+    void * rwork = (void *)&wk; /* placeholder rwork for the query call */
+
+    ENSURE_VALID_COMPLEX_KIND(kind)
+
+    _n = (F_INT) n;
+    _lda = (F_INT) lda;
+    _ldvl = (F_INT) ldvl;
+    _ldvr = (F_INT) ldvr;
+
+    base_size = kind_size(kind);
+
+    work = &stack_slot;
+    numba_raw_cgeev(kind, jobvl, jobvr, n, a, lda, w, vl, ldvl,
+                    vr, ldvr, work, lwork, rwork, &info); /* workspace query; return value not checked */
+    CATCH_LAPACK_INVALID_ARG("numba_raw_cgeev", info);
+
+    lwork = cast_from_X(kind, work); /* real part of the returned complex value */
+    if (checked_PyMem_RawMalloc((void**)&rwork, 2*n*base_size)) /* NOTE(review): sized with the complex base_size — looks like more than 2*n reals, confirm */
+    {
+        return STATUS_ERROR;
+    }
+    if (checked_PyMem_RawMalloc(&work, base_size * lwork))
+    {
+        PyMem_RawFree(rwork); /* undo the earlier allocation on failure */
+        return STATUS_ERROR;
+    }
+    numba_raw_cgeev(kind, jobvl, jobvr, _n, a, _lda, w, vl, _ldvl,
+                    vr, _ldvr, work, lwork, rwork, &info); /* the actual computation */
+    PyMem_RawFree(work);
+    PyMem_RawFree(rwork);
+    CATCH_LAPACK_INVALID_ARG("numba_raw_cgeev", info);
+
+    return (int)info;
+}
+
+/* real space symmetric eigen systems info from ssyevd/dsyevd; returns 0 after the call, LAPACK status is left in *info */
+static int
+numba_raw_rsyevd(char kind, char jobz, char uplo, Py_ssize_t n, void *a,
+                 Py_ssize_t lda, void *w, void *work, Py_ssize_t lwork,
+                 F_INT *iwork, Py_ssize_t liwork, F_INT *info)
+{
+    void *raw_func = NULL;
+    F_INT _n, _lda, _lwork, _liwork;
+
+    ENSURE_VALID_REAL_KIND(kind)
+
+    switch (kind) /* only the real kinds are handled here */
+    {
+        case 's':
+            raw_func = get_clapack_ssyevd();
+            break;
+        case 'd':
+            raw_func = get_clapack_dsyevd();
+            break;
+    }
+    ENSURE_VALID_FUNC(raw_func)
+
+    _n = (F_INT) n;
+    _lda = (F_INT) lda;
+    _lwork = (F_INT) lwork;
+    _liwork = (F_INT) liwork;
+
+    (*(xsyevd_t) raw_func)(&jobz, &uplo, &_n, a, &_lda, w, work, &_lwork, iwork, &_liwork, info);
+    return 0;
+}
+
+/* Real space eigen systems info from dsyevd/ssyevd
+ * as numba_raw_rsyevd but the allocation and error handling is done for the user.
+ * Args are as per LAPACK.
+ */
+static int
+numba_ez_rsyevd(char kind, char jobz, char uplo, Py_ssize_t n, void *a, Py_ssize_t lda, void *w)
+{
+    F_INT info = 0;
+    F_INT lwork = -1, liwork=-1; /* -1 on the first call asks only for the workspace sizes */
+    F_INT _n, _lda;
+    size_t base_size = -1;
+    void *work = NULL;
+    F_INT *iwork = NULL;
+    all_dtypes stack_slot; /* receives the work size from the query */
+    int stack_int = -1; /* receives the iwork size from the query */
+
+    ENSURE_VALID_REAL_KIND(kind)
+
+    _n = (F_INT) n;
+    _lda = (F_INT) lda;
+
+    base_size = kind_size(kind);
+
+    work = &stack_slot;
+    iwork = &stack_int;
+    numba_raw_rsyevd(kind, jobz, uplo, _n, a, _lda, w, work, lwork, iwork, liwork, &info); /* workspace query; return value not checked */
+    CATCH_LAPACK_INVALID_ARG("numba_raw_rsyevd", info);
+
+    lwork = cast_from_X(kind, work);
+    if (checked_PyMem_RawMalloc(&work, base_size * lwork))
+    {
+        return STATUS_ERROR;
+    }
+    liwork = *iwork; /* read the size before iwork is re-pointed at heap memory */
+    if (checked_PyMem_RawMalloc((void**)&iwork, base_size * liwork)) /* NOTE(review): F_INT array sized with base_size, not sizeof(F_INT) — confirm */
+    {
+        PyMem_RawFree(work); /* undo the earlier allocation on failure */
+        return STATUS_ERROR;
+    }
+    numba_raw_rsyevd(kind, jobz, uplo, _n, a, _lda, w, work, lwork, iwork, liwork, &info); /* the actual computation */
+    PyMem_RawFree(work);
+    PyMem_RawFree(iwork);
+
+    CATCH_LAPACK_INVALID_ARG("numba_raw_rsyevd", info);
+
+    return (int)info;
+}
+
+
+/* complex space Hermitian eigen systems info from cheevd/zheevd; returns 0 after the call, LAPACK status is left in *info */
+static int
+numba_raw_cheevd(char kind, char jobz, char uplo, Py_ssize_t n, void *a,
+                 Py_ssize_t lda, void *w, void *work, Py_ssize_t lwork,
+                 void *rwork, Py_ssize_t lrwork, F_INT *iwork,
+                 Py_ssize_t liwork, F_INT *info)
+{
+    void *raw_func = NULL;
+    F_INT _n, _lda, _lwork, _lrwork, _liwork;
+
+    ENSURE_VALID_COMPLEX_KIND(kind)
+
+    switch (kind) /* only the complex kinds are handled here */
+    {
+        case 'c':
+            raw_func = get_clapack_cheevd();
+            break;
+        case 'z':
+            raw_func = get_clapack_zheevd();
+            break;
+    }
+    ENSURE_VALID_FUNC(raw_func)
+
+    _n = (F_INT) n;
+    _lda = (F_INT) lda;
+    _lwork = (F_INT) lwork;
+    _lrwork = (F_INT) lrwork;
+    _liwork = (F_INT) liwork;
+
+    (*(xheevd_t) raw_func)(&jobz, &uplo, &_n, a, &_lda, w, work, &_lwork, rwork, &_lrwork, iwork, &_liwork, info);
+    return 0;
+}
+
+/* complex space eigen systems info from cheevd/zheevd
+ * as numba_raw_cheevd but the allocation and error handling is done for the user.
+ * Args are as per LAPACK.
+ */
+static int
+numba_ez_cheevd(char kind, char jobz, char uplo, Py_ssize_t n, void *a, Py_ssize_t lda, void *w)
+{
+    F_INT info = 0;
+    F_INT lwork = -1, lrwork = -1, liwork=-1; /* -1 on the first call asks only for the workspace sizes */
+    F_INT _n, _lda;
+    size_t base_size = -1, underlying_float_size = -1;
+    void *work = NULL, *rwork = NULL;
+    F_INT *iwork = NULL;
+    all_dtypes stack_slot1, stack_slot2; /* receive the work and rwork sizes from the query */
+    char uf_kind;
+    int stack_int = -1; /* receives the iwork size from the query */
+
+    ENSURE_VALID_COMPLEX_KIND(kind)
+
+    _n = (F_INT) n;
+    _lda = (F_INT) lda;
+
+    base_size = kind_size(kind);
+    uf_kind = underlying_float_kind(kind); /* 'c' -> 's', 'z' -> 'd' */
+    underlying_float_size = kind_size(uf_kind);
+
+    work = &stack_slot1;
+    rwork = &stack_slot2;
+    iwork = &stack_int;
+    numba_raw_cheevd(kind, jobz, uplo, _n, a, _lda, w, work, lwork, rwork, lrwork, iwork, liwork, &info); /* workspace query; return value not checked */
+    CATCH_LAPACK_INVALID_ARG("numba_raw_cheevd", info);
+
+    lwork = cast_from_X(uf_kind, work); /* read through the underlying real kind, not the complex kind */
+    if (checked_PyMem_RawMalloc(&work, base_size * lwork))
+    {
+        return STATUS_ERROR;
+    }
+
+    lrwork = cast_from_X(uf_kind, rwork);
+    if (checked_PyMem_RawMalloc(&rwork, underlying_float_size * lrwork))
+    {
+        PyMem_RawFree(work); /* undo the earlier allocation on failure */
+        return STATUS_ERROR;
+    }
+
+    liwork = *iwork; /* read the size before iwork is re-pointed at heap memory */
+    if (checked_PyMem_RawMalloc((void**)&iwork, base_size * liwork)) /* NOTE(review): F_INT array sized with base_size, not sizeof(F_INT) — confirm */
+    {
+        PyMem_RawFree(work);
+        PyMem_RawFree(rwork);
+        return STATUS_ERROR;
+    }
+    numba_raw_cheevd(kind, jobz, uplo, _n, a, _lda, w, work, lwork, rwork, lrwork, iwork, liwork, &info); /* the actual computation */
+    PyMem_RawFree(work);
+    PyMem_RawFree(rwork);
+    PyMem_RawFree(iwork);
+
+    CATCH_LAPACK_INVALID_ARG("numba_raw_cheevd", info);
+
+    return (int)info;
+}
+
+/* Hermitian eigenvalue systems info from *syevd and *heevd.
+ * This routine hides the type and general complexity involved with making the
+ * calls. The work space computation and error handling etc is hidden.
+ * Args are as per LAPACK.
+ */
+NUMBA_EXPORT_FUNC(int)
+numba_ez_xxxevd(char kind, char jobz, char uplo, Py_ssize_t n, void *a, Py_ssize_t lda, void *w)
+{
+    ENSURE_VALID_KIND(kind)
+
+    switch (kind) /* real kinds go to the syevd wrapper, complex kinds to the heevd wrapper */
+    {
+        case 's':
+        case 'd':
+            return numba_ez_rsyevd(kind, jobz, uplo, n, a, lda, w);
+        case 'c':
+        case 'z':
+            return numba_ez_cheevd(kind, jobz, uplo, n, a, lda, w);
+    }
+    return STATUS_ERROR; /* unreachable */
+}
+
+/* Real space svd systems info from dgesdd/sgesdd
+ * Args are as per LAPACK. Returns 0 once the routine has been called; the LAPACK status is left in *info.
+ */
+static int
+numba_raw_rgesdd(char kind, char jobz, Py_ssize_t m, Py_ssize_t n, void *a,
+                 Py_ssize_t lda, void *s, void *u, Py_ssize_t ldu, void *vt,
+                 Py_ssize_t ldvt, void *work, Py_ssize_t lwork,
+                 F_INT *iwork, F_INT *info)
+{
+    void *raw_func = NULL;
+    F_INT _m, _n, _lda, _ldu, _ldvt, _lwork;
+
+    ENSURE_VALID_REAL_KIND(kind)
+
+    _m = (F_INT) m; /* narrow the Py_ssize_t arguments to F_INT for the call */
+    _n = (F_INT) n;
+    _lda = (F_INT) lda;
+    _ldu = (F_INT) ldu;
+    _ldvt = (F_INT) ldvt;
+    _lwork = (F_INT) lwork;
+
+    switch (kind) /* only the real kinds are handled here */
+    {
+        case 's':
+            raw_func = get_clapack_sgesdd();
+            break;
+        case 'd':
+            raw_func = get_clapack_dgesdd();
+            break;
+    }
+    ENSURE_VALID_FUNC(raw_func)
+
+    (*(rgesdd_t) raw_func)(&jobz, &_m, &_n, a, &_lda, s, u, &_ldu, vt, &_ldvt,
+                           work, &_lwork, iwork, info);
+    return 0;
+}
+
+/* Real space svd info from dgesdd/sgesdd.
+ * As numba_raw_rgesdd but the allocation and error handling is done for the
+ * user.
+ * Args are as per LAPACK.
+ */
+static int
+numba_ez_rgesdd(char kind, char jobz, Py_ssize_t m, Py_ssize_t n, void *a,
+                Py_ssize_t lda, void *s, void *u, Py_ssize_t ldu, void *vt,
+                Py_ssize_t ldvt)
+{
+    F_INT info = 0;
+    Py_ssize_t minmn = -1;
+    Py_ssize_t lwork = -1; /* -1 on the first call asks only for the workspace size */
+    all_dtypes stack_slot, wk;
+    size_t base_size = -1;
+    F_INT *iwork = (F_INT *)&wk; /* placeholder iwork for the query call */
+    void *work = NULL;
+
+    ENSURE_VALID_REAL_KIND(kind)
+
+    base_size = kind_size(kind);
+
+    work = &stack_slot;
+
+    /* Compute optimal work size (lwork) */
+    numba_raw_rgesdd(kind, jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work,
+                     lwork, iwork, &info); /* return value not checked */
+    CATCH_LAPACK_INVALID_ARG("numba_raw_rgesdd", info);
+
+    /* Allocate work array */
+    lwork = cast_from_X(kind, work);
+    if (checked_PyMem_RawMalloc(&work, base_size * lwork))
+        return -1; /* NOTE(review): literal -1 here, STATUS_ERROR on the other failure paths — confirm they agree */
+    minmn = m > n ? n : m; /* min(m, n) */
+    if (checked_PyMem_RawMalloc((void**) &iwork, 8 * minmn * sizeof(F_INT))) /* 8*min(m,n) integers, the IWORK size LAPACK documents for xGESDD */
+    {
+        PyMem_RawFree(work); /* undo the earlier allocation on failure */
+        return STATUS_ERROR;
+    }
+    numba_raw_rgesdd(kind, jobz, m, n, a, lda, s, u ,ldu, vt, ldvt, work, lwork,
+                     iwork, &info); /* the actual computation */
+    PyMem_RawFree(work);
+    PyMem_RawFree(iwork);
+    CATCH_LAPACK_INVALID_ARG("numba_raw_rgesdd", info);
+
+    return (int)info;
+}
+
+/* Complex space svd systems info from cgesdd/zgesdd
+ * Args are as per LAPACK.
+ */ +static int +numba_raw_cgesdd(char kind, char jobz, Py_ssize_t m, Py_ssize_t n, void *a, + Py_ssize_t lda, void *s, void *u, Py_ssize_t ldu, void *vt, + Py_ssize_t ldvt, void *work, Py_ssize_t lwork, void *rwork, + F_INT *iwork, F_INT *info) +{ + void *raw_func = NULL; + F_INT _m, _n, _lda, _ldu, _ldvt, _lwork; + + ENSURE_VALID_COMPLEX_KIND(kind) + + _m = (F_INT) m; + _n = (F_INT) n; + _lda = (F_INT) lda; + _ldu = (F_INT) ldu; + _ldvt = (F_INT) ldvt; + _lwork = (F_INT) lwork; + + switch (kind) + { + case 'c': + raw_func = get_clapack_cgesdd(); + break; + case 'z': + raw_func = get_clapack_zgesdd(); + break; + } + ENSURE_VALID_FUNC(raw_func) + + (*(cgesdd_t) raw_func)(&jobz, &_m, &_n, a, &_lda, s, u, &_ldu, vt, &_ldvt, + work, &_lwork, rwork, iwork, info); + return 0; +} + +/* complex space svd info from cgesdd/zgesdd. + * As numba_raw_cgesdd but the allocation and error handling is done for the + * user. + * Args are as per LAPACK. + */ +static int +numba_ez_cgesdd(char kind, char jobz, Py_ssize_t m, Py_ssize_t n, void *a, + Py_ssize_t lda, void *s, void *u, Py_ssize_t ldu, void *vt, + Py_ssize_t ldvt) +{ + F_INT info = 0; + Py_ssize_t lwork = -1; + Py_ssize_t lrwork = -1; + Py_ssize_t minmn = -1; + Py_ssize_t tmp1, tmp2; + Py_ssize_t maxmn = -1; + size_t real_base_size = -1; + size_t complex_base_size = -1; + all_dtypes stack_slot, wk1, wk2; + void *work = NULL; + void *rwork = (void *)&wk1; + F_INT *iwork = (F_INT *)&wk2; + + ENSURE_VALID_COMPLEX_KIND(kind) + + switch (kind) + { + case 'c': + real_base_size = sizeof(float); + complex_base_size = sizeof(npy_complex64); + break; + case 'z': + real_base_size = sizeof(double); + complex_base_size = sizeof(npy_complex128); + break; + default: + { + PyGILState_STATE st = PyGILState_Ensure(); + PyErr_SetString(PyExc_ValueError,\ + "Invalid kind in numba_ez_rgesdd"); + PyGILState_Release(st); + } + return STATUS_ERROR; + } + + work = &stack_slot; + + /* Compute optimal work size (lwork) */ + numba_raw_cgesdd(kind, 
jobz, m, n, a, lda, s, u ,ldu, vt, ldvt, work, lwork, + rwork, iwork, &info); + CATCH_LAPACK_INVALID_ARG("numba_raw_cgesdd", info); + + /* Allocate work array */ + lwork = cast_from_X(kind, work); + if (checked_PyMem_RawMalloc(&work, complex_base_size * lwork)) + return STATUS_ERROR; + + minmn = m > n ? n : m; + if (jobz == 'n') + { + lrwork = 7 * minmn; + } + else + { + maxmn = m > n ? m : n; + tmp1 = 5 * minmn + 7; + tmp2 = 2 * maxmn + 2 * minmn + 1; + lrwork = minmn * (tmp1 > tmp2 ? tmp1: tmp2); + } + + if (checked_PyMem_RawMalloc(&rwork, + real_base_size * (lrwork > 1 ? lrwork : 1))) + { + PyMem_RawFree(work); + return STATUS_ERROR; + } + if (checked_PyMem_RawMalloc((void **) &iwork, + 8 * minmn * sizeof(F_INT))) + { + PyMem_RawFree(work); + PyMem_RawFree(rwork); + return STATUS_ERROR; + } + numba_raw_cgesdd(kind, jobz, m, n, a, lda, s, u ,ldu, vt, ldvt, work, lwork, + rwork, iwork, &info); + PyMem_RawFree(work); + PyMem_RawFree(rwork); + PyMem_RawFree(iwork); + CATCH_LAPACK_INVALID_ARG("numba_raw_cgesdd", info); + + return (int)info; +} + + +/* SVD systems info from *gesdd. + * This routine hides the type and general complexity involved with making the + * calls to *gesdd. The work space computation and error handling etc is hidden. + * Args are as per LAPACK. + */ +NUMBA_EXPORT_FUNC(int) +numba_ez_gesdd(char kind, char jobz, Py_ssize_t m, Py_ssize_t n, void *a, + Py_ssize_t lda, void *s, void *u, Py_ssize_t ldu, void *vt, + Py_ssize_t ldvt) +{ + ENSURE_VALID_KIND(kind) + + switch (kind) + { + case 's': + case 'd': + return numba_ez_rgesdd(kind, jobz, m, n, a, lda, s, u, ldu, vt, + ldvt); + case 'c': + case 'z': + return numba_ez_cgesdd(kind, jobz, m, n, a, lda, s, u, ldu, vt, + ldvt); + } + return STATUS_ERROR; /* unreachable */ +} + + +/* + * Compute the QR factorization of a matrix. + * Return -1 on internal error, 0 on success, > 0 on failure. 
+ */ +static int +numba_raw_xgeqrf(char kind, Py_ssize_t m, Py_ssize_t n, void *a, Py_ssize_t + lda, void *tau, void *work, Py_ssize_t lwork, F_INT *info) +{ + void *raw_func = NULL; + F_INT _m, _n, _lda, _lwork; + + ENSURE_VALID_KIND(kind) + + switch (kind) + { + case 's': + raw_func = get_clapack_sgeqrf(); + break; + case 'd': + raw_func = get_clapack_dgeqrf(); + break; + case 'c': + raw_func = get_clapack_cgeqrf(); + break; + case 'z': + raw_func = get_clapack_zgeqrf(); + break; + } + ENSURE_VALID_FUNC(raw_func) + + _m = (F_INT) m; + _n = (F_INT) n; + _lda = (F_INT) lda; + _lwork = (F_INT) lwork; + + (*(xgeqrf_t) raw_func)(&_m, &_n, a, &_lda, tau, work, &_lwork, info); + return 0; +} + +/* + * Compute the QR factorization of a matrix. + * This routine hides the type and general complexity involved with making the + * xgeqrf calls. The work space computation and error handling etc is hidden. + * Args are as per LAPACK. + */ +NUMBA_EXPORT_FUNC(int) +numba_ez_geqrf(char kind, Py_ssize_t m, Py_ssize_t n, void *a, Py_ssize_t + lda, void *tau) +{ + F_INT info = 0; + Py_ssize_t lwork = -1; + size_t base_size = -1; + all_dtypes stack_slot; + void *work = NULL; + + base_size = kind_size(kind); + + work = &stack_slot; + + /* Compute optimal work size (lwork) */ + numba_raw_xgeqrf(kind, m, n, a, lda, tau, work, lwork, &info); + CATCH_LAPACK_INVALID_ARG("numba_raw_xgeqrf", info); + + /* Allocate work array */ + lwork = cast_from_X(kind, work); + if (checked_PyMem_RawMalloc(&work, base_size * lwork)) + return STATUS_ERROR; + + numba_raw_xgeqrf(kind, m, n, a, lda, tau, work, lwork, &info); + PyMem_RawFree(work); + CATCH_LAPACK_INVALID_ARG("numba_raw_xgeqrf", info); + + return 0; /* info cannot be >0 */ + +} + + +/* + * Compute the orthogonal Q matrix (in QR) from elementary relectors. 
+ */ +static int +numba_raw_xxxgqr(char kind, Py_ssize_t m, Py_ssize_t n, Py_ssize_t k, void *a, + Py_ssize_t lda, void *tau, void * work, Py_ssize_t lwork, F_INT *info) +{ + void *raw_func = NULL; + F_INT _m, _n, _k, _lda, _lwork; + + ENSURE_VALID_KIND(kind) + + switch (kind) + { + case 's': + raw_func = get_clapack_sorgqr(); + break; + case 'd': + raw_func = get_clapack_dorgqr(); + break; + case 'c': + raw_func = get_clapack_cungqr(); + break; + case 'z': + raw_func = get_clapack_zungqr(); + break; + } + ENSURE_VALID_FUNC(raw_func) + + _m = (F_INT) m; + _n = (F_INT) n; + _k = (F_INT) k; + _lda = (F_INT) lda; + _lwork = (F_INT) lwork; + + (*(xxxgqr_t) raw_func)(&_m, &_n, &_k, a, &_lda, tau, work, &_lwork, info); + return 0; +} + + +/* + * Compute the orthogonal Q matrix (in QR) from elementary reflectors. + * This routine hides the type and general complexity involved with making the + * x{or,un}qrf calls. The work space computation and error handling etc is + * hidden. Args are as per LAPACK. + */ +NUMBA_EXPORT_FUNC(int) +numba_ez_xxgqr(char kind, Py_ssize_t m, Py_ssize_t n, Py_ssize_t k, void *a, + Py_ssize_t lda, void *tau) +{ + F_INT info = 0; + Py_ssize_t lwork = -1; + size_t base_size = -1; + all_dtypes stack_slot; + void *work = NULL; + + work = &stack_slot; + + /* Compute optimal work size (lwork) */ + numba_raw_xxxgqr(kind, m, n, k, a, lda, tau, work, lwork, &info); + CATCH_LAPACK_INVALID_ARG("numba_raw_xxxgqr", info); + + base_size = kind_size(kind); + + /* Allocate work array */ + lwork = cast_from_X(kind, work); + if (checked_PyMem_RawMalloc(&work, base_size * lwork)) + return STATUS_ERROR; + + numba_raw_xxxgqr(kind, m, n, k, a, lda, tau, work, lwork, &info); + PyMem_RawFree(work); + CATCH_LAPACK_INVALID_ARG("numba_raw_xxxgqr", info); + + return 0; /* info cannot be >0 */ + +} + + +/* + * Compute the minimum-norm solution to a real linear least squares problem. 
+ */
+static int
+numba_raw_rgelsd(char kind, Py_ssize_t m, Py_ssize_t n, Py_ssize_t nrhs,
+                 void *a, Py_ssize_t lda, void *b, Py_ssize_t ldb, void *S,
+                 void * rcond, Py_ssize_t * rank, void * work,
+                 Py_ssize_t lwork, F_INT *iwork, F_INT *info)
+{
+    void *raw_func = NULL;
+    F_INT _m, _n, _nrhs, _lda, _ldb, _rank = 0, _lwork; /* _rank starts at 0: a workspace query (lwork == -1) is not documented to set RANK */
+
+    ENSURE_VALID_REAL_KIND(kind)
+
+    switch (kind) /* only the real kinds are handled here */
+    {
+        case 's':
+            raw_func = get_clapack_sgelsd();
+            break;
+        case 'd':
+            raw_func = get_clapack_dgelsd();
+            break;
+    }
+    ENSURE_VALID_FUNC(raw_func)
+
+    _m = (F_INT) m; /* narrow the Py_ssize_t arguments to F_INT for the call */
+    _n = (F_INT) n;
+    _nrhs = (F_INT) nrhs;
+    _lda = (F_INT) lda;
+    _ldb = (F_INT) ldb;
+    _lwork = (F_INT) lwork;
+
+    (*(rgelsd_t) raw_func)(&_m, &_n, &_nrhs, a, &_lda, b, &_ldb, S, rcond,
+                           &_rank, work, &_lwork, iwork, info);
+    *rank = (Py_ssize_t) _rank; /* widen the Fortran integer back to Py_ssize_t for the caller */
+    return 0; /* the LAPACK status is left in *info */
+}
+
+/*
+ * Compute the minimum-norm solution to a real linear least squares problem.
+ * This routine hides the type and general complexity involved with making the
+ * {s,d}gelsd calls. The work space computation and error handling etc is
+ * hidden. Args are as per LAPACK.
+ */
+static int
+numba_ez_rgelsd(char kind, Py_ssize_t m, Py_ssize_t n, Py_ssize_t nrhs,
+                void *a, Py_ssize_t lda, void *b, Py_ssize_t ldb, void *S,
+                double rcond, Py_ssize_t * rank)
+{
+    F_INT info = 0;
+    Py_ssize_t lwork = -1; /* -1 on the first call asks only for the workspace size */
+    size_t base_size = -1;
+    all_dtypes stack_slot;
+    void *work = NULL, *rcond_cast = NULL;
+    F_INT *iwork = NULL;
+    F_INT iwork_tmp; /* passed as iwork on the query call; used below as the iwork element count */
+    float tmpf; /* holds rcond narrowed to float for kind 's' */
+
+    ENSURE_VALID_REAL_KIND(kind)
+
+    base_size = kind_size(kind);
+
+    work = &stack_slot;
+    rcond_cast = work; /* stop checks on null ptr complaining */
+
+    /* Compute optimal work size (lwork) */
+    numba_raw_rgelsd(kind, m, n, nrhs, a, lda, b, ldb, S, rcond_cast, rank,
+                     work, lwork, &iwork_tmp, &info); /* return value not checked */
+    CATCH_LAPACK_INVALID_ARG("numba_raw_rgelsd", info);
+
+    /* Allocate work array */
+    lwork = cast_from_X(kind, work);
+    if (checked_PyMem_RawMalloc(&work, base_size * lwork))
+        return STATUS_ERROR;
+
+    /* Allocate iwork array */
+    if (checked_PyMem_RawMalloc((void **)&iwork, sizeof(F_INT) * iwork_tmp))
+    {
+        PyMem_RawFree(work); /* undo the earlier allocation on failure */
+        return STATUS_ERROR;
+    }
+
+    /* cast rcond to the right type */
+    switch (kind)
+    {
+        case 's':
+            tmpf = (float)rcond;
+            rcond_cast = (void * )&tmpf;
+            break;
+        case 'd':
+            rcond_cast = (void * )&rcond; /* already a double: pass the parameter's own address */
+            break;
+    }
+
+    numba_raw_rgelsd(kind, m, n, nrhs, a, lda, b, ldb, S, rcond_cast, rank,
+                     work, lwork, iwork, &info); /* the actual computation */
+    PyMem_RawFree(work);
+    PyMem_RawFree(iwork);
+    CATCH_LAPACK_INVALID_ARG("numba_raw_rgelsd", info);
+
+    return (int)info;
+}
+
+
+/*
+ * Compute the minimum-norm solution to a complex linear least squares problem.
+ */
+static int
+numba_raw_cgelsd(char kind, Py_ssize_t m, Py_ssize_t n, Py_ssize_t nrhs,
+                 void *a, Py_ssize_t lda, void *b, Py_ssize_t ldb, void *S,
+                 void *rcond, Py_ssize_t * rank, void * work,
+                 Py_ssize_t lwork, void * rwork, F_INT *iwork, F_INT *info)
+{
+    void *raw_func = NULL;
+    F_INT _m, _n, _nrhs, _lda, _ldb, _rank = 0, _lwork; /* _rank starts at 0: a workspace query (lwork == -1) is not documented to set RANK */
+
+    ENSURE_VALID_COMPLEX_KIND(kind)
+
+    switch (kind) /* only the complex kinds are handled here */
+    {
+        case 'c':
+            raw_func = get_clapack_cgelsd();
+            break;
+        case 'z':
+            raw_func = get_clapack_zgelsd();
+            break;
+    }
+    ENSURE_VALID_FUNC(raw_func)
+
+    _m = (F_INT) m; /* narrow the Py_ssize_t arguments to F_INT for the call */
+    _n = (F_INT) n;
+    _nrhs = (F_INT) nrhs;
+    _lda = (F_INT) lda;
+    _ldb = (F_INT) ldb;
+    _lwork = (F_INT) lwork;
+
+    (*(cgelsd_t) raw_func)(&_m, &_n, &_nrhs, a, &_lda, b, &_ldb, S, rcond,
+                           &_rank, work, &_lwork, rwork, iwork, info);
+    *rank = (Py_ssize_t) _rank; /* widen the Fortran integer back to Py_ssize_t for the caller */
+    return 0; /* the LAPACK status is left in *info */
+}
+
+
+/*
+ * Compute the minimum-norm solution to a complex linear least squares problem.
+ * This routine hides the type and general complexity involved with making the
+ * {c,z}gelsd calls. The work space computation and error handling etc is
+ * hidden. Args are as per LAPACK.
+ */
+static int
+numba_ez_cgelsd(char kind, Py_ssize_t m, Py_ssize_t n, Py_ssize_t nrhs,
+                void *a, Py_ssize_t lda, void *b, Py_ssize_t ldb, void *S,
+                double rcond, Py_ssize_t * rank)
+{
+    F_INT info = 0;
+    Py_ssize_t lwork = -1; /* -1 on the first call asks only for the workspace size */
+    size_t base_size = -1;
+    all_dtypes stack_slot1, stack_slot2; /* receive the work and rwork sizes from the query */
+    size_t real_base_size = 0;
+    void *work = NULL, *rwork = NULL, *rcond_cast = NULL;
+    Py_ssize_t lrwork;
+    F_INT *iwork = NULL;
+    F_INT iwork_tmp; /* passed as iwork on the query call; used below as the iwork element count */
+    char real_kind = '-'; /* placeholder until the switch below picks 's' or 'd' */
+    float tmpf; /* holds rcond narrowed to float for kind 'c' */
+
+    ENSURE_VALID_COMPLEX_KIND(kind)
+
+    base_size = kind_size(kind);
+
+    work = &stack_slot1;
+    rwork = &stack_slot2;
+    rcond_cast = work; /* stop checks on null ptr complaining */
+
+    /* Compute optimal work size */
+    numba_raw_cgelsd(kind, m, n, nrhs, a, lda, b, ldb, S, rcond_cast, rank,
+                     work, lwork, rwork, &iwork_tmp, &info); /* return value not checked */
+    CATCH_LAPACK_INVALID_ARG("numba_raw_cgelsd", info);
+
+    /* Allocate work array */
+    lwork = cast_from_X(kind, work); /* real part of the returned complex value */
+    if (checked_PyMem_RawMalloc(&work, base_size * lwork))
+        return STATUS_ERROR;
+
+    /* Allocate iwork array */
+    if (checked_PyMem_RawMalloc((void **)&iwork, sizeof(F_INT) * iwork_tmp))
+    {
+        PyMem_RawFree(work); /* undo the earlier allocation on failure */
+        return STATUS_ERROR;
+    }
+    /* pick the matching real kind and give rcond the same precision */
+    switch (kind)
+    {
+        case 'c':
+            real_kind = 's';
+            tmpf = (float)rcond;
+            rcond_cast = (void * )&tmpf;
+            break;
+        case 'z':
+            real_kind = 'd';
+            rcond_cast = (void * )&rcond; /* already a double: pass the parameter's own address */
+            break;
+    }
+
+    real_base_size = kind_size(real_kind);
+
+    lrwork = cast_from_X(real_kind, rwork); /* rwork still points at stack_slot2 here, so this reads the size from the query */
+    if (checked_PyMem_RawMalloc((void **)&rwork, real_base_size * lrwork))
+    {
+        PyMem_RawFree(work);
+        PyMem_RawFree(iwork);
+        return STATUS_ERROR;
+    }
+
+    numba_raw_cgelsd(kind, m, n, nrhs, a, lda, b, ldb, S, rcond_cast, rank,
+                     work, lwork, rwork, iwork, &info); /* the actual computation */
+    PyMem_RawFree(work);
+    PyMem_RawFree(rwork);
+    PyMem_RawFree(iwork);
+    CATCH_LAPACK_INVALID_ARG("numba_raw_cgelsd", info);
+
+    return (int)info;
+}
+
+
+/*
+ * Compute the minimum-norm solution to a linear least squares problems.
+ * This routine hides the type and general complexity involved with making the
+ * calls to *gelsd. The work space computation and error handling etc is hidden.
+ * Args are as per LAPACK.
+ */
+NUMBA_EXPORT_FUNC(int)
+numba_ez_gelsd(char kind, Py_ssize_t m, Py_ssize_t n, Py_ssize_t nrhs,
+               void *a, Py_ssize_t lda, void *b, Py_ssize_t ldb, void *S,
+               double rcond, Py_ssize_t * rank)
+{
+    ENSURE_VALID_KIND(kind)
+
+    switch (kind) /* real kinds go to the real wrapper, complex kinds to the complex wrapper */
+    {
+        case 's':
+        case 'd':
+            return numba_ez_rgelsd(kind, m, n, nrhs, a, lda, b, ldb, S, rcond,
+                                   rank);
+        case 'c':
+        case 'z':
+            return numba_ez_cgelsd(kind, m, n, nrhs, a, lda, b, ldb, S, rcond,
+                                   rank);
+    }
+    return STATUS_ERROR; /* unreachable */
+}
+
+
+/*
+ * Compute the solution to a system of linear equations (ipiv and b are forwarded to the gesv routine)
+ */
+NUMBA_EXPORT_FUNC(int)
+numba_xgesv(char kind, Py_ssize_t n, Py_ssize_t nrhs, void *a, Py_ssize_t lda,
+            F_INT *ipiv, void *b, Py_ssize_t ldb)
+{
+    void *raw_func = NULL;
+    F_INT _n, _nrhs, _lda, _ldb, info;
+
+    ENSURE_VALID_KIND(kind)
+
+    switch (kind) /* select the s/d/c/z variant of gesv */
+    {
+        case 's':
+            raw_func = get_clapack_sgesv();
+            break;
+        case 'd':
+            raw_func = get_clapack_dgesv();
+            break;
+        case 'c':
+            raw_func = get_clapack_cgesv();
+            break;
+        case 'z':
+            raw_func = get_clapack_zgesv();
+            break;
+    }
+
+    ENSURE_VALID_FUNC(raw_func)
+
+    _n = (F_INT) n; /* narrow the Py_ssize_t arguments to F_INT for the call */
+    _nrhs = (F_INT) nrhs;
+    _lda = (F_INT) lda;
+    _ldb = (F_INT) ldb;
+
+    (*(xgesv_t) raw_func)(&_n, &_nrhs, a, &_lda, ipiv, b, &_ldb, &info);
+    CATCH_LAPACK_INVALID_ARG("xgesv", info);
+
+    return (int)info; /* a negative info has already produced an error return above */
+}
+
+/* undef defines and macros */
+#undef STATUS_SUCCESS
+#undef STATUS_ERROR
+#undef ENSURE_VALID_KIND
+#undef ENSURE_VALID_REAL_KIND
+#undef ENSURE_VALID_COMPLEX_KIND
+#undef ENSURE_VALID_FUNC
+#undef F_INT
+#undef EMIT_GET_CLAPACK_FUNC
+#undef CATCH_LAPACK_INVALID_ARG
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_npymath_exports.c b/cv/3d_detection/centerpoint/pytorch/numba/numba/_npymath_exports.c
new file mode 100644
index
0000000000000000000000000000000000000000..881b56c911805577d52d1a85e6c1b231ba307e23
--- /dev/null
+++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_npymath_exports.c
@@ -0,0 +1,46 @@
+/*
+ * This file contains exports of Numpy math functions needed by numba.
+ */
+
+#include "_pymodule.h"
+#include /* NOTE(review): header name is missing on this and the next line — looks lost in extraction, restore before use */
+#include
+
+
+/*
+ * Map Numpy C function symbols to their addresses.
+ */
+
+struct npymath_entry {
+    const char *name; /* symbol name, built as "npy_" plus the function name */
+    void *func; /* address of the matching npy_ function */
+};
+
+#define NPYMATH_SYMBOL(name) \
+    { "npy_" #name, (void*) npy_##name }
+
+static struct npymath_entry npymath_exports[] = {
+    /* double functions */
+    NPYMATH_SYMBOL(exp2),
+    NPYMATH_SYMBOL(log2),
+
+    NPYMATH_SYMBOL(logaddexp),
+    NPYMATH_SYMBOL(logaddexp2),
+    NPYMATH_SYMBOL(nextafter),
+    NPYMATH_SYMBOL(spacing),
+
+    NPYMATH_SYMBOL(modf),
+
+    /* float functions */
+    NPYMATH_SYMBOL(exp2f),
+    NPYMATH_SYMBOL(log2f),
+
+    NPYMATH_SYMBOL(logaddexpf),
+    NPYMATH_SYMBOL(logaddexp2f),
+    NPYMATH_SYMBOL(nextafterf),
+    NPYMATH_SYMBOL(spacingf),
+
+    NPYMATH_SYMBOL(modff),
+};
+
+#undef NPYMATH_SYMBOL
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_numba_common.h b/cv/3d_detection/centerpoint/pytorch/numba/numba/_numba_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5e67d9c6a38f3a3b09fa4c542d7fd02fa89f9ac
--- /dev/null
+++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_numba_common.h
@@ -0,0 +1,39 @@
+#ifndef NUMBA_COMMON_H_
+#define NUMBA_COMMON_H_
+
+/* __has_attribute() is a clang / gcc-5 macro; the fallback below makes every query answer 0 */
+#ifndef __has_attribute
+# define __has_attribute(x) 0
+#endif
+
+/* This attribute marks symbols that can be shared across C objects
+ * but are not exposed outside of a shared library or executable.
+ * Note this is default behaviour for global symbols under Windows.
+ */
+#if (__has_attribute(visibility) || \
+     (defined(__GNUC__) && __GNUC__ >= 4))
+#define VISIBILITY_HIDDEN __attribute__ ((visibility("hidden")))
+#else /* no visibility attribute detected: the marker expands to nothing */
+#define VISIBILITY_HIDDEN
+#endif
+
+/*
+ * Numba's version of the PyArray_DescrCheck macro from NumPy, use it as a
+ * direct replacement of NumPy's PyArray_DescrCheck to ensure binary
+ * compatibility.
+ *
+ * Details of why this is needed:
+ * NumPy 1.18 changed the definition of the PyArray_DescrCheck macro here:
+ * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698
+ * the result of this being that building against NumPy <1.18 would prevent
+ * Numba running against NumPy >= 1.20 as noted here:
+ * https://github.com/numba/numba/issues/6041#issuecomment-665132199
+ *
+ * This macro definition is copied from:
+ * https://github.com/numpy/numpy/commit/6108b5d1e138d07e3c9f2a4e3b1933749ad0e698#diff-ad2213da23136c5fc5883d9eb2d88666R26
+ *
+ * NOTE: This is the NumPy 1.18 and above version of the macro.
+ */
+#define NUMBA_PyArray_DescrCheck(op) PyObject_TypeCheck(op, &PyArrayDescr_Type)
+
+#endif /* NUMBA_COMMON_H_ */
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_pymodule.h b/cv/3d_detection/centerpoint/pytorch/numba/numba/_pymodule.h
new file mode 100644
index 0000000000000000000000000000000000000000..8622598a7f7bfe6408df3532bef4fcb75ace7883
--- /dev/null
+++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_pymodule.h
@@ -0,0 +1,32 @@
+#ifndef NUMBA_PY_MODULE_H_
+#define NUMBA_PY_MODULE_H_
+
+#define PY_SSIZE_T_CLEAN
+
+#include /* NOTE(review): header names are missing on these three lines — looks lost in extraction, restore before use */
+#include
+#include
+/* Module-definition helpers built on PyModuleDef / PyModule_Create and the PyInit_<name> entry point */
+#define MOD_ERROR_VAL NULL
+#define MOD_SUCCESS_VAL(val) val
+#define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void)
+#define MOD_DEF(ob, name, doc, methods) { \
+    static struct PyModuleDef moduledef = { \
+        PyModuleDef_HEAD_INIT, name, doc, -1, methods, NULL, NULL, NULL, NULL }; \
+    ob = PyModule_Create(&moduledef); }
+#define MOD_INIT_EXEC(name) PyInit_##name();
+/* PyString_* and PyInt_* names mapped onto their PyUnicode_* and PyLong_* counterparts */
+#define PyString_AsString PyUnicode_AsUTF8
+#define PyString_Check PyUnicode_Check
+#define PyString_FromFormat PyUnicode_FromFormat
+#define PyString_FromString PyUnicode_FromString
+#define PyString_InternFromString PyUnicode_InternFromString
+#define PyInt_Type PyLong_Type
+#define PyInt_Check PyLong_Check
+#define PyInt_CheckExact PyLong_CheckExact
+#define SetAttrStringFromVoidPointer(m, name) do { /* sets attribute "<name>" on m to the address of name, as a Python int */ \
+    PyObject *tmp = PyLong_FromVoidPtr((void *) &name); /* NOTE(review): tmp is not NULL-checked before use */ \
+    PyObject_SetAttrString(m, #name, tmp); \
+    Py_DECREF(tmp); } while (0)
+
+#endif /* NUMBA_PY_MODULE_H_ */
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_random.c b/cv/3d_detection/centerpoint/pytorch/numba/numba/_random.c
new file mode 100644
index 0000000000000000000000000000000000000000..bf95a3639d9fdf9b9e8ee636335716e1fa4a7052
--- /dev/null
+++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_random.c
@@ -0,0 +1,492 @@
+/*
+ * PRNG support.
+ */ + +#ifdef _MSC_VER +#define HAVE_PTHREAD_ATFORK 0 +#else +#define HAVE_PTHREAD_ATFORK 1 +#include +#endif + + +/* Magic Mersenne Twister constants */ +#define MT_N 624 +#define MT_M 397 +#define MT_MATRIX_A 0x9908b0dfU +#define MT_UPPER_MASK 0x80000000U +#define MT_LOWER_MASK 0x7fffffffU + +/* + * Note this structure is accessed in numba.targets.randomimpl, + * any changes here should be reflected there too. + */ +typedef struct { + int index; + /* unsigned int is sufficient on modern machines as we only need 32 bits */ + unsigned int mt[MT_N]; + int has_gauss; + double gauss; + int is_initialized; +} rnd_state_t; + +/* Some code portions below from CPython's _randommodule.c, some others + from Numpy's and Jean-Sebastien Roy's randomkit.c. */ + +NUMBA_EXPORT_FUNC(void) +numba_rnd_shuffle(rnd_state_t *state) +{ + int i; + unsigned int y; + + for (i = 0; i < MT_N - MT_M; i++) { + y = (state->mt[i] & MT_UPPER_MASK) | (state->mt[i+1] & MT_LOWER_MASK); + state->mt[i] = state->mt[i+MT_M] ^ (y >> 1) ^ + (-(int) (y & 1) & MT_MATRIX_A); + } + for (; i < MT_N - 1; i++) { + y = (state->mt[i] & MT_UPPER_MASK) | (state->mt[i+1] & MT_LOWER_MASK); + state->mt[i] = state->mt[i+(MT_M-MT_N)] ^ (y >> 1) ^ + (-(int) (y & 1) & MT_MATRIX_A); + } + y = (state->mt[MT_N - 1] & MT_UPPER_MASK) | (state->mt[0] & MT_LOWER_MASK); + state->mt[MT_N - 1] = state->mt[MT_M - 1] ^ (y >> 1) ^ + (-(int) (y & 1) & MT_MATRIX_A); +} + +/* Initialize mt[] with an integer seed */ +NUMBA_EXPORT_FUNC(void) +numba_rnd_init(rnd_state_t *state, unsigned int seed) +{ + unsigned int pos; + seed &= 0xffffffffU; + + /* Knuth's PRNG as used in the Mersenne Twister reference implementation */ + for (pos = 0; pos < MT_N; pos++) { + state->mt[pos] = seed; + seed = (1812433253U * (seed ^ (seed >> 30)) + pos + 1) & 0xffffffffU; + } + state->index = MT_N; + state->has_gauss = 0; + state->gauss = 0.0; + state->is_initialized = 1; +} + +/* Perturb mt[] with a key array */ +static void +rnd_init_by_array(rnd_state_t 
*state, unsigned int init_key[], size_t key_length) +{ + size_t i, j, k; + unsigned int *mt = state->mt; + + numba_rnd_init(state, 19650218U); + i = 1; j = 0; + k = (MT_N > key_length ? MT_N : key_length); + for (; k; k--) { + mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525U)) + + init_key[j] + (unsigned int) j; /* non linear */ + mt[i] &= 0xffffffffU; + i++; j++; + if (i >= MT_N) { mt[0] = mt[MT_N - 1]; i = 1; } + if (j >= key_length) j = 0; + } + for (k = MT_N - 1; k; k--) { + mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941U)) + - (unsigned int) i; /* non linear */ + mt[i] &= 0xffffffffU; + i++; + if (i >= MT_N) { mt[0] = mt[MT_N - 1]; i=1; } + } + + mt[0] = 0x80000000U; /* MSB is 1; ensuring non-zero initial array */ + state->index = MT_N; + state->has_gauss = 0; + state->gauss = 0.0; + state->is_initialized = 1; +} + +/* + * Management of thread-local random state. + */ + +static int rnd_globally_initialized; + +#ifdef _MSC_VER +#define THREAD_LOCAL(ty) __declspec(thread) ty +#else +/* Non-standard C99 extension that's understood by gcc and clang */ +#define THREAD_LOCAL(ty) __thread ty +#endif + +static THREAD_LOCAL(rnd_state_t) numba_py_random_state; +static THREAD_LOCAL(rnd_state_t) numba_np_random_state; +static THREAD_LOCAL(rnd_state_t) numba_internal_random_state; + +/* Seed the state with random bytes */ +static int +rnd_seed_with_bytes(rnd_state_t *state, Py_buffer *buf) +{ + unsigned int *keys; + unsigned char *bytes; + size_t i, nkeys; + + nkeys = buf->len / sizeof(unsigned int); + keys = (unsigned int *) PyMem_Malloc(nkeys * sizeof(unsigned int)); + if (keys == NULL) { + PyBuffer_Release(buf); + return -1; + } + bytes = (unsigned char *) buf->buf; + /* Convert input bytes to int32 keys, without violating alignment + * constraints. 
+ */ + for (i = 0; i < nkeys; i++, bytes += 4) { + keys[i] = + ((unsigned int)bytes[3] << 24) + + ((unsigned int)bytes[2] << 16) + + ((unsigned int)bytes[1] << 8) + + ((unsigned int)bytes[0] << 0); + } + PyBuffer_Release(buf); + rnd_init_by_array(state, keys, nkeys); + PyMem_Free(keys); + return 0; +} + +#if HAVE_PTHREAD_ATFORK +/* After a fork(), the child should reseed its random states. + * Since only the main thread survives in the child, it's enough to mark + * the current thread-local states as uninitialized. + */ +static void +rnd_atfork_child(void) +{ + numba_py_random_state.is_initialized = 0; + numba_np_random_state.is_initialized = 0; + numba_internal_random_state.is_initialized = 0; +} +#endif + +/* Global initialization routine. It must be called as early as possible. + */ +NUMBA_EXPORT_FUNC(void) +numba_rnd_ensure_global_init(void) +{ + if (!rnd_globally_initialized) { +#if HAVE_PTHREAD_ATFORK + pthread_atfork(NULL, NULL, rnd_atfork_child); +#endif + numba_py_random_state.is_initialized = 0; + numba_np_random_state.is_initialized = 0; + numba_internal_random_state.is_initialized = 0; + rnd_globally_initialized = 1; + } +} + +/* First-time init a random state */ +static void +rnd_implicit_init(rnd_state_t *state) +{ + /* Initialize with random bytes. The easiest way to get good-quality + * cross-platform random bytes is still to call os.urandom() + * using the Python interpreter... + */ + PyObject *module, *bufobj; + Py_buffer buf; + PyGILState_STATE gilstate = PyGILState_Ensure(); + + module = PyImport_ImportModuleNoBlock("os"); + if (module == NULL) + goto error; + /* Read as many bytes as necessary to get the full entropy + * exploitable by the MT generator. 
+ */ + bufobj = PyObject_CallMethod(module, "urandom", "i", + (int) (MT_N * sizeof(unsigned int))); + Py_DECREF(module); + if (bufobj == NULL) + goto error; + if (PyObject_GetBuffer(bufobj, &buf, PyBUF_SIMPLE)) + goto error; + Py_DECREF(bufobj); + if (rnd_seed_with_bytes(state, &buf)) + goto error; + /* state->is_initialized is set now */ + + PyGILState_Release(gilstate); + return; + +error: + /* In normal conditions, os.urandom() and PyMem_Malloc() shouldn't fail, + * and we don't want the caller to deal with errors, so just bail out. + */ + if (PyErr_Occurred()) + PyErr_Print(); + Py_FatalError(NULL); +} + +/* Functions returning the thread-local random state pointer. + * The LLVM JIT doesn't support thread-local variables so we rely + * on the C compiler instead. + */ + +NUMBA_EXPORT_FUNC(rnd_state_t *) +numba_get_py_random_state(void) +{ + rnd_state_t *state = &numba_py_random_state; + if (!state->is_initialized) + rnd_implicit_init(state); + return state; +} + +NUMBA_EXPORT_FUNC(rnd_state_t *) +numba_get_np_random_state(void) +{ + rnd_state_t *state = &numba_np_random_state; + if (!state->is_initialized) + rnd_implicit_init(state); + return state; +} + +NUMBA_EXPORT_FUNC(rnd_state_t *) +numba_get_internal_random_state(void) +{ + rnd_state_t *state = &numba_internal_random_state; + if (!state->is_initialized) + rnd_implicit_init(state); + return state; +} + +/* + * Python-exposed helpers for state management and testing. 
+ */ +static int +rnd_state_converter(PyObject *obj, rnd_state_t **state) +{ + *state = (rnd_state_t *) PyLong_AsVoidPtr(obj); + return (*state != NULL || !PyErr_Occurred()); +} + +NUMBA_EXPORT_FUNC(PyObject *) +_numba_rnd_get_py_state_ptr(PyObject *self) +{ + return PyLong_FromVoidPtr(numba_get_py_random_state()); +} + +NUMBA_EXPORT_FUNC(PyObject *) +_numba_rnd_get_np_state_ptr(PyObject *self) +{ + return PyLong_FromVoidPtr(numba_get_np_random_state()); +} + +NUMBA_EXPORT_FUNC(PyObject *) +_numba_rnd_shuffle(PyObject *self, PyObject *arg) +{ + rnd_state_t *state; + if (!rnd_state_converter(arg, &state)) + return NULL; + numba_rnd_shuffle(state); + Py_RETURN_NONE; +} + +NUMBA_EXPORT_FUNC(PyObject *) +_numba_rnd_set_state(PyObject *self, PyObject *args) +{ + int i, index; + rnd_state_t *state; + PyObject *tuplearg, *intlist; + + if (!PyArg_ParseTuple(args, "O&O!:rnd_set_state", + rnd_state_converter, &state, + &PyTuple_Type, &tuplearg)) + return NULL; + if (!PyArg_ParseTuple(tuplearg, "iO!", &index, &PyList_Type, &intlist)) + return NULL; + if (PyList_GET_SIZE(intlist) != MT_N) { + PyErr_SetString(PyExc_ValueError, "list object has wrong size"); + return NULL; + } + state->index = index; + for (i = 0; i < MT_N; i++) { + PyObject *v = PyList_GET_ITEM(intlist, i); + unsigned long x = PyLong_AsUnsignedLong(v); + if (x == (unsigned long) -1 && PyErr_Occurred()) + return NULL; + state->mt[i] = (unsigned int) x; + } + state->has_gauss = 0; + state->gauss = 0.0; + state->is_initialized = 1; + Py_RETURN_NONE; +} + +NUMBA_EXPORT_FUNC(PyObject *) +_numba_rnd_get_state(PyObject *self, PyObject *arg) +{ + PyObject *intlist; + int i; + rnd_state_t *state; + if (!rnd_state_converter(arg, &state)) + return NULL; + + intlist = PyList_New(MT_N); + if (intlist == NULL) + return NULL; + for (i = 0; i < MT_N; i++) { + PyObject *v = PyLong_FromUnsignedLong(state->mt[i]); + if (v == NULL) { + Py_DECREF(intlist); + return NULL; + } + PyList_SET_ITEM(intlist, i, v); + } + return 
Py_BuildValue("iN", state->index, intlist); +} + +NUMBA_EXPORT_FUNC(PyObject *) +_numba_rnd_seed(PyObject *self, PyObject *args) +{ + unsigned int seed; + rnd_state_t *state; + + if (!PyArg_ParseTuple(args, "O&I:rnd_seed", + rnd_state_converter, &state, &seed)) { + /* rnd_seed_*(bytes-like object) */ + Py_buffer buf; + + PyErr_Clear(); + if (!PyArg_ParseTuple(args, "O&s*:rnd_seed", + rnd_state_converter, &state, &buf)) + return NULL; + + if (rnd_seed_with_bytes(state, &buf)) + return NULL; + else + Py_RETURN_NONE; + } + else { + /* rnd_seed_*(int32) */ + numba_rnd_init(state, seed); + Py_RETURN_NONE; + } +} + +/* + * Random distribution helpers. + * Most code straight from Numpy's distributions.c. + */ + +#ifndef M_PI +#define M_PI 3.14159265358979323846264338328 +#endif + +NUMBA_EXPORT_FUNC(unsigned int) +get_next_int32(rnd_state_t *state) +{ + unsigned int y; + + if (state->index == MT_N) { + numba_rnd_shuffle(state); + state->index = 0; + } + y = state->mt[state->index++]; + /* Tempering */ + y ^= (y >> 11); + y ^= (y << 7) & 0x9d2c5680U; + y ^= (y << 15) & 0xefc60000U; + y ^= (y >> 18); + return y; +} + +NUMBA_EXPORT_FUNC(double) +get_next_double(rnd_state_t *state) +{ + double a = get_next_int32(state) >> 5; + double b = get_next_int32(state) >> 6; + return (a * 67108864.0 + b) / 9007199254740992.0; +} + +NUMBA_EXPORT_FUNC(double) +loggam(double x) +{ + double x0, x2, xp, gl, gl0; + long k, n; + + static double a[10] = {8.333333333333333e-02,-2.777777777777778e-03, + 7.936507936507937e-04,-5.952380952380952e-04, + 8.417508417508418e-04,-1.917526917526918e-03, + 6.410256410256410e-03,-2.955065359477124e-02, + 1.796443723688307e-01,-1.39243221690590e+00}; + x0 = x; + n = 0; + if ((x == 1.0) || (x == 2.0)) + { + return 0.0; + } + else if (x <= 7.0) + { + n = (long)(7 - x); + x0 = x + n; + } + x2 = 1.0/(x0*x0); + xp = 2*M_PI; + gl0 = a[9]; + for (k=8; k>=0; k--) + { + gl0 *= x2; + gl0 += a[k]; + } + gl = gl0/x0 + 0.5*log(xp) + (x0-0.5)*log(x0) - x0; + if (x <= 
7.0) + { + for (k=1; k<=n; k++) + { + gl -= log(x0-1.0); + x0 -= 1.0; + } + } + return gl; +} + + +NUMBA_EXPORT_FUNC(int64_t) +numba_poisson_ptrs(rnd_state_t *state, double lam) +{ + /* This method is invoked only if the parameter lambda of this + * distribution is big enough ( >= 10 ). The algorithm used is + * described in "Hörmann, W. 1992. 'The Transformed Rejection + * Method for Generating Poisson Random Variables'. + * The implementation comes straight from Numpy. + */ + int64_t k; + double U, V, slam, loglam, a, b, invalpha, vr, us; + + slam = sqrt(lam); + loglam = log(lam); + b = 0.931 + 2.53*slam; + a = -0.059 + 0.02483*b; + invalpha = 1.1239 + 1.1328/(b-3.4); + vr = 0.9277 - 3.6224/(b-2); + + while (1) + { + U = get_next_double(state) - 0.5; + V = get_next_double(state); + us = 0.5 - fabs(U); + k = (int64_t) floor((2*a/us + b)*U + lam + 0.43); + if ((us >= 0.07) && (V <= vr)) + { + return k; + } + if ((k < 0) || + ((us < 0.013) && (V > us))) + { + continue; + } + if ((log(V) + log(invalpha) - log(a/(us*us)+b)) <= + (-lam + (double) k*loglam - loggam((double) k+1))) + { + return k; + } + } +} diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_typeof.c b/cv/3d_detection/centerpoint/pytorch/numba/numba/_typeof.c new file mode 100644 index 0000000000000000000000000000000000000000..334ff2b999f501c097d1697877c017bf7d632e44 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_typeof.c @@ -0,0 +1,1133 @@ +#include "_pymodule.h" + +#include +#include +#include + +#include "_numba_common.h" +#include "_typeof.h" +#include "_hashtable.h" +#include "_devicearray.h" +#include "pyerrors.h" + +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include + + +/* Cached typecodes for basic scalar types */ +static int tc_int8; +static int tc_int16; +static int tc_int32; +static int tc_int64; +static int tc_uint8; +static int tc_uint16; +static int tc_uint32; +static int tc_uint64; +static int tc_float32; +static int tc_float64; +static int 
tc_complex64; +static int tc_complex128; +static int BASIC_TYPECODES[12]; + +static int tc_intp; + +/* The type object for the numba .dispatcher.OmittedArg class + * that wraps omitted arguments. + */ +static PyObject *omittedarg_type; + +static PyObject *typecache; +static PyObject *ndarray_typecache; +static PyObject *structured_dtypes; + +static PyObject *str_typeof_pyval = NULL; +static PyObject *str_value = NULL; +static PyObject *str_numba_type = NULL; + +/* CUDA device array API */ +void **DeviceArray_API; + +/* + * Type fingerprint computation. + */ + +typedef struct { + /* A buffer the fingerprint will be written to */ + char *buf; + size_t n; + size_t allocated; + /* A preallocated buffer, sufficient to fit the fingerprint for most types */ + char static_buf[40]; +} string_writer_t; + +static void +string_writer_init(string_writer_t *w) +{ + w->buf = w->static_buf; + w->n = 0; + w->allocated = sizeof(w->static_buf) / sizeof(unsigned char); +} + +static void +string_writer_clear(string_writer_t *w) +{ + if (w->buf != w->static_buf) + free(w->buf); +} + +static void +string_writer_move(string_writer_t *dest, const string_writer_t *src) +{ + dest->n = src->n; + dest->allocated = src->allocated; + if (src->buf == src->static_buf) { + dest->buf = dest->static_buf; + memcpy(dest->buf, src->buf, src->n); + } + else { + dest->buf = src->buf; + } +} + +/* Ensure at least *bytes* can be appended to the string writer's buffer. 
*/ +static int +string_writer_ensure(string_writer_t *w, size_t bytes) +{ + size_t newsize; + bytes += w->n; + if (bytes <= w->allocated) + return 0; + newsize = (w->allocated << 2) + 1; + if (newsize < bytes) + newsize = bytes; + if (w->buf == w->static_buf) + w->buf = malloc(newsize); + else + w->buf = realloc(w->buf, newsize); + if (w->buf) { + w->allocated = newsize; + return 0; + } + else { + PyErr_NoMemory(); + return -1; + } +} + +static int +string_writer_put_char(string_writer_t *w, unsigned char c) +{ + if (string_writer_ensure(w, 1)) + return -1; + w->buf[w->n++] = c; + return 0; +} + +static int +string_writer_put_int32(string_writer_t *w, unsigned int v) +{ + if (string_writer_ensure(w, 4)) + return -1; + w->buf[w->n] = v & 0xff; + w->buf[w->n + 1] = (v >> 8) & 0xff; + w->buf[w->n + 2] = (v >> 16) & 0xff; + w->buf[w->n + 3] = (v >> 24) & 0xff; + w->n += 4; + return 0; +} + +static int +string_writer_put_intp(string_writer_t *w, npy_intp v) +{ + if (string_writer_ensure(w, NPY_SIZEOF_PY_INTPTR_T)) + return -1; + w->buf[w->n] = v & 0xff; + w->buf[w->n + 1] = (v >> 8) & 0xff; + w->buf[w->n + 2] = (v >> 16) & 0xff; + w->buf[w->n + 3] = (v >> 24) & 0xff; +#if NPY_SIZEOF_PY_INTPTR_T == 8 + w->buf[w->n + 4] = (v >> 32) & 0xff; + w->buf[w->n + 5] = (v >> 40) & 0xff; + w->buf[w->n + 6] = (v >> 48) & 0xff; + w->buf[w->n + 7] = (v >> 56) & 0xff; +#endif + w->n += NPY_SIZEOF_PY_INTPTR_T; + return 0; +} + +static int +string_writer_put_string(string_writer_t *w, const char *s) +{ + if (s == NULL) { + return string_writer_put_char(w, 0); + } + else { + size_t N = strlen(s) + 1; + if (string_writer_ensure(w, N)) + return -1; + memcpy(w->buf + w->n, s, N); + w->n += N; + return 0; + } +} + +enum opcode { + OP_START_TUPLE = '(', + OP_END_TUPLE = ')', + OP_INT = 'i', + OP_FLOAT = 'f', + OP_COMPLEX = 'c', + OP_BOOL = '?', + OP_OMITTED = '!', + + OP_BYTEARRAY = 'a', + OP_BYTES = 'b', + OP_NONE = 'n', + OP_LIST = '[', + OP_SET = '{', + + OP_BUFFER = 'B', + OP_NP_SCALAR = 
'S', + OP_NP_ARRAY = 'A', + OP_NP_DTYPE = 'D' +}; + +#define TRY(func, w, arg) \ + do { \ + if (func(w, arg)) return -1; \ + } while (0) + + +static int +fingerprint_unrecognized(void) +{ + PyErr_SetString(PyExc_NotImplementedError, + "cannot compute type fingerprint for value"); + return -1; +} + +static int +compute_dtype_fingerprint(string_writer_t *w, PyArray_Descr *descr) +{ + int typenum = descr->type_num; + if (typenum < NPY_OBJECT) + return string_writer_put_char(w, (char) typenum); + if (typenum == NPY_VOID) { + /* Structured dtype: serialize the dtype pointer. Unfortunately, + * some structured dtypes can be ephemeral, so we have to + * intern them to avoid pointer reuse and fingerprint collisions. + * (e.g. np.recarray(dtype=some_dtype) creates a new dtype + * equal to some_dtype) + */ + PyObject *interned = PyDict_GetItem(structured_dtypes, + (PyObject *) descr); + if (interned == NULL) { + interned = (PyObject *) descr; + if (PyDict_SetItem(structured_dtypes, interned, interned)) + return -1; + } + TRY(string_writer_put_char, w, (char) typenum); + return string_writer_put_intp(w, (npy_intp) interned); + } +#if NPY_API_VERSION >= 0x00000007 + if (PyTypeNum_ISDATETIME(typenum)) { + PyArray_DatetimeMetaData *md; + md = &(((PyArray_DatetimeDTypeMetaData *)descr->c_metadata)->meta); + TRY(string_writer_put_char, w, (char) typenum); + TRY(string_writer_put_char, w, (char) md->base); + return string_writer_put_int32(w, (char) md->num); + } +#endif + + return fingerprint_unrecognized(); +} + +static int +compute_fingerprint(string_writer_t *w, PyObject *val) +{ + /* + * Implementation note: for performance, we start with common + * types that can be tested with fast checks. 
+ */ + if (val == Py_None) + return string_writer_put_char(w, OP_NONE); + if (PyBool_Check(val)) + return string_writer_put_char(w, OP_BOOL); + /* Note we avoid matching int subclasses such as IntEnum */ + if (PyInt_CheckExact(val) || PyLong_CheckExact(val)) + return string_writer_put_char(w, OP_INT); + if (PyFloat_Check(val)) + return string_writer_put_char(w, OP_FLOAT); + if (PyComplex_CheckExact(val)) + return string_writer_put_char(w, OP_COMPLEX); + if (PyTuple_Check(val)) { + if(PyTuple_CheckExact(val)) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(val); + TRY(string_writer_put_char, w, OP_START_TUPLE); + for (i = 0; i < n; i++) + TRY(compute_fingerprint, w, PyTuple_GET_ITEM(val, i)); + TRY(string_writer_put_char, w, OP_END_TUPLE); + return 0; + } + /* as per typeof.py, check "_asdict" for namedtuple. */ + else if(PyObject_HasAttrString(val, "_asdict")) + { + /* + * This encodes the class name and field names of a namedtuple into + * the fingerprint on the condition that the number of fields is + * small (<10) and that the class name and field names are encodable + * as ASCII. 
+ */ + PyObject * clazz = NULL; + PyObject * name = NULL; + PyObject * _fields = PyObject_GetAttrString(val, "_fields"); + PyObject * field = NULL; + PyObject * ascii_str = NULL; + Py_ssize_t i, n, j, flen; + char * buf = NULL; + int ret; + + clazz = PyObject_GetAttrString(val, "__class__"); + if (clazz == NULL) + return -1; + + name = PyObject_GetAttrString(clazz, "__name__"); + Py_DECREF(clazz); + if (name == NULL) + return -1; + + ascii_str = PyUnicode_AsEncodedString(name, "ascii", "ignore"); + Py_DECREF(name); + if (ascii_str == NULL) + return -1; + ret = PyBytes_AsStringAndSize(ascii_str, &buf, &flen); + + if (ret == -1) + return -1; + for(j = 0; j < flen; j++) { + TRY(string_writer_put_char, w, buf[j]); + } + Py_DECREF(ascii_str); + + if (_fields == NULL) + return -1; + + n = PyTuple_GET_SIZE(val); + + TRY(string_writer_put_char, w, OP_START_TUPLE); + for (i = 0; i < n; i++) { + field = PyTuple_GET_ITEM(_fields, i); + if (field == NULL) + return -1; + ascii_str = PyUnicode_AsEncodedString(field, "ascii", "ignore"); + if (ascii_str == NULL) + return -1; + ret = PyBytes_AsStringAndSize(ascii_str, &buf, &flen); + if (ret == -1) + return -1; + for(j = 0; j < flen; j++) { + TRY(string_writer_put_char, w, buf[j]); + } + Py_DECREF(ascii_str); + TRY(compute_fingerprint, w, PyTuple_GET_ITEM(val, i)); + } + TRY(string_writer_put_char, w, OP_END_TUPLE); + Py_DECREF(_fields); + return 0; + } + } + if (PyBytes_Check(val)) + return string_writer_put_char(w, OP_BYTES); + if (PyByteArray_Check(val)) + return string_writer_put_char(w, OP_BYTEARRAY); + if ((PyObject *) Py_TYPE(val) == omittedarg_type) { + PyObject *default_val = PyObject_GetAttr(val, str_value); + if (default_val == NULL) + return -1; + TRY(string_writer_put_char, w, OP_OMITTED); + TRY(compute_fingerprint, w, default_val); + Py_DECREF(default_val); + return 0; + } + if (PyArray_IsScalar(val, Generic)) { + /* Note: PyArray_DescrFromScalar() may be a bit slow on + non-trivial types. 
*/ + PyArray_Descr *descr = PyArray_DescrFromScalar(val); + if (descr == NULL) + return -1; + TRY(string_writer_put_char, w, OP_NP_SCALAR); + TRY(compute_dtype_fingerprint, w, descr); + Py_DECREF(descr); + return 0; + } + if (PyArray_Check(val)) { + PyArrayObject *ary = (PyArrayObject *) val; + int ndim = PyArray_NDIM(ary); + + TRY(string_writer_put_char, w, OP_NP_ARRAY); + TRY(string_writer_put_int32, w, ndim); + if (PyArray_IS_C_CONTIGUOUS(ary)) + TRY(string_writer_put_char, w, 'C'); + else if (PyArray_IS_F_CONTIGUOUS(ary)) + TRY(string_writer_put_char, w, 'F'); + else + TRY(string_writer_put_char, w, 'A'); + if (PyArray_ISWRITEABLE(ary)) + TRY(string_writer_put_char, w, 'W'); + else + TRY(string_writer_put_char, w, 'R'); + return compute_dtype_fingerprint(w, PyArray_DESCR(ary)); + } + if (PyList_Check(val)) { + Py_ssize_t n = PyList_GET_SIZE(val); + if (n == 0) { + PyErr_SetString(PyExc_ValueError, + "cannot compute fingerprint of empty list"); + return -1; + } + /* Only the first item is considered, as in typeof.py */ + TRY(string_writer_put_char, w, OP_LIST); + TRY(compute_fingerprint, w, PyList_GET_ITEM(val, 0)); + return 0; + } + /* Note we only accept sets, not frozensets */ + if (Py_TYPE(val) == &PySet_Type) { + Py_hash_t h; + PyObject *item; + Py_ssize_t pos = 0; + /* Only one item is considered, as in typeof.py */ + if (!_PySet_NextEntry(val, &pos, &item, &h)) { + /* Empty set */ + PyErr_SetString(PyExc_ValueError, + "cannot compute fingerprint of empty set"); + return -1; + } + TRY(string_writer_put_char, w, OP_SET); + TRY(compute_fingerprint, w, item); + return 0; + } + if (PyObject_CheckBuffer(val)) { + Py_buffer buf; + int flags = PyBUF_ND | PyBUF_STRIDES | PyBUF_FORMAT; + char contig; + int ndim; + char readonly; + + /* Attempt to get a writable buffer, then fallback on read-only */ + if (PyObject_GetBuffer(val, &buf, flags | PyBUF_WRITABLE)) { + PyErr_Clear(); + if (PyObject_GetBuffer(val, &buf, flags)) + goto _unrecognized; + } + if 
(PyBuffer_IsContiguous(&buf, 'C')) + contig = 'C'; + else if (PyBuffer_IsContiguous(&buf, 'F')) + contig = 'F'; + else + contig = 'A'; + ndim = buf.ndim; + readonly = buf.readonly ? 'R' : 'W'; + if (string_writer_put_char(w, OP_BUFFER) || + string_writer_put_int32(w, ndim) || + string_writer_put_char(w, contig) || + string_writer_put_char(w, readonly) || + string_writer_put_string(w, buf.format) || + /* We serialize the object's Python type as well, to + distinguish between types which have Numba specializations + (e.g. array.array() vs. memoryview) + */ + string_writer_put_intp(w, (npy_intp) Py_TYPE(val))) { + PyBuffer_Release(&buf); + return -1; + } + PyBuffer_Release(&buf); + return 0; + } + if (NUMBA_PyArray_DescrCheck(val)) { + TRY(string_writer_put_char, w, OP_NP_DTYPE); + return compute_dtype_fingerprint(w, (PyArray_Descr *) val); + } + +_unrecognized: + /* Type not recognized */ + return fingerprint_unrecognized(); +} + +PyObject * +typeof_compute_fingerprint(PyObject *val) +{ + PyObject *res; + string_writer_t w; + + string_writer_init(&w); + + if (compute_fingerprint(&w, val)) + goto error; + res = PyBytes_FromStringAndSize(w.buf, w.n); + + string_writer_clear(&w); + return res; + +error: + string_writer_clear(&w); + return NULL; +} + +/* + * Getting the typecode from a Type object. + */ +static int +_typecode_from_type_object(PyObject *tyobj) { + int typecode; + PyObject *tmpcode = PyObject_GetAttrString(tyobj, "_code"); + if (tmpcode == NULL) { + return -1; + } + typecode = PyLong_AsLong(tmpcode); + Py_DECREF(tmpcode); + return typecode; +} + +/* When we want to cache the type's typecode for later lookup, we need to + keep a reference to the returned type object so that it cannot be + deleted. This is because of the following events occurring when first + using a @jit function for a given set of types: + + 1. 
typecode_fallback requests a new typecode for an arbitrary Python value; + this implies creating a Numba type object (on the first dispatcher call); + the typecode cache is then populated. + 2. matching of the typecode list in _dispatcherimpl.cpp fails, since the + typecode is new. + 3. we have to compile: compile_and_invoke() is called, it will invoke + Dispatcher_Insert to register the new signature. + + The reference to the Numba type object returned in step 1 is deleted as + soon as we call Py_DECREF() on it, since we are holding the only + reference. If this happens and we use the typecode we got to populate the + cache, then the cache won't ever return the correct typecode, and the + dispatcher will never successfully match the typecodes with those of + some already-compiled instance. So we need to make sure that we don't + call Py_DECREF() on objects whose typecode will be used to populate the + cache. This is ensured by calling _typecode_fallback with + retain_reference == 0. + + Note that technically we are leaking the reference, since we do not continue + to hold a pointer to the type object that we get back from typeof_pyval. + However, we don't need to refer to it again, we just need to make sure that + it is never deleted. +*/ +static int +_typecode_fallback(PyObject *dispatcher, PyObject *val, + int retain_reference) { + PyObject *numba_type; + int typecode; + + /* + * For values that define "_numba_type_", which holds a numba Type + * instance that should be used as the type of the value. + * Note this is done here, not in typeof_typecode(), so that + * some values can still benefit from fingerprint caching. 
+ */ + if (PyObject_HasAttr(val, str_numba_type)) { + numba_type = PyObject_GetAttrString(val, "_numba_type_"); + if (!numba_type) + return -1; + } + else { + // Go back to the interpreter + numba_type = PyObject_CallMethodObjArgs((PyObject *) dispatcher, + str_typeof_pyval, val, NULL); + } + if (!numba_type) + return -1; + typecode = _typecode_from_type_object(numba_type); + if (!retain_reference) + Py_DECREF(numba_type); + return typecode; +} + +/* Variations on _typecode_fallback for convenience */ + +static +int typecode_fallback(PyObject *dispatcher, PyObject *val) { + return _typecode_fallback(dispatcher, val, 0); +} + +static +int typecode_fallback_keep_ref(PyObject *dispatcher, PyObject *val) { + return _typecode_fallback(dispatcher, val, 1); +} + + +/* A cache mapping fingerprints (string_writer_t *) to typecodes (int). */ +static _Numba_hashtable_t *fingerprint_hashtable = NULL; + +static Py_uhash_t +hash_writer(const void *key) +{ + string_writer_t *writer = (string_writer_t *) key; + Py_uhash_t x = 0; + + /* The old FNV algorithm used by Python 2 */ + if (writer->n > 0) { + unsigned char *p = (unsigned char *) writer->buf; + Py_ssize_t len = writer->n; + x ^= *p << 7; + while (--len >= 0) + x = (1000003*x) ^ *p++; + x ^= writer->n; + if (x == (Py_uhash_t) -1) + x = -2; + } + return x; +} + +static int +compare_writer(const void *key, const _Numba_hashtable_entry_t *entry) +{ + string_writer_t *v = (string_writer_t *) key; + string_writer_t *w = (string_writer_t *) entry->key; + if (v->n != w->n) + return 0; + return memcmp(v->buf, w->buf, v->n) == 0; +} + +/* Try to compute *val*'s typecode using its fingerprint and the + * fingerprint->typecode cache. 
+ */ +static int +typecode_using_fingerprint(PyObject *dispatcher, PyObject *val) +{ + int typecode; + string_writer_t w; + + string_writer_init(&w); + + if (compute_fingerprint(&w, val)) { + string_writer_clear(&w); + if (PyErr_ExceptionMatches(PyExc_NotImplementedError)) { + /* Can't compute a type fingerprint for the given value, + fall back on typeof() without caching. */ + PyErr_Clear(); + return typecode_fallback(dispatcher, val); + } + return -1; + } + if (_Numba_HASHTABLE_GET(fingerprint_hashtable, &w, typecode) > 0) { + /* Cache hit */ + string_writer_clear(&w); + return typecode; + } + + /* Not found in cache: invoke pure Python typeof() and cache result. + * Note we have to keep the type alive forever as explained + * above in _typecode_fallback(). + */ + typecode = typecode_fallback_keep_ref(dispatcher, val); + if (typecode >= 0) { + string_writer_t *key = (string_writer_t *) malloc(sizeof(string_writer_t)); + if (key == NULL) { + string_writer_clear(&w); + PyErr_NoMemory(); + return -1; + } + /* Ownership of the string writer's buffer will be transferred + * to the hash table. + */ + string_writer_move(key, &w); + if (_Numba_HASHTABLE_SET(fingerprint_hashtable, key, typecode)) { + string_writer_clear(&w); + PyErr_NoMemory(); + return -1; + } + } + return typecode; +} + + +/* + * Direct lookup table for extra-fast typecode resolution of simple array types. + */ + +#define N_DTYPES 12 +#define N_NDIM 5 /* Fast path for up to 5D array */ +#define N_LAYOUT 3 +static int cached_arycode[N_NDIM][N_LAYOUT][N_DTYPES]; + +/* Convert a Numpy dtype number to an internal index into cached_arycode. + The returned value must also be a valid index into BASIC_TYPECODES. 
*/ +static int dtype_num_to_typecode(int type_num) { + int dtype; + switch(type_num) { + case NPY_INT8: + dtype = 0; + break; + case NPY_INT16: + dtype = 1; + break; + case NPY_INT32: + dtype = 2; + break; + case NPY_INT64: + dtype = 3; + break; + case NPY_UINT8: + dtype = 4; + break; + case NPY_UINT16: + dtype = 5; + break; + case NPY_UINT32: + dtype = 6; + break; + case NPY_UINT64: + dtype = 7; + break; + case NPY_FLOAT32: + dtype = 8; + break; + case NPY_FLOAT64: + dtype = 9; + break; + case NPY_COMPLEX64: + dtype = 10; + break; + case NPY_COMPLEX128: + dtype = 11; + break; + default: + /* Type not included in the global lookup table */ + dtype = -1; + } + return dtype; +} + +static +int get_cached_typecode(PyArray_Descr* descr) { + PyObject* tmpobject = PyDict_GetItem(typecache, (PyObject*)descr); + if (tmpobject == NULL) + return -1; + + return PyLong_AsLong(tmpobject); +} + +static +void cache_typecode(PyArray_Descr* descr, int typecode) { + PyObject* value = PyLong_FromLong(typecode); + PyDict_SetItem(typecache, (PyObject*)descr, value); + Py_DECREF(value); +} + +static +PyObject* ndarray_key(int ndim, int layout, PyArray_Descr* descr) { + PyObject* tmpndim = PyLong_FromLong(ndim); + PyObject* tmplayout = PyLong_FromLong(layout); + PyObject* key = PyTuple_Pack(3, tmpndim, tmplayout, descr); + Py_DECREF(tmpndim); + Py_DECREF(tmplayout); + return key; +} + +static +int get_cached_ndarray_typecode(int ndim, int layout, PyArray_Descr* descr) { + PyObject* key = ndarray_key(ndim, layout, descr); + PyObject *tmpobject = PyDict_GetItem(ndarray_typecache, key); + if (tmpobject == NULL) + return -1; + + Py_DECREF(key); + return PyLong_AsLong(tmpobject); +} + +static +void cache_ndarray_typecode(int ndim, int layout, PyArray_Descr* descr, + int typecode) { + PyObject* key = ndarray_key(ndim, layout, descr); + PyObject* value = PyLong_FromLong(typecode); + PyDict_SetItem(ndarray_typecache, key, value); + Py_DECREF(key); + Py_DECREF(value); +} + +static +int 
typecode_ndarray(PyObject *dispatcher, PyArrayObject *ary) { + /* Resolve the typecode of an ndarray. Behaved arrays with 1..N_NDIM + * dimensions and a dtype known to dtype_num_to_typecode() use the + * cached_arycode table; NPY_VOID arrays use the ndarray_typecache dict; + * everything else is resolved through typecode_using_fingerprint(). + */ + int typecode; + int dtype; + int ndim = PyArray_NDIM(ary); + int layout = 0; + + /* The order in which we check for the right contiguous-ness is important. + The order must match the order by numba.numpy_support.map_layout. + Further, only *contiguous-ness* is checked, not alignment, byte order or + write permissions. + */ + if (PyArray_IS_C_CONTIGUOUS(ary)){ + layout = 1; + } else if (PyArray_IS_F_CONTIGUOUS(ary)) { + layout = 2; + } + + /* the typecode cache by convention is for "behaved" arrays (aligned and + * writeable), all others must be forced to the fall back */ + if (!PyArray_ISBEHAVED(ary)) goto FALLBACK; + + if (ndim <= 0 || ndim > N_NDIM) goto FALLBACK; + + dtype = dtype_num_to_typecode(PyArray_TYPE(ary)); + if (dtype == -1) goto FALLBACK; + + /* Fast path, using direct table lookup */ + assert(layout < N_LAYOUT); + assert(ndim <= N_NDIM); + assert(dtype < N_DTYPES); + + /* ndim is 1-based while the first table axis is 0-based */ + typecode = cached_arycode[ndim - 1][layout][dtype]; + if (typecode == -1) { + /* First use of this table entry, so it requires populating */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cached_arycode[ndim - 1][layout][dtype] = typecode; + } + return typecode; + +FALLBACK: + /* Slower path, for non-trivial array types */ + + /* If this isn't a structured array then we can't use the cache */ + if (PyArray_TYPE(ary) != NPY_VOID) + return typecode_using_fingerprint(dispatcher, (PyObject *) ary); + + /* Check type cache */ + typecode = get_cached_ndarray_typecode(ndim, layout, PyArray_DESCR(ary)); + if (typecode == -1) { + /* First use of this type, use fallback and populate the cache */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + /* Only cache a successful resolution, using the same + * "typecode >= 0" test typecode_using_fingerprint() applies before + * caching. A negative result is returned to the caller untouched + * and no dict entry is created for it. + */ + if (typecode >= 0) + cache_ndarray_typecode(ndim, layout, PyArray_DESCR(ary), typecode); + } + return typecode; +} + +static +int typecode_arrayscalar(PyObject *dispatcher, PyObject* aryscalar) { + int typecode; + PyArray_Descr *descr; + descr = 
PyArray_DescrFromScalar(aryscalar); + if (!descr) + return typecode_using_fingerprint(dispatcher, aryscalar); + + /* Is it a structured scalar? */ + if (descr->type_num == NPY_VOID) { + typecode = get_cached_typecode(descr); + if (typecode == -1) { + /* Resolve through fallback then populate cache */ + typecode = typecode_fallback_keep_ref(dispatcher, aryscalar); + cache_typecode(descr, typecode); + } + Py_DECREF(descr); + return typecode; + } + + /* Is it one of the well-known basic types? */ + typecode = dtype_num_to_typecode(descr->type_num); + Py_DECREF(descr); + if (typecode == -1) + return typecode_using_fingerprint(dispatcher, aryscalar); + return BASIC_TYPECODES[typecode]; +} + +static +int typecode_devicendarray(PyObject *dispatcher, PyObject *ary) +{ + int typecode; + int dtype; + int ndim; + int layout = 0; + + PyObject* flags = PyObject_GetAttrString(ary, "flags"); + if (flags == NULL) + { + PyErr_Clear(); + goto FALLBACK; + } + + if (PyDict_GetItemString(flags, "C_CONTIGUOUS") == Py_True) { + layout = 1; + } else if (PyDict_GetItemString(flags, "F_CONTIGUOUS") == Py_True) { + layout = 2; + } + + Py_DECREF(flags); + + PyObject *ndim_obj = PyObject_GetAttrString(ary, "ndim"); + if (ndim_obj == NULL) { + /* If there's no ndim, try to proceed by clearing the error and using the + * fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + ndim = PyLong_AsLong(ndim_obj); + Py_DECREF(ndim_obj); + + if (PyErr_Occurred()) { + /* ndim wasn't an integer for some reason - unlikely to happen, but try + * the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + if (ndim <= 0 || ndim > N_NDIM) + goto FALLBACK; + + PyObject* dtype_obj = PyObject_GetAttrString(ary, "dtype"); + if (dtype_obj == NULL) { + /* No dtype: try the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + PyObject* num_obj = PyObject_GetAttrString(dtype_obj, "num"); + Py_DECREF(dtype_obj); + + if (num_obj == NULL) { + /* This strange dtype has no num - try the fallback. 
*/ + PyErr_Clear(); + goto FALLBACK; + } + + int dtype_num = PyLong_AsLong(num_obj); /* value of the dtype's "num" attribute */ + Py_DECREF(num_obj); + + if (PyErr_Occurred()) { + /* num wasn't an integer for some reason - unlikely to happen, but try + * the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + dtype = dtype_num_to_typecode(dtype_num); /* index into the last axis of cached_arycode, or -1 */ + if (dtype == -1) { + /* Not a dtype we have in the global lookup table. */ + goto FALLBACK; + } + + /* Fast path, using direct table lookup */ + assert(layout < N_LAYOUT); + assert(ndim <= N_NDIM); + assert(dtype < N_DTYPES); + typecode = cached_arycode[ndim - 1][layout][dtype]; /* same table typecode_ndarray() reads; ndim is 1-based, the axis 0-based */ + + if (typecode == -1) { + /* First use of this table entry, so it requires populating */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cached_arycode[ndim - 1][layout][dtype] = typecode; /* if the fallback returned -1 the entry simply stays unpopulated */ + } + + return typecode; + +FALLBACK: + /* Slower path, for non-trivial array types. At present this always uses + the fingerprinting to get the typecode. Future optimization might + implement a cache, but this would require some fast equivalent of + PyArray_DESCR for a device array. */ + + return typecode_using_fingerprint(dispatcher, (PyObject *) ary); +} + +int +typeof_typecode(PyObject *dispatcher, PyObject *val) +{ /* Map val to a typecode: exact builtin number types first, then NumPy scalars, ndarrays, device arrays and ndarray subtypes, with fingerprinting as the last resort. */ + PyTypeObject *tyobj = Py_TYPE(val); + int subtype_attr; /* result of probing val for __numba_array_subtype_dispatch__ */ + /* This needs to be kept in sync with Dispatcher.typeof_pyval(), + * otherwise funny things may happen. 
+ */ + if (tyobj == &PyInt_Type || tyobj == &PyLong_Type) { /* exact type match only: subclasses of int fall through to the later branches */ +#if SIZEOF_VOID_P < 8 + /* On 32-bit platforms, choose between tc_intp (32-bit) and tc_int64 */ + PY_LONG_LONG ll = PyLong_AsLongLong(val); + if (ll == -1 && PyErr_Occurred()) { + /* The integer is too large, let us truncate it */ + PyErr_Clear(); + return tc_int64; + } + if ((ll & 0xffffffff) != ll) /* NOTE(review): this mask test also sends every negative value to tc_int64 and keeps 2**31..2**32-1 as tc_intp - confirm this matches Dispatcher.typeof_pyval() */ + return tc_int64; +#endif + return tc_intp; + } + else if (tyobj == &PyFloat_Type) + return tc_float64; + else if (tyobj == &PyComplex_Type) + return tc_complex128; + /* Array scalar handling */ + else if (PyArray_CheckScalar(val)) { + return typecode_arrayscalar(dispatcher, val); + } + /* Array handling */ + else if (tyobj == &PyArray_Type) { /* exact ndarray only; subclasses are handled by the PyType_IsSubtype branch below */ + return typecode_ndarray(dispatcher, (PyArrayObject*)val); + } + /* Subtype of CUDA device array */ + else if (PyType_IsSubtype(tyobj, &DeviceArrayType)) { + return typecode_devicendarray(dispatcher, val); + } + /* Subtypes of Array handling */ + else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + /* By default, Numba will treat all numpy.ndarray subtypes as if they + were the base numpy.ndarray type. In this way, ndarray subtypes + can easily use all of the support that Numba has for ndarray + methods. + EXPERIMENTAL: There may be cases where a programmer would NOT want + ndarray subtypes to be treated exactly like the base numpy.ndarray. + For this purpose, a currently experimental feature allows a + programmer to add an attribute named + __numba_array_subtype_dispatch__ to their ndarray subtype. This + attribute can have any value as Numba only checks for the presence + of the attribute and not its value. When present, a ndarray subtype + will NOT be typed by Numba as a regular ndarray but this code will + fall through to the typecode_using_fingerprint call, which will + create a new unique Numba typecode for this ndarray subtype. This + behavior has several significant effects. 
First, since this + ndarray subtype will be treated as a different type by Numba, + the Numba dispatcher would then specialize on this type. So, if + there was a function that had several parameters that were + expected to be either numpy.ndarray or a subtype of ndarray, then + Numba would compile a custom version of this function for each + combination of base and subtypes that were actually passed to the + function. Second, because this subtype would now be treated as + a totally separate type, it will cease to function in Numba unless + an implementation of that type is provided to Numba through the + Numba type extension mechanisms (e.g., overload). This would + typically start with defining a Numba type corresponding to the + ndarray subtype. This is the same concept as how Numba has a + counterpart of numpy.ndarray in its type system, types.Array. + Next, one would typically define boxing and unboxing routines + and the associated memory model. Then, overloads for NumPy + functions on that type would be created. However, + if the same default array memory model is used then there are tricks + one can do to look at Numba's internal types.Array registries and + to quickly apply those to the subtype as well. In this manner, + new custom functions need to be written only for those cases where + the behavior of the base ndarray and the ndarray subtype differs. + Finally, + after adding support for the new type, you would have a separate + ndarray subtype that could operate with other objects of the same + subtype but would not support interoperation with regular NumPy + ndarrays. In standard Python, this interoperation is provided + through the __array_ufunc__ magic method in the ndarray subtype + class and in that case the function operates on ndarrays or their + subtypes. This idea is extended into Numba such that + __array_ufunc__ can be present in a Numba array type object. 
+ In this case, this function is consulted during Numba typing and + so the arguments to __array_ufunc__ are Numba types instead of + ndarray subtypes. The array type __array_ufunc__ returns the + type of the output of the given ufunc. + */ + subtype_attr = PyObject_HasAttrString(val, "__numba_array_subtype_dispatch__"); /* presence check only; the attribute's value is never read */ + if (!subtype_attr) { + return typecode_ndarray(dispatcher, (PyArrayObject*)val); + } + } + + return typecode_using_fingerprint(dispatcher, val); /* every value not matched above, including ndarray subtypes that carry the opt-out attribute */ +} + + +static +void* wrap_import_array(void) { + import_array(); /* import array returns NULL on failure */ + return (void*)1; /* non-NULL marker so that init_numpy() can tell success from failure */ +} + + +static +int init_numpy(void) { + return wrap_import_array() != NULL; /* 1 when the wrapper returned its marker, 0 when it returned NULL */ +} + + +/* + * typeof_init(omittedarg_type, typecode_dict) + * (called from dispatcher.py to fill in missing information) + */ +PyObject * +typeof_init(PyObject *self, PyObject *args) +{ + PyObject *tmpobj; /* scratch slot for the dict lookups done by UNWRAP_TYPE */ + PyObject *dict; /* the typecode_dict argument */ + int index = 0; /* next slot of BASIC_TYPECODES to fill */ + + if (!PyArg_ParseTuple(args, "O!O!:typeof_init", + &PyType_Type, &omittedarg_type, + &PyDict_Type, &dict)) + return NULL; + + /* Initialize Numpy API */ + if ( ! 
init_numpy() ) { + return NULL; + } + + #define UNWRAP_TYPE(S) \ + if(!(tmpobj = PyDict_GetItemString(dict, #S))) return NULL; \ + else { tc_##S = PyLong_AsLong(tmpobj); \ + BASIC_TYPECODES[index++] = tc_##S; } + + UNWRAP_TYPE(int8) + UNWRAP_TYPE(int16) + UNWRAP_TYPE(int32) + UNWRAP_TYPE(int64) + + UNWRAP_TYPE(uint8) + UNWRAP_TYPE(uint16) + UNWRAP_TYPE(uint32) + UNWRAP_TYPE(uint64) + + UNWRAP_TYPE(float32) + UNWRAP_TYPE(float64) + + UNWRAP_TYPE(complex64) + UNWRAP_TYPE(complex128) + + switch(sizeof(void*)) { + case 4: + tc_intp = tc_int32; + break; + case 8: + tc_intp = tc_int64; + break; + default: + PyErr_SetString(PyExc_AssertionError, "sizeof(void*) != {4, 8}"); + return NULL; + } + + #undef UNWRAP_TYPE + + typecache = PyDict_New(); + ndarray_typecache = PyDict_New(); + structured_dtypes = PyDict_New(); + if (typecache == NULL || ndarray_typecache == NULL || + structured_dtypes == NULL) { + PyErr_SetString(PyExc_RuntimeError, "failed to create type cache"); + return NULL; + } + + fingerprint_hashtable = _Numba_hashtable_new(sizeof(int), + hash_writer, + compare_writer); + if (fingerprint_hashtable == NULL) { + PyErr_NoMemory(); + return NULL; + } + + /* initialize cached_arycode to all ones (in bits) */ + memset(cached_arycode, 0xFF, sizeof(cached_arycode)); + + str_typeof_pyval = PyString_InternFromString("typeof_pyval"); + str_value = PyString_InternFromString("value"); + str_numba_type = PyString_InternFromString("_numba_type_"); + if (!str_value || !str_typeof_pyval || !str_numba_type) + return NULL; + + Py_RETURN_NONE; +} diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_typeof.h b/cv/3d_detection/centerpoint/pytorch/numba/numba/_typeof.h new file mode 100644 index 0000000000000000000000000000000000000000..6e0039b5f3814dcb6b666ddc767019b815781742 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_typeof.h @@ -0,0 +1,16 @@ +#ifndef NUMBA_TYPEOF_H_ +#define NUMBA_TYPEOF_H_ + +#ifdef __cplusplus + extern "C" { +#endif + +extern 
PyObject *typeof_init(PyObject *self, PyObject *args); +extern int typeof_typecode(PyObject *dispatcher, PyObject *val); +extern PyObject *typeof_compute_fingerprint(PyObject *val); + +#ifdef __cplusplus + } +#endif + +#endif /* NUMBA_TYPEOF_H_ */ diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_unicodetype_db.h b/cv/3d_detection/centerpoint/pytorch/numba/numba/_unicodetype_db.h new file mode 100644 index 0000000000000000000000000000000000000000..d4dca060d776ed479d272cdd7514d95a54839724 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_unicodetype_db.h @@ -0,0 +1,6091 @@ +/* This file is from CPython: + * https://github.com/python/cpython/blob/3.7/Objects/unicodetype_db.h + * As of Commit SHA: 1d4b6ba19466aba0eb91c4ba01ba509acf18c723 + * + * Changes made include: + * - Renaming all functions and structures with a `numba` prefix to prevent + * collisions. + * + * NOTE: Numba devs, this may need updating from time to time as the unicode + * standard is updated. + */ + +#ifndef _UNICODETYPE_DB_H +#define _UNICODETYPE_DB_H + +/*Py_UCS4 definition from Include/unicodeobject.h */ +#define Py_UCS4 uint32_t + +typedef struct { + /* + These are either deltas to the character or offsets in + _PyUnicode_ExtendedCase. + */ + const int upper; + const int lower; + const int title; + /* Note if more flag space is needed, decimal and digit could be unified. 
*/ + const unsigned char decimal; + const unsigned char digit; + const unsigned short flags; +} numba_PyUnicode_TypeRecord; + +/* -------------------------------------------------------------------------- */ +/* CPython unicodetype_db.h definitions start here */ +/* -------------------------------------------------------------------------- */ + +/* this file was generated by Tools/unicode/makeunicodedata.py 3.2 */ + +/* a list of unique character type descriptors */ +const numba_PyUnicode_TypeRecord numba_PyUnicode_TypeRecords[] = { + {0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 32}, + {0, 0, 0, 0, 0, 48}, + {0, 0, 0, 0, 0, 1056}, + {0, 0, 0, 0, 0, 1024}, + {0, 0, 0, 0, 0, 5120}, + {0, 0, 0, 0, 0, 3590}, + {0, 0, 0, 1, 1, 3590}, + {0, 0, 0, 2, 2, 3590}, + {0, 0, 0, 3, 3, 3590}, + {0, 0, 0, 4, 4, 3590}, + {0, 0, 0, 5, 5, 3590}, + {0, 0, 0, 6, 6, 3590}, + {0, 0, 0, 7, 7, 3590}, + {0, 0, 0, 8, 8, 3590}, + {0, 0, 0, 9, 9, 3590}, + {0, 32, 0, 0, 0, 10113}, + {0, 0, 0, 0, 0, 1536}, + {-32, 0, -32, 0, 0, 9993}, + {0, 0, 0, 0, 0, 9993}, + {0, 0, 0, 0, 0, 4096}, + {0, 0, 0, 0, 2, 3076}, + {0, 0, 0, 0, 3, 3076}, + {16777218, 17825792, 16777218, 0, 0, 26377}, + {0, 0, 0, 0, 0, 5632}, + {0, 0, 0, 0, 1, 3076}, + {0, 0, 0, 0, 0, 3072}, + {33554438, 18874371, 33554440, 0, 0, 26377}, + {121, 0, 121, 0, 0, 9993}, + {0, 1, 0, 0, 0, 10113}, + {-1, 0, -1, 0, 0, 9993}, + {16777228, 33554442, 16777228, 0, 0, 26497}, + {-232, 0, -232, 0, 0, 9993}, + {33554448, 18874381, 33554448, 0, 0, 26377}, + {0, -121, 0, 0, 0, 10113}, + {16777236, 17825810, 16777236, 0, 0, 26377}, + {195, 0, 195, 0, 0, 9993}, + {0, 210, 0, 0, 0, 10113}, + {0, 206, 0, 0, 0, 10113}, + {0, 205, 0, 0, 0, 10113}, + {0, 79, 0, 0, 0, 10113}, + {0, 202, 0, 0, 0, 10113}, + {0, 203, 0, 0, 0, 10113}, + {0, 207, 0, 0, 0, 10113}, + {97, 0, 97, 0, 0, 9993}, + {0, 211, 0, 0, 0, 10113}, + {0, 209, 0, 0, 0, 10113}, + {163, 0, 163, 0, 0, 9993}, + {0, 213, 0, 0, 0, 10113}, + {130, 0, 130, 0, 0, 9993}, + {0, 214, 0, 0, 0, 
10113}, + {0, 218, 0, 0, 0, 10113}, + {0, 217, 0, 0, 0, 10113}, + {0, 219, 0, 0, 0, 10113}, + {0, 0, 0, 0, 0, 1793}, + {56, 0, 56, 0, 0, 9993}, + {0, 2, 1, 0, 0, 10113}, + {-1, 1, 0, 0, 0, 10049}, + {-2, 0, -1, 0, 0, 9993}, + {-79, 0, -79, 0, 0, 9993}, + {33554456, 18874389, 33554456, 0, 0, 26377}, + {0, -97, 0, 0, 0, 10113}, + {0, -56, 0, 0, 0, 10113}, + {0, -130, 0, 0, 0, 10113}, + {0, 10795, 0, 0, 0, 10113}, + {0, -163, 0, 0, 0, 10113}, + {0, 10792, 0, 0, 0, 10113}, + {10815, 0, 10815, 0, 0, 9993}, + {0, -195, 0, 0, 0, 10113}, + {0, 69, 0, 0, 0, 10113}, + {0, 71, 0, 0, 0, 10113}, + {10783, 0, 10783, 0, 0, 9993}, + {10780, 0, 10780, 0, 0, 9993}, + {10782, 0, 10782, 0, 0, 9993}, + {-210, 0, -210, 0, 0, 9993}, + {-206, 0, -206, 0, 0, 9993}, + {-205, 0, -205, 0, 0, 9993}, + {-202, 0, -202, 0, 0, 9993}, + {-203, 0, -203, 0, 0, 9993}, + {42319, 0, 42319, 0, 0, 9993}, + {42315, 0, 42315, 0, 0, 9993}, + {-207, 0, -207, 0, 0, 9993}, + {42280, 0, 42280, 0, 0, 9993}, + {42308, 0, 42308, 0, 0, 9993}, + {-209, 0, -209, 0, 0, 9993}, + {-211, 0, -211, 0, 0, 9993}, + {10743, 0, 10743, 0, 0, 9993}, + {42305, 0, 42305, 0, 0, 9993}, + {10749, 0, 10749, 0, 0, 9993}, + {-213, 0, -213, 0, 0, 9993}, + {-214, 0, -214, 0, 0, 9993}, + {10727, 0, 10727, 0, 0, 9993}, + {-218, 0, -218, 0, 0, 9993}, + {42282, 0, 42282, 0, 0, 9993}, + {-69, 0, -69, 0, 0, 9993}, + {-217, 0, -217, 0, 0, 9993}, + {-71, 0, -71, 0, 0, 9993}, + {-219, 0, -219, 0, 0, 9993}, + {42261, 0, 42261, 0, 0, 9993}, + {42258, 0, 42258, 0, 0, 9993}, + {0, 0, 0, 0, 0, 14089}, + {0, 0, 0, 0, 0, 5889}, + {16777244, 17825818, 16777244, 0, 0, 30216}, + {0, 0, 0, 0, 0, 13321}, + {0, 116, 0, 0, 0, 10113}, + {0, 38, 0, 0, 0, 10113}, + {0, 37, 0, 0, 0, 10113}, + {0, 64, 0, 0, 0, 10113}, + {0, 63, 0, 0, 0, 10113}, + {50331681, 19922973, 50331681, 0, 0, 26377}, + {-38, 0, -38, 0, 0, 9993}, + {-37, 0, -37, 0, 0, 9993}, + {50331688, 19922980, 50331688, 0, 0, 26377}, + {16777261, 17825835, 16777261, 0, 0, 26377}, + {-64, 0, -64, 0, 0, 
9993}, + {-63, 0, -63, 0, 0, 9993}, + {0, 8, 0, 0, 0, 10113}, + {16777264, 17825838, 16777264, 0, 0, 26377}, + {16777267, 17825841, 16777267, 0, 0, 26377}, + {0, 0, 0, 0, 0, 10113}, + {16777270, 17825844, 16777270, 0, 0, 26377}, + {16777273, 17825847, 16777273, 0, 0, 26377}, + {-8, 0, -8, 0, 0, 9993}, + {16777276, 17825850, 16777276, 0, 0, 26377}, + {16777279, 17825853, 16777279, 0, 0, 26377}, + {7, 0, 7, 0, 0, 9993}, + {-116, 0, -116, 0, 0, 9993}, + {0, -60, 0, 0, 0, 10113}, + {16777282, 17825856, 16777282, 0, 0, 26377}, + {0, -7, 0, 0, 0, 10113}, + {0, 80, 0, 0, 0, 10113}, + {-80, 0, -80, 0, 0, 9993}, + {0, 15, 0, 0, 0, 10113}, + {-15, 0, -15, 0, 0, 9993}, + {0, 48, 0, 0, 0, 10113}, + {-48, 0, -48, 0, 0, 9993}, + {33554502, 18874435, 33554504, 0, 0, 26377}, + {0, 0, 0, 0, 0, 1537}, + {0, 7264, 0, 0, 0, 10113}, + {3008, 0, 0, 0, 0, 9993}, + {0, 0, 0, 0, 1, 3588}, + {0, 0, 0, 0, 2, 3588}, + {0, 0, 0, 0, 3, 3588}, + {0, 0, 0, 0, 4, 3588}, + {0, 0, 0, 0, 5, 3588}, + {0, 0, 0, 0, 6, 3588}, + {0, 0, 0, 0, 7, 3588}, + {0, 0, 0, 0, 8, 3588}, + {0, 0, 0, 0, 9, 3588}, + {16777292, 17825866, 16777292, 0, 0, 26497}, + {16777295, 17825869, 16777295, 0, 0, 26497}, + {16777298, 17825872, 16777298, 0, 0, 26497}, + {16777301, 17825875, 16777301, 0, 0, 26497}, + {16777304, 17825878, 16777304, 0, 0, 26497}, + {16777307, 17825881, 16777307, 0, 0, 26497}, + {16777310, 17825884, 16777310, 0, 0, 26497}, + {16777313, 17825887, 16777313, 0, 0, 26497}, + {16777316, 17825890, 16777316, 0, 0, 26497}, + {16777319, 17825893, 16777319, 0, 0, 26497}, + {16777322, 17825896, 16777322, 0, 0, 26497}, + {16777325, 17825899, 16777325, 0, 0, 26497}, + {16777328, 17825902, 16777328, 0, 0, 26497}, + {16777331, 17825905, 16777331, 0, 0, 26497}, + {16777334, 17825908, 16777334, 0, 0, 26497}, + {16777337, 17825911, 16777337, 0, 0, 26497}, + {16777340, 17825914, 16777340, 0, 0, 26497}, + {16777343, 17825917, 16777343, 0, 0, 26497}, + {16777346, 17825920, 16777346, 0, 0, 26497}, + {16777349, 17825923, 
16777349, 0, 0, 26497}, + {16777352, 17825926, 16777352, 0, 0, 26497}, + {16777355, 17825929, 16777355, 0, 0, 26497}, + {16777358, 17825932, 16777358, 0, 0, 26497}, + {16777361, 17825935, 16777361, 0, 0, 26497}, + {16777364, 17825938, 16777364, 0, 0, 26497}, + {16777367, 17825941, 16777367, 0, 0, 26497}, + {16777370, 17825944, 16777370, 0, 0, 26497}, + {16777373, 17825947, 16777373, 0, 0, 26497}, + {16777376, 17825950, 16777376, 0, 0, 26497}, + {16777379, 17825953, 16777379, 0, 0, 26497}, + {16777382, 17825956, 16777382, 0, 0, 26497}, + {16777385, 17825959, 16777385, 0, 0, 26497}, + {16777388, 17825962, 16777388, 0, 0, 26497}, + {16777391, 17825965, 16777391, 0, 0, 26497}, + {16777394, 17825968, 16777394, 0, 0, 26497}, + {16777397, 17825971, 16777397, 0, 0, 26497}, + {16777400, 17825974, 16777400, 0, 0, 26497}, + {16777403, 17825977, 16777403, 0, 0, 26497}, + {16777406, 17825980, 16777406, 0, 0, 26497}, + {16777409, 17825983, 16777409, 0, 0, 26497}, + {16777412, 17825986, 16777412, 0, 0, 26497}, + {16777415, 17825989, 16777415, 0, 0, 26497}, + {16777418, 17825992, 16777418, 0, 0, 26497}, + {16777421, 17825995, 16777421, 0, 0, 26497}, + {16777424, 17825998, 16777424, 0, 0, 26497}, + {16777427, 17826001, 16777427, 0, 0, 26497}, + {16777430, 17826004, 16777430, 0, 0, 26497}, + {16777433, 17826007, 16777433, 0, 0, 26497}, + {16777436, 17826010, 16777436, 0, 0, 26497}, + {16777439, 17826013, 16777439, 0, 0, 26497}, + {16777442, 17826016, 16777442, 0, 0, 26497}, + {16777445, 17826019, 16777445, 0, 0, 26497}, + {16777448, 17826022, 16777448, 0, 0, 26497}, + {16777451, 17826025, 16777451, 0, 0, 26497}, + {16777454, 17826028, 16777454, 0, 0, 26497}, + {16777457, 17826031, 16777457, 0, 0, 26497}, + {16777460, 17826034, 16777460, 0, 0, 26497}, + {16777463, 17826037, 16777463, 0, 0, 26497}, + {16777466, 17826040, 16777466, 0, 0, 26497}, + {16777469, 17826043, 16777469, 0, 0, 26497}, + {16777472, 17826046, 16777472, 0, 0, 26497}, + {16777475, 17826049, 16777475, 0, 0, 26497}, + 
{16777478, 17826052, 16777478, 0, 0, 26497}, + {16777481, 17826055, 16777481, 0, 0, 26497}, + {16777484, 17826058, 16777484, 0, 0, 26497}, + {16777487, 17826061, 16777487, 0, 0, 26497}, + {16777490, 17826064, 16777490, 0, 0, 26497}, + {16777493, 17826067, 16777493, 0, 0, 26497}, + {16777496, 17826070, 16777496, 0, 0, 26497}, + {16777499, 17826073, 16777499, 0, 0, 26497}, + {16777502, 17826076, 16777502, 0, 0, 26497}, + {16777505, 17826079, 16777505, 0, 0, 26497}, + {16777508, 17826082, 16777508, 0, 0, 26497}, + {16777511, 17826085, 16777511, 0, 0, 26497}, + {16777514, 17826088, 16777514, 0, 0, 26497}, + {16777517, 17826091, 16777517, 0, 0, 26497}, + {16777520, 17826094, 16777520, 0, 0, 26497}, + {16777523, 17826097, 16777523, 0, 0, 26497}, + {16777526, 17826100, 16777526, 0, 0, 26497}, + {16777529, 17826103, 16777529, 0, 0, 26497}, + {16777532, 17826106, 16777532, 0, 0, 26497}, + {16777535, 17826109, 16777535, 0, 0, 26497}, + {16777538, 17826112, 16777538, 0, 0, 26497}, + {16777541, 17826115, 16777541, 0, 0, 26497}, + {16777544, 17826118, 16777544, 0, 0, 26497}, + {16777547, 17826121, 16777547, 0, 0, 26497}, + {16777550, 17826124, 16777550, 0, 0, 26377}, + {16777553, 17826127, 16777553, 0, 0, 26377}, + {16777556, 17826130, 16777556, 0, 0, 26377}, + {16777559, 17826133, 16777559, 0, 0, 26377}, + {16777562, 17826136, 16777562, 0, 0, 26377}, + {16777565, 17826139, 16777565, 0, 0, 26377}, + {0, 0, 0, 0, 0, 3840}, + {0, 0, 0, 0, 0, 5888}, + {16777568, 17826142, 16777568, 0, 0, 26377}, + {16777571, 17826145, 16777571, 0, 0, 26377}, + {16777574, 17826148, 16777574, 0, 0, 26377}, + {16777577, 17826151, 16777577, 0, 0, 26377}, + {16777580, 17826154, 16777580, 0, 0, 26377}, + {16777583, 17826157, 16777583, 0, 0, 26377}, + {16777586, 17826160, 16777586, 0, 0, 26377}, + {16777589, 17826163, 16777589, 0, 0, 26377}, + {16777592, 17826166, 16777592, 0, 0, 26377}, + {0, -3008, 0, 0, 0, 10113}, + {35332, 0, 35332, 0, 0, 9993}, + {3814, 0, 3814, 0, 0, 9993}, + {33554812, 18874745, 
33554812, 0, 0, 26377}, + {33554817, 18874750, 33554817, 0, 0, 26377}, + {33554822, 18874755, 33554822, 0, 0, 26377}, + {33554827, 18874760, 33554827, 0, 0, 26377}, + {33554832, 18874765, 33554832, 0, 0, 26377}, + {16777620, 17826194, 16777620, 0, 0, 26377}, + {16777624, 18874773, 16777624, 0, 0, 26497}, + {8, 0, 8, 0, 0, 9993}, + {0, -8, 0, 0, 0, 10113}, + {33554844, 18874777, 33554844, 0, 0, 26377}, + {50332066, 19923358, 50332066, 0, 0, 26377}, + {50332073, 19923365, 50332073, 0, 0, 26377}, + {50332080, 19923372, 50332080, 0, 0, 26377}, + {74, 0, 74, 0, 0, 9993}, + {86, 0, 86, 0, 0, 9993}, + {100, 0, 100, 0, 0, 9993}, + {128, 0, 128, 0, 0, 9993}, + {112, 0, 112, 0, 0, 9993}, + {126, 0, 126, 0, 0, 9993}, + {33554870, 18874803, 16777656, 0, 0, 26377}, + {33554876, 18874809, 16777662, 0, 0, 26377}, + {33554882, 18874815, 16777668, 0, 0, 26377}, + {33554888, 18874821, 16777674, 0, 0, 26377}, + {33554894, 18874827, 16777680, 0, 0, 26377}, + {33554900, 18874833, 16777686, 0, 0, 26377}, + {33554906, 18874839, 16777692, 0, 0, 26377}, + {33554912, 18874845, 16777698, 0, 0, 26377}, + {33554918, 18874851, 16777704, 0, 0, 26433}, + {33554924, 18874857, 16777710, 0, 0, 26433}, + {33554930, 18874863, 16777716, 0, 0, 26433}, + {33554936, 18874869, 16777722, 0, 0, 26433}, + {33554942, 18874875, 16777728, 0, 0, 26433}, + {33554948, 18874881, 16777734, 0, 0, 26433}, + {33554954, 18874887, 16777740, 0, 0, 26433}, + {33554960, 18874893, 16777746, 0, 0, 26433}, + {33554966, 18874899, 16777752, 0, 0, 26377}, + {33554972, 18874905, 16777758, 0, 0, 26377}, + {33554978, 18874911, 16777764, 0, 0, 26377}, + {33554984, 18874917, 16777770, 0, 0, 26377}, + {33554990, 18874923, 16777776, 0, 0, 26377}, + {33554996, 18874929, 16777782, 0, 0, 26377}, + {33555002, 18874935, 16777788, 0, 0, 26377}, + {33555008, 18874941, 16777794, 0, 0, 26377}, + {33555014, 18874947, 16777800, 0, 0, 26433}, + {33555020, 18874953, 16777806, 0, 0, 26433}, + {33555026, 18874959, 16777812, 0, 0, 26433}, + {33555032, 
18874965, 16777818, 0, 0, 26433}, + {33555038, 18874971, 16777824, 0, 0, 26433}, + {33555044, 18874977, 16777830, 0, 0, 26433}, + {33555050, 18874983, 16777836, 0, 0, 26433}, + {33555056, 18874989, 16777842, 0, 0, 26433}, + {33555062, 18874995, 16777848, 0, 0, 26377}, + {33555068, 18875001, 16777854, 0, 0, 26377}, + {33555074, 18875007, 16777860, 0, 0, 26377}, + {33555080, 18875013, 16777866, 0, 0, 26377}, + {33555086, 18875019, 16777872, 0, 0, 26377}, + {33555092, 18875025, 16777878, 0, 0, 26377}, + {33555098, 18875031, 16777884, 0, 0, 26377}, + {33555104, 18875037, 16777890, 0, 0, 26377}, + {33555110, 18875043, 16777896, 0, 0, 26433}, + {33555116, 18875049, 16777902, 0, 0, 26433}, + {33555122, 18875055, 16777908, 0, 0, 26433}, + {33555128, 18875061, 16777914, 0, 0, 26433}, + {33555134, 18875067, 16777920, 0, 0, 26433}, + {33555140, 18875073, 16777926, 0, 0, 26433}, + {33555146, 18875079, 16777932, 0, 0, 26433}, + {33555152, 18875085, 16777938, 0, 0, 26433}, + {33555158, 18875091, 33555160, 0, 0, 26377}, + {33555165, 18875098, 16777951, 0, 0, 26377}, + {33555171, 18875104, 33555173, 0, 0, 26377}, + {33555178, 18875111, 33555178, 0, 0, 26377}, + {50332400, 19923692, 50332403, 0, 0, 26377}, + {0, -74, 0, 0, 0, 10113}, + {33555193, 18875126, 16777979, 0, 0, 26433}, + {16777982, 17826556, 16777982, 0, 0, 26377}, + {33555202, 18875135, 33555204, 0, 0, 26377}, + {33555209, 18875142, 16777995, 0, 0, 26377}, + {33555215, 18875148, 33555217, 0, 0, 26377}, + {33555222, 18875155, 33555222, 0, 0, 26377}, + {50332444, 19923736, 50332447, 0, 0, 26377}, + {0, -86, 0, 0, 0, 10113}, + {33555237, 18875170, 16778023, 0, 0, 26433}, + {50332460, 19923752, 50332460, 0, 0, 26377}, + {50332467, 19923759, 50332467, 0, 0, 26377}, + {33555257, 18875190, 33555257, 0, 0, 26377}, + {50332479, 19923771, 50332479, 0, 0, 26377}, + {0, -100, 0, 0, 0, 10113}, + {50332486, 19923778, 50332486, 0, 0, 26377}, + {50332493, 19923785, 50332493, 0, 0, 26377}, + {33555283, 18875216, 33555283, 0, 0, 26377}, 
+ {33555288, 18875221, 33555288, 0, 0, 26377}, + {50332510, 19923802, 50332510, 0, 0, 26377}, + {0, -112, 0, 0, 0, 10113}, + {33555300, 18875233, 33555302, 0, 0, 26377}, + {33555307, 18875240, 16778093, 0, 0, 26377}, + {33555313, 18875246, 33555315, 0, 0, 26377}, + {33555320, 18875253, 33555320, 0, 0, 26377}, + {50332542, 19923834, 50332545, 0, 0, 26377}, + {0, -128, 0, 0, 0, 10113}, + {0, -126, 0, 0, 0, 10113}, + {33555335, 18875268, 16778121, 0, 0, 26433}, + {0, 0, 0, 0, 0, 3076}, + {0, 0, 0, 0, 4, 3076}, + {0, 0, 0, 0, 5, 3076}, + {0, 0, 0, 0, 6, 3076}, + {0, 0, 0, 0, 7, 3076}, + {0, 0, 0, 0, 8, 3076}, + {0, 0, 0, 0, 9, 3076}, + {0, 0, 0, 0, 0, 1792}, + {0, -7517, 0, 0, 0, 10113}, + {0, -8383, 0, 0, 0, 10113}, + {0, -8262, 0, 0, 0, 10113}, + {0, 28, 0, 0, 0, 10113}, + {-28, 0, -28, 0, 0, 9993}, + {0, 16, 0, 0, 0, 12160}, + {-16, 0, -16, 0, 0, 12040}, + {0, 26, 0, 0, 0, 9344}, + {-26, 0, -26, 0, 0, 9224}, + {0, -10743, 0, 0, 0, 10113}, + {0, -3814, 0, 0, 0, 10113}, + {0, -10727, 0, 0, 0, 10113}, + {-10795, 0, -10795, 0, 0, 9993}, + {-10792, 0, -10792, 0, 0, 9993}, + {0, -10780, 0, 0, 0, 10113}, + {0, -10749, 0, 0, 0, 10113}, + {0, -10783, 0, 0, 0, 10113}, + {0, -10782, 0, 0, 0, 10113}, + {0, -10815, 0, 0, 0, 10113}, + {-7264, 0, -7264, 0, 0, 9993}, + {0, 0, 0, 0, 0, 5121}, + {0, 0, 0, 0, 0, 3841}, + {0, -35332, 0, 0, 0, 10113}, + {0, -42280, 0, 0, 0, 10113}, + {0, -42308, 0, 0, 0, 10113}, + {0, -42319, 0, 0, 0, 10113}, + {0, -42315, 0, 0, 0, 10113}, + {0, -42305, 0, 0, 0, 10113}, + {0, -42258, 0, 0, 0, 10113}, + {0, -42282, 0, 0, 0, 10113}, + {0, -42261, 0, 0, 0, 10113}, + {0, 928, 0, 0, 0, 10113}, + {-928, 0, -928, 0, 0, 9993}, + {16778124, 17826698, 16778124, 0, 0, 26377}, + {16778127, 17826701, 16778127, 0, 0, 26377}, + {16778130, 17826704, 16778130, 0, 0, 26377}, + {16778133, 17826707, 16778133, 0, 0, 26377}, + {16778136, 17826710, 16778136, 0, 0, 26377}, + {16778139, 17826713, 16778139, 0, 0, 26377}, + {16778142, 17826716, 16778142, 0, 0, 26377}, + 
{16778145, 17826719, 16778145, 0, 0, 26377}, + {16778148, 17826722, 16778148, 0, 0, 26377}, + {16778151, 17826725, 16778151, 0, 0, 26377}, + {16778154, 17826728, 16778154, 0, 0, 26377}, + {16778157, 17826731, 16778157, 0, 0, 26377}, + {16778160, 17826734, 16778160, 0, 0, 26377}, + {16778163, 17826737, 16778163, 0, 0, 26377}, + {16778166, 17826740, 16778166, 0, 0, 26377}, + {16778169, 17826743, 16778169, 0, 0, 26377}, + {16778172, 17826746, 16778172, 0, 0, 26377}, + {16778175, 17826749, 16778175, 0, 0, 26377}, + {16778178, 17826752, 16778178, 0, 0, 26377}, + {16778181, 17826755, 16778181, 0, 0, 26377}, + {16778184, 17826758, 16778184, 0, 0, 26377}, + {16778187, 17826761, 16778187, 0, 0, 26377}, + {16778190, 17826764, 16778190, 0, 0, 26377}, + {16778193, 17826767, 16778193, 0, 0, 26377}, + {16778196, 17826770, 16778196, 0, 0, 26377}, + {16778199, 17826773, 16778199, 0, 0, 26377}, + {16778202, 17826776, 16778202, 0, 0, 26377}, + {16778205, 17826779, 16778205, 0, 0, 26377}, + {16778208, 17826782, 16778208, 0, 0, 26377}, + {16778211, 17826785, 16778211, 0, 0, 26377}, + {16778214, 17826788, 16778214, 0, 0, 26377}, + {16778217, 17826791, 16778217, 0, 0, 26377}, + {16778220, 17826794, 16778220, 0, 0, 26377}, + {16778223, 17826797, 16778223, 0, 0, 26377}, + {16778226, 17826800, 16778226, 0, 0, 26377}, + {16778229, 17826803, 16778229, 0, 0, 26377}, + {16778232, 17826806, 16778232, 0, 0, 26377}, + {16778235, 17826809, 16778235, 0, 0, 26377}, + {16778238, 17826812, 16778238, 0, 0, 26377}, + {16778241, 17826815, 16778241, 0, 0, 26377}, + {16778244, 17826818, 16778244, 0, 0, 26377}, + {16778247, 17826821, 16778247, 0, 0, 26377}, + {16778250, 17826824, 16778250, 0, 0, 26377}, + {16778253, 17826827, 16778253, 0, 0, 26377}, + {16778256, 17826830, 16778256, 0, 0, 26377}, + {16778259, 17826833, 16778259, 0, 0, 26377}, + {16778262, 17826836, 16778262, 0, 0, 26377}, + {16778265, 17826839, 16778265, 0, 0, 26377}, + {16778268, 17826842, 16778268, 0, 0, 26377}, + {16778271, 17826845, 
16778271, 0, 0, 26377}, + {16778274, 17826848, 16778274, 0, 0, 26377}, + {16778277, 17826851, 16778277, 0, 0, 26377}, + {16778280, 17826854, 16778280, 0, 0, 26377}, + {16778283, 17826857, 16778283, 0, 0, 26377}, + {16778286, 17826860, 16778286, 0, 0, 26377}, + {16778289, 17826863, 16778289, 0, 0, 26377}, + {16778292, 17826866, 16778292, 0, 0, 26377}, + {16778295, 17826869, 16778295, 0, 0, 26377}, + {16778298, 17826872, 16778298, 0, 0, 26377}, + {16778301, 17826875, 16778301, 0, 0, 26377}, + {16778304, 17826878, 16778304, 0, 0, 26377}, + {16778307, 17826881, 16778307, 0, 0, 26377}, + {16778310, 17826884, 16778310, 0, 0, 26377}, + {16778313, 17826887, 16778313, 0, 0, 26377}, + {16778316, 17826890, 16778316, 0, 0, 26377}, + {16778319, 17826893, 16778319, 0, 0, 26377}, + {16778322, 17826896, 16778322, 0, 0, 26377}, + {16778325, 17826899, 16778325, 0, 0, 26377}, + {16778328, 17826902, 16778328, 0, 0, 26377}, + {16778331, 17826905, 16778331, 0, 0, 26377}, + {16778334, 17826908, 16778334, 0, 0, 26377}, + {16778337, 17826911, 16778337, 0, 0, 26377}, + {16778340, 17826914, 16778340, 0, 0, 26377}, + {16778343, 17826917, 16778343, 0, 0, 26377}, + {16778346, 17826920, 16778346, 0, 0, 26377}, + {16778349, 17826923, 16778349, 0, 0, 26377}, + {16778352, 17826926, 16778352, 0, 0, 26377}, + {16778355, 17826929, 16778355, 0, 0, 26377}, + {16778358, 17826932, 16778358, 0, 0, 26377}, + {16778361, 17826935, 16778361, 0, 0, 26377}, + {33555581, 18875514, 33555583, 0, 0, 26377}, + {33555588, 18875521, 33555590, 0, 0, 26377}, + {33555595, 18875528, 33555597, 0, 0, 26377}, + {50332819, 19924111, 50332822, 0, 0, 26377}, + {50332829, 19924121, 50332832, 0, 0, 26377}, + {33555622, 18875555, 33555624, 0, 0, 26377}, + {33555629, 18875562, 33555631, 0, 0, 26377}, + {33555636, 18875569, 33555638, 0, 0, 26377}, + {33555643, 18875576, 33555645, 0, 0, 26377}, + {33555650, 18875583, 33555652, 0, 0, 26377}, + {33555657, 18875590, 33555659, 0, 0, 26377}, + {33555664, 18875597, 33555666, 0, 0, 26377}, + 
{0, 0, 0, 0, 0, 1025}, + {0, 0, 0, 0, 0, 5633}, + {0, 40, 0, 0, 0, 10113}, + {-40, 0, -40, 0, 0, 9993}, + {0, 34, 0, 0, 0, 10113}, + {-34, 0, -34, 0, 0, 9993}, + {0, 0, 0, 0, 0, 9344}, +}; + +/* extended case mappings */ + +const Py_UCS4 numba_PyUnicode_ExtendedCase[] = { + 181, + 956, + 924, + 223, + 115, + 115, + 83, + 83, + 83, + 115, + 105, + 775, + 304, + 329, + 700, + 110, + 700, + 78, + 383, + 115, + 83, + 496, + 106, + 780, + 74, + 780, + 837, + 953, + 921, + 912, + 953, + 776, + 769, + 921, + 776, + 769, + 944, + 965, + 776, + 769, + 933, + 776, + 769, + 962, + 963, + 931, + 976, + 946, + 914, + 977, + 952, + 920, + 981, + 966, + 934, + 982, + 960, + 928, + 1008, + 954, + 922, + 1009, + 961, + 929, + 1013, + 949, + 917, + 1415, + 1381, + 1410, + 1333, + 1362, + 1333, + 1410, + 43888, + 5024, + 5024, + 43889, + 5025, + 5025, + 43890, + 5026, + 5026, + 43891, + 5027, + 5027, + 43892, + 5028, + 5028, + 43893, + 5029, + 5029, + 43894, + 5030, + 5030, + 43895, + 5031, + 5031, + 43896, + 5032, + 5032, + 43897, + 5033, + 5033, + 43898, + 5034, + 5034, + 43899, + 5035, + 5035, + 43900, + 5036, + 5036, + 43901, + 5037, + 5037, + 43902, + 5038, + 5038, + 43903, + 5039, + 5039, + 43904, + 5040, + 5040, + 43905, + 5041, + 5041, + 43906, + 5042, + 5042, + 43907, + 5043, + 5043, + 43908, + 5044, + 5044, + 43909, + 5045, + 5045, + 43910, + 5046, + 5046, + 43911, + 5047, + 5047, + 43912, + 5048, + 5048, + 43913, + 5049, + 5049, + 43914, + 5050, + 5050, + 43915, + 5051, + 5051, + 43916, + 5052, + 5052, + 43917, + 5053, + 5053, + 43918, + 5054, + 5054, + 43919, + 5055, + 5055, + 43920, + 5056, + 5056, + 43921, + 5057, + 5057, + 43922, + 5058, + 5058, + 43923, + 5059, + 5059, + 43924, + 5060, + 5060, + 43925, + 5061, + 5061, + 43926, + 5062, + 5062, + 43927, + 5063, + 5063, + 43928, + 5064, + 5064, + 43929, + 5065, + 5065, + 43930, + 5066, + 5066, + 43931, + 5067, + 5067, + 43932, + 5068, + 5068, + 43933, + 5069, + 5069, + 43934, + 5070, + 5070, + 43935, + 5071, + 5071, + 
43936, + 5072, + 5072, + 43937, + 5073, + 5073, + 43938, + 5074, + 5074, + 43939, + 5075, + 5075, + 43940, + 5076, + 5076, + 43941, + 5077, + 5077, + 43942, + 5078, + 5078, + 43943, + 5079, + 5079, + 43944, + 5080, + 5080, + 43945, + 5081, + 5081, + 43946, + 5082, + 5082, + 43947, + 5083, + 5083, + 43948, + 5084, + 5084, + 43949, + 5085, + 5085, + 43950, + 5086, + 5086, + 43951, + 5087, + 5087, + 43952, + 5088, + 5088, + 43953, + 5089, + 5089, + 43954, + 5090, + 5090, + 43955, + 5091, + 5091, + 43956, + 5092, + 5092, + 43957, + 5093, + 5093, + 43958, + 5094, + 5094, + 43959, + 5095, + 5095, + 43960, + 5096, + 5096, + 43961, + 5097, + 5097, + 43962, + 5098, + 5098, + 43963, + 5099, + 5099, + 43964, + 5100, + 5100, + 43965, + 5101, + 5101, + 43966, + 5102, + 5102, + 43967, + 5103, + 5103, + 5112, + 5104, + 5104, + 5113, + 5105, + 5105, + 5114, + 5106, + 5106, + 5115, + 5107, + 5107, + 5116, + 5108, + 5108, + 5117, + 5109, + 5109, + 5112, + 5104, + 5104, + 5113, + 5105, + 5105, + 5114, + 5106, + 5106, + 5115, + 5107, + 5107, + 5116, + 5108, + 5108, + 5117, + 5109, + 5109, + 7296, + 1074, + 1042, + 7297, + 1076, + 1044, + 7298, + 1086, + 1054, + 7299, + 1089, + 1057, + 7300, + 1090, + 1058, + 7301, + 1090, + 1058, + 7302, + 1098, + 1066, + 7303, + 1123, + 1122, + 7304, + 42571, + 42570, + 7830, + 104, + 817, + 72, + 817, + 7831, + 116, + 776, + 84, + 776, + 7832, + 119, + 778, + 87, + 778, + 7833, + 121, + 778, + 89, + 778, + 7834, + 97, + 702, + 65, + 702, + 7835, + 7777, + 7776, + 223, + 115, + 115, + 7838, + 8016, + 965, + 787, + 933, + 787, + 8018, + 965, + 787, + 768, + 933, + 787, + 768, + 8020, + 965, + 787, + 769, + 933, + 787, + 769, + 8022, + 965, + 787, + 834, + 933, + 787, + 834, + 8064, + 7936, + 953, + 7944, + 921, + 8072, + 8065, + 7937, + 953, + 7945, + 921, + 8073, + 8066, + 7938, + 953, + 7946, + 921, + 8074, + 8067, + 7939, + 953, + 7947, + 921, + 8075, + 8068, + 7940, + 953, + 7948, + 921, + 8076, + 8069, + 7941, + 953, + 7949, + 921, + 8077, + 
8070, + 7942, + 953, + 7950, + 921, + 8078, + 8071, + 7943, + 953, + 7951, + 921, + 8079, + 8064, + 7936, + 953, + 7944, + 921, + 8072, + 8065, + 7937, + 953, + 7945, + 921, + 8073, + 8066, + 7938, + 953, + 7946, + 921, + 8074, + 8067, + 7939, + 953, + 7947, + 921, + 8075, + 8068, + 7940, + 953, + 7948, + 921, + 8076, + 8069, + 7941, + 953, + 7949, + 921, + 8077, + 8070, + 7942, + 953, + 7950, + 921, + 8078, + 8071, + 7943, + 953, + 7951, + 921, + 8079, + 8080, + 7968, + 953, + 7976, + 921, + 8088, + 8081, + 7969, + 953, + 7977, + 921, + 8089, + 8082, + 7970, + 953, + 7978, + 921, + 8090, + 8083, + 7971, + 953, + 7979, + 921, + 8091, + 8084, + 7972, + 953, + 7980, + 921, + 8092, + 8085, + 7973, + 953, + 7981, + 921, + 8093, + 8086, + 7974, + 953, + 7982, + 921, + 8094, + 8087, + 7975, + 953, + 7983, + 921, + 8095, + 8080, + 7968, + 953, + 7976, + 921, + 8088, + 8081, + 7969, + 953, + 7977, + 921, + 8089, + 8082, + 7970, + 953, + 7978, + 921, + 8090, + 8083, + 7971, + 953, + 7979, + 921, + 8091, + 8084, + 7972, + 953, + 7980, + 921, + 8092, + 8085, + 7973, + 953, + 7981, + 921, + 8093, + 8086, + 7974, + 953, + 7982, + 921, + 8094, + 8087, + 7975, + 953, + 7983, + 921, + 8095, + 8096, + 8032, + 953, + 8040, + 921, + 8104, + 8097, + 8033, + 953, + 8041, + 921, + 8105, + 8098, + 8034, + 953, + 8042, + 921, + 8106, + 8099, + 8035, + 953, + 8043, + 921, + 8107, + 8100, + 8036, + 953, + 8044, + 921, + 8108, + 8101, + 8037, + 953, + 8045, + 921, + 8109, + 8102, + 8038, + 953, + 8046, + 921, + 8110, + 8103, + 8039, + 953, + 8047, + 921, + 8111, + 8096, + 8032, + 953, + 8040, + 921, + 8104, + 8097, + 8033, + 953, + 8041, + 921, + 8105, + 8098, + 8034, + 953, + 8042, + 921, + 8106, + 8099, + 8035, + 953, + 8043, + 921, + 8107, + 8100, + 8036, + 953, + 8044, + 921, + 8108, + 8101, + 8037, + 953, + 8045, + 921, + 8109, + 8102, + 8038, + 953, + 8046, + 921, + 8110, + 8103, + 8039, + 953, + 8047, + 921, + 8111, + 8114, + 8048, + 953, + 8122, + 921, + 8122, + 837, + 8115, + 945, + 
953, + 913, + 921, + 8124, + 8116, + 940, + 953, + 902, + 921, + 902, + 837, + 8118, + 945, + 834, + 913, + 834, + 8119, + 945, + 834, + 953, + 913, + 834, + 921, + 913, + 834, + 837, + 8115, + 945, + 953, + 913, + 921, + 8124, + 8126, + 953, + 921, + 8130, + 8052, + 953, + 8138, + 921, + 8138, + 837, + 8131, + 951, + 953, + 919, + 921, + 8140, + 8132, + 942, + 953, + 905, + 921, + 905, + 837, + 8134, + 951, + 834, + 919, + 834, + 8135, + 951, + 834, + 953, + 919, + 834, + 921, + 919, + 834, + 837, + 8131, + 951, + 953, + 919, + 921, + 8140, + 8146, + 953, + 776, + 768, + 921, + 776, + 768, + 8147, + 953, + 776, + 769, + 921, + 776, + 769, + 8150, + 953, + 834, + 921, + 834, + 8151, + 953, + 776, + 834, + 921, + 776, + 834, + 8162, + 965, + 776, + 768, + 933, + 776, + 768, + 8163, + 965, + 776, + 769, + 933, + 776, + 769, + 8164, + 961, + 787, + 929, + 787, + 8166, + 965, + 834, + 933, + 834, + 8167, + 965, + 776, + 834, + 933, + 776, + 834, + 8178, + 8060, + 953, + 8186, + 921, + 8186, + 837, + 8179, + 969, + 953, + 937, + 921, + 8188, + 8180, + 974, + 953, + 911, + 921, + 911, + 837, + 8182, + 969, + 834, + 937, + 834, + 8183, + 969, + 834, + 953, + 937, + 834, + 921, + 937, + 834, + 837, + 8179, + 969, + 953, + 937, + 921, + 8188, + 43888, + 5024, + 5024, + 43889, + 5025, + 5025, + 43890, + 5026, + 5026, + 43891, + 5027, + 5027, + 43892, + 5028, + 5028, + 43893, + 5029, + 5029, + 43894, + 5030, + 5030, + 43895, + 5031, + 5031, + 43896, + 5032, + 5032, + 43897, + 5033, + 5033, + 43898, + 5034, + 5034, + 43899, + 5035, + 5035, + 43900, + 5036, + 5036, + 43901, + 5037, + 5037, + 43902, + 5038, + 5038, + 43903, + 5039, + 5039, + 43904, + 5040, + 5040, + 43905, + 5041, + 5041, + 43906, + 5042, + 5042, + 43907, + 5043, + 5043, + 43908, + 5044, + 5044, + 43909, + 5045, + 5045, + 43910, + 5046, + 5046, + 43911, + 5047, + 5047, + 43912, + 5048, + 5048, + 43913, + 5049, + 5049, + 43914, + 5050, + 5050, + 43915, + 5051, + 5051, + 43916, + 5052, + 5052, + 43917, + 5053, + 
5053, + 43918, + 5054, + 5054, + 43919, + 5055, + 5055, + 43920, + 5056, + 5056, + 43921, + 5057, + 5057, + 43922, + 5058, + 5058, + 43923, + 5059, + 5059, + 43924, + 5060, + 5060, + 43925, + 5061, + 5061, + 43926, + 5062, + 5062, + 43927, + 5063, + 5063, + 43928, + 5064, + 5064, + 43929, + 5065, + 5065, + 43930, + 5066, + 5066, + 43931, + 5067, + 5067, + 43932, + 5068, + 5068, + 43933, + 5069, + 5069, + 43934, + 5070, + 5070, + 43935, + 5071, + 5071, + 43936, + 5072, + 5072, + 43937, + 5073, + 5073, + 43938, + 5074, + 5074, + 43939, + 5075, + 5075, + 43940, + 5076, + 5076, + 43941, + 5077, + 5077, + 43942, + 5078, + 5078, + 43943, + 5079, + 5079, + 43944, + 5080, + 5080, + 43945, + 5081, + 5081, + 43946, + 5082, + 5082, + 43947, + 5083, + 5083, + 43948, + 5084, + 5084, + 43949, + 5085, + 5085, + 43950, + 5086, + 5086, + 43951, + 5087, + 5087, + 43952, + 5088, + 5088, + 43953, + 5089, + 5089, + 43954, + 5090, + 5090, + 43955, + 5091, + 5091, + 43956, + 5092, + 5092, + 43957, + 5093, + 5093, + 43958, + 5094, + 5094, + 43959, + 5095, + 5095, + 43960, + 5096, + 5096, + 43961, + 5097, + 5097, + 43962, + 5098, + 5098, + 43963, + 5099, + 5099, + 43964, + 5100, + 5100, + 43965, + 5101, + 5101, + 43966, + 5102, + 5102, + 43967, + 5103, + 5103, + 64256, + 102, + 102, + 70, + 70, + 70, + 102, + 64257, + 102, + 105, + 70, + 73, + 70, + 105, + 64258, + 102, + 108, + 70, + 76, + 70, + 108, + 64259, + 102, + 102, + 105, + 70, + 70, + 73, + 70, + 102, + 105, + 64260, + 102, + 102, + 108, + 70, + 70, + 76, + 70, + 102, + 108, + 64261, + 115, + 116, + 83, + 84, + 83, + 116, + 64262, + 115, + 116, + 83, + 84, + 83, + 116, + 64275, + 1396, + 1398, + 1348, + 1350, + 1348, + 1398, + 64276, + 1396, + 1381, + 1348, + 1333, + 1348, + 1381, + 64277, + 1396, + 1387, + 1348, + 1339, + 1348, + 1387, + 64278, + 1406, + 1398, + 1358, + 1350, + 1358, + 1398, + 64279, + 1396, + 1389, + 1348, + 1341, + 1348, + 1389, +}; + +/* type indexes */ +#define SHIFT 7 +static unsigned short index1[] = { + 
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 34, 35, 36, 37, + 38, 39, 34, 34, 34, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, + 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 64, 64, 64, 65, 66, 64, + 64, 64, 64, 67, 68, 64, 64, 64, 64, 64, 64, 69, 70, 71, 72, 73, 74, 75, + 76, 64, 77, 78, 79, 80, 81, 82, 83, 64, 64, 84, 85, 34, 34, 34, 34, 34, + 34, 86, 34, 34, 34, 34, 34, 87, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 88, 89, 90, 91, 34, 34, 34, 92, 34, 34, + 34, 93, 94, 34, 34, 34, 34, 34, 95, 34, 34, 34, 96, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 97, 98, 99, 34, 34, 34, 34, 34, 34, 100, 101, 34, 34, + 34, 34, 34, 34, 34, 34, 102, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 103, 34, 34, 34, 34, 34, 34, 34, 34, 104, 34, 34, 34, 34, + 100, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 103, 34, 34, 34, 34, 34, 34, 105, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 106, 107, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 108, 109, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 110, 111, 34, 34, 34, 34, 34, + 34, 34, 34, 112, 34, 34, 113, 114, 115, 116, 117, 118, 119, 120, 121, + 122, 123, 124, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 125, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 
126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 127, 128, 129, + 130, 131, 132, 133, 34, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 144, 34, 34, 151, 144, 152, 153, 154, + 155, 156, 157, 158, 159, 160, 161, 162, 144, 163, 144, 164, 144, 165, + 166, 167, 168, 169, 170, 171, 144, 172, 173, 144, 174, 175, 176, 177, + 144, 178, 179, 144, 144, 180, 181, 144, 144, 182, 183, 184, 185, 144, + 186, 144, 144, 34, 34, 34, 34, 34, 34, 34, 187, 188, 34, 189, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 34, 34, 34, 34, 34, 34, 34, 34, 190, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 34, 34, 34, 34, 191, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 34, 34, 34, 34, 192, 193, 194, 195, 144, 144, 144, 144, 196, + 197, 198, 199, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 200, 34, 34, + 34, 34, 34, 201, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 34, 34, 202, 34, 34, 203, 144, 144, + 144, 144, 144, 144, 144, 
144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 204, 205, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 64, + 206, 207, 208, 209, 210, 211, 144, 212, 213, 214, 215, 216, 217, 218, + 219, 64, 64, 64, 64, 220, 221, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 222, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 34, 223, 224, 144, 144, 144, 144, 144, 225, 226, 144, + 144, 227, 228, 144, 144, 229, 230, 231, 232, 233, 144, 64, 234, 64, 64, + 64, 64, 64, 235, 236, 237, 238, 239, 240, 241, 242, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 243, 244, 245, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 86, 246, 34, 247, 248, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 249, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 250, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 251, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 252, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 253, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 254, 34, + 255, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 256, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 257, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 34, 249, 34, 34, 258, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 
144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 
144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 
144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 
+ 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 
144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 
144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 
144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 
144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 
144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 
144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 
144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 
144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 
144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 
144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 259, 144, + 260, 261, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 
144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 
144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, + 144, 144, 144, 144, 144, 144, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 
126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 262, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 
126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, 126, + 126, 126, 126, 126, 126, 126, 126, 262, +}; + +static unsigned short index2[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 3, 3, 3, 2, 4, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 
5, 5, 5, 6, 5, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 6, 5, 5, 5, 5, 5, 5, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 5, 5, 5, 6, 18, 6, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 5, 5, + 5, 5, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 5, 5, 5, 5, 5, 5, 5, 6, 5, 20, 5, 5, + 21, 5, 6, 5, 5, 22, 23, 6, 24, 5, 25, 6, 26, 20, 5, 27, 27, 27, 5, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 5, 17, 17, 17, 17, 17, 17, 17, 28, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 5, 19, 19, 19, 19, 19, 19, 19, 29, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 32, 33, 30, 31, 30, 31, 30, 31, 20, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 34, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 35, 30, 31, 30, 31, 30, 31, 36, 37, 38, 30, 31, 30, 31, 39, + 30, 31, 40, 40, 30, 31, 20, 41, 42, 43, 30, 31, 40, 44, 45, 46, 47, 30, + 31, 48, 20, 46, 49, 50, 51, 30, 31, 30, 31, 30, 31, 52, 30, 31, 52, 20, + 20, 30, 31, 52, 30, 31, 53, 53, 30, 31, 30, 31, 54, 30, 31, 20, 55, 30, + 31, 20, 56, 55, 55, 55, 55, 57, 58, 59, 57, 58, 59, 57, 58, 59, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 60, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 61, 57, 58, + 59, 30, 31, 62, 63, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 64, 20, 30, 31, 30, 
31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 20, 20, 20, 20, 20, 20, 65, + 30, 31, 66, 67, 68, 68, 30, 31, 69, 70, 71, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 72, 73, 74, 75, 76, 20, 77, 77, 20, 78, 20, 79, 80, 20, 20, + 20, 77, 81, 20, 82, 20, 83, 84, 20, 85, 86, 84, 87, 88, 20, 20, 86, 20, + 89, 90, 20, 20, 91, 20, 20, 20, 20, 20, 20, 20, 92, 20, 20, 93, 20, 20, + 93, 20, 20, 20, 94, 93, 95, 96, 96, 97, 20, 20, 20, 20, 20, 98, 20, 55, + 20, 20, 20, 20, 20, 20, 20, 20, 99, 100, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 102, 102, 102, 102, 102, 102, 102, 101, 101, 6, 6, 6, 6, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 101, 101, 101, 101, 101, 6, 6, 6, 6, 6, 6, 6, + 102, 6, 102, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 103, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 30, 31, 30, 31, 102, 6, 30, 31, 0, 0, 104, 50, 50, 50, 5, 105, 0, + 0, 0, 0, 6, 6, 106, 25, 107, 107, 107, 0, 108, 0, 109, 109, 110, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 0, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 111, 112, 112, 112, 113, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 114, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 115, 116, 116, 117, 118, 119, 120, 120, 120, 121, 122, + 123, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 124, 125, 126, 127, 128, 129, 5, 30, 31, 130, + 30, 31, 20, 64, 64, 64, 131, 131, 131, 131, 131, 
131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 132, + 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, + 132, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 5, + 25, 25, 25, 25, 25, 6, 6, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 133, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 134, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 0, 135, 135, 135, 135, 135, 135, + 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, + 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, + 135, 135, 135, 135, 0, 0, 102, 5, 5, 5, 5, 5, 5, 20, 136, 136, 136, 136, + 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, + 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, + 136, 136, 136, 136, 136, 136, 137, 20, 5, 5, 0, 0, 5, 5, 5, 0, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 5, 25, 5, 25, 25, 5, 25, 25, 5, 25, 0, 0, 0, + 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 55, 55, + 55, 55, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 21, 21, 21, 21, 21, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 5, + 21, 0, 5, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 102, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 5, 5, 5, 5, 55, 55, 25, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 5, 55, 25, 25, 25, 25, 25, 25, 25, 21, 5, 25, 25, 25, 25, 25, 25, + 102, 102, 25, 25, 5, 25, 25, 25, 25, 55, 55, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 55, 55, 55, 5, 5, 55, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 0, 21, 55, 25, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 102, 102, 5, 5, 5, 5, 102, 0, 0, 25, + 5, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 25, 25, 25, 25, 102, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 102, 25, 25, 25, 102, 25, 25, 25, 25, 25, 0, 0, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, 25, 25, 0, 0, + 5, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 0, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 21, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 18, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 25, 18, 25, 55, 18, 18, 18, 25, 25, 25, 25, 25, 25, 25, 25, 18, + 18, 18, 18, 25, 18, 18, 55, 25, 25, 25, 25, 25, 25, 25, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 25, 25, 5, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 5, 102, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 25, 18, 18, 0, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 55, 55, 0, 0, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 0, 55, 0, 0, 0, 55, 55, 55, + 55, 0, 0, 25, 55, 18, 18, 18, 25, 25, 25, 25, 0, 0, 18, 18, 0, 0, 18, 18, + 25, 55, 0, 0, 0, 0, 0, 0, 0, 0, 
18, 0, 0, 0, 0, 55, 55, 0, 55, 55, 55, + 25, 25, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 55, 55, 5, 5, 27, 27, + 27, 27, 27, 27, 5, 5, 55, 5, 25, 0, 0, 25, 25, 18, 0, 55, 55, 55, 55, 55, + 55, 0, 0, 0, 0, 55, 55, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, + 55, 0, 55, 55, 0, 55, 55, 0, 55, 55, 0, 0, 25, 0, 18, 18, 18, 25, 25, 0, + 0, 0, 0, 25, 25, 0, 0, 25, 25, 25, 0, 0, 0, 25, 0, 0, 0, 0, 0, 0, 0, 55, + 55, 55, 55, 0, 55, 0, 0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 25, 25, 55, 55, 55, 25, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 18, + 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 0, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 0, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 0, 55, 55, 55, 55, 55, 0, 0, + 25, 55, 18, 18, 18, 25, 25, 25, 25, 25, 0, 25, 25, 18, 0, 18, 18, 25, 0, + 0, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 25, 25, 0, 0, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 5, 5, 0, 0, 0, 0, 0, 0, 0, 55, 25, + 25, 25, 25, 25, 25, 0, 25, 18, 18, 0, 55, 55, 55, 55, 55, 55, 55, 55, 0, + 0, 55, 55, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, + 0, 55, 55, 55, 55, 55, 0, 0, 25, 55, 18, 25, 18, 25, 25, 25, 25, 0, 0, + 18, 18, 0, 0, 18, 18, 25, 0, 0, 0, 0, 0, 0, 0, 0, 25, 18, 0, 0, 0, 0, 55, + 55, 0, 55, 55, 55, 25, 25, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 5, + 55, 27, 27, 27, 27, 27, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 55, 0, 55, + 55, 55, 55, 55, 55, 0, 0, 0, 55, 55, 55, 0, 55, 55, 55, 55, 0, 0, 0, 55, + 55, 0, 55, 0, 55, 55, 0, 0, 0, 55, 55, 0, 0, 0, 55, 55, 55, 0, 0, 0, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 18, 18, 25, 18, + 18, 0, 0, 0, 18, 18, 18, 0, 18, 18, 18, 25, 0, 0, 55, 0, 0, 0, 0, 0, 0, + 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 27, 
27, 27, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 25, 18, + 18, 18, 25, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 0, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 0, 0, 0, 55, 25, 25, 25, 18, 18, 18, 18, 0, 25, 25, 25, 0, 25, 25, + 25, 25, 0, 0, 0, 0, 0, 0, 0, 25, 25, 0, 55, 55, 55, 0, 0, 0, 0, 0, 55, + 55, 25, 25, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, + 0, 0, 27, 27, 27, 27, 27, 27, 27, 5, 55, 25, 18, 18, 5, 55, 55, 55, 55, + 55, 55, 55, 55, 0, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 0, 0, 25, 55, 18, 25, 18, + 18, 18, 18, 18, 0, 25, 18, 18, 0, 18, 18, 25, 25, 0, 0, 0, 0, 0, 0, 0, + 18, 18, 0, 0, 0, 0, 0, 0, 0, 55, 0, 55, 55, 25, 25, 0, 0, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 0, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 25, 25, 18, 18, 0, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 0, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 25, 25, 55, 18, 18, 18, 25, 25, 25, 25, 0, 18, 18, 18, 0, + 18, 18, 18, 25, 55, 5, 0, 0, 0, 0, 55, 55, 55, 18, 27, 27, 27, 27, 27, + 27, 27, 55, 55, 55, 25, 25, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 5, 55, 55, 55, 55, 55, 55, 0, 0, 18, + 18, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 0, 55, 0, 0, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 25, 0, 0, 0, 0, + 18, 18, 18, 25, 25, 25, 0, 25, 0, 18, 18, 18, 18, 18, 18, 18, 18, 0, 0, + 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 18, 18, 5, 0, 0, + 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 25, 55, 138, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 5, 55, 55, 55, + 55, 55, 55, 102, 25, 25, 25, 25, 25, 25, 25, 25, 5, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 0, + 55, 0, 0, 55, 55, 0, 55, 0, 0, 55, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 0, + 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 0, 55, 0, 55, 0, 0, 55, 55, 0, + 55, 55, 55, 55, 25, 55, 138, 25, 25, 25, 25, 25, 25, 0, 25, 25, 55, 0, 0, + 55, 55, 55, 55, 55, 0, 102, 0, 25, 25, 25, 25, 25, 25, 0, 0, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 0, 0, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 25, + 25, 5, 5, 5, 5, 5, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 5, 25, 5, 25, 5, 25, 5, 5, 5, 5, 18, 18, 55, + 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 18, 25, 25, 25, 25, 25, 5, 25, 25, 55, 55, + 55, 55, 55, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 5, 5, 5, + 5, 5, 5, 5, 5, 25, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 
55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 18, 18, 25, 25, 25, 25, 18, 25, 25, 25, 25, 25, 25, 18, 25, 25, 18, 18, + 25, 25, 55, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 5, 5, 5, 5, 5, 5, 55, + 55, 55, 55, 55, 55, 18, 18, 25, 25, 55, 55, 55, 55, 25, 25, 25, 55, 18, + 18, 18, 55, 55, 18, 18, 18, 18, 18, 18, 18, 55, 55, 55, 25, 25, 25, 25, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, 18, 18, 25, 25, + 18, 18, 18, 18, 18, 18, 25, 55, 18, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 18, 18, 18, 25, 5, 5, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, + 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, + 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 0, + 139, 0, 0, 0, 0, 0, 139, 0, 0, 140, 140, 140, 140, 140, 140, 140, 140, + 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, + 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, + 140, 140, 140, 140, 140, 140, 140, 5, 102, 140, 140, 140, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, + 55, 55, 55, 55, 0, 0, 55, 55, 55, 55, 55, 55, 55, 0, 55, 0, 55, 55, 55, + 55, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 
55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 0, 0, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 0, 0, 55, 55, + 55, 55, 55, 55, 55, 0, 55, 0, 55, 55, 55, 55, 0, 0, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, + 55, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 25, 25, + 25, 5, 5, 5, 5, 5, 5, 5, 5, 5, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 0, 0, 0, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 0, 0, 0, 0, 0, 0, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, + 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, + 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, + 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, + 230, 231, 232, 233, 234, 235, 0, 0, 236, 237, 238, 239, 240, 241, 0, 0, + 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 
55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 5, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 2, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 5, 5, 0, 0, 0, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 5, 5, 5, 242, 242, 242, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, + 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, + 55, 55, 25, 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, 25, 25, 5, 5, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 0, 25, 25, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 25, 25, 18, 25, 25, 25, 25, 25, 25, 25, 18, + 18, 
18, 18, 18, 18, 18, 18, 25, 18, 18, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 5, 5, 5, 102, 5, 5, 5, 5, 55, 25, 0, 0, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 25, 25, 25, 21, 0, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 102, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 55, + 55, 55, 55, 55, 243, 243, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 25, 55, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 25, 25, 25, 18, 18, 18, 18, + 25, 25, 18, 18, 18, 0, 0, 0, 0, 18, 18, 25, 18, 18, 18, 18, 18, 18, 25, + 25, 25, 0, 0, 0, 0, 5, 0, 0, 0, 5, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 55, 55, 55, 55, + 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, + 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 141, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 25, 25, 18, 18, 25, 0, 0, 5, 5, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 18, 25, 18, 25, 25, 25, 25, 25, + 25, 25, 0, 25, 18, 25, 18, 18, 25, 25, 25, 25, 25, 25, 25, 25, 18, 18, + 18, 18, 18, 18, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 25, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 102, 5, 5, 5, 5, 5, 5, + 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 6, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 18, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 25, 18, 25, 25, 25, 25, 25, 18, 25, 18, + 18, 18, 18, 18, 25, 18, 18, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 25, 25, 25, 25, 25, 25, 25, 25, 25, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 0, 0, 0, 25, 25, 18, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 18, + 25, 25, 25, 25, 18, 18, 25, 25, 18, 25, 25, 25, 55, 55, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, 18, 25, 25, 18, + 18, 18, 25, 18, 25, 25, 25, 18, 18, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 18, 18, 18, 18, 18, 18, 18, 18, 25, 25, 25, 25, 25, 25, 25, 25, 18, 18, + 25, 25, 0, 0, 0, 5, 5, 5, 5, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, + 0, 0, 55, 55, 55, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 102, 102, 102, 102, 102, 102, 5, 5, 244, + 245, 246, 247, 248, 249, 250, 251, 252, 0, 0, 0, 0, 0, 0, 0, 253, 253, + 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, + 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, + 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 0, 0, + 253, 253, 253, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, + 25, 5, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 18, 25, 25, + 25, 25, 25, 25, 25, 55, 55, 55, 55, 25, 55, 55, 55, 55, 18, 18, 25, 55, + 55, 18, 25, 25, 0, 0, 0, 0, 0, 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 101, 254, 20, 20, 20, 255, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 
101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 25, 25, + 25, 25, 25, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 256, 257, 258, 259, 260, 261, 20, 20, + 262, 20, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 263, 263, 263, 263, 263, 263, 263, 263, + 264, 264, 264, 264, 264, 264, 264, 264, 263, 263, 263, 263, 263, 263, 0, + 0, 264, 264, 264, 264, 264, 264, 0, 0, 263, 263, 263, 263, 263, 263, 263, + 263, 264, 264, 264, 264, 264, 264, 264, 264, 263, 263, 263, 263, 263, + 263, 263, 263, 264, 264, 264, 264, 264, 264, 264, 264, 263, 263, 263, + 263, 263, 263, 0, 0, 264, 264, 264, 264, 264, 264, 0, 0, 265, 263, 266, + 263, 267, 263, 268, 263, 0, 264, 0, 264, 0, 264, 0, 264, 263, 263, 263, + 263, 263, 263, 263, 263, 264, 264, 264, 264, 264, 264, 264, 
264, 269, + 269, 270, 270, 270, 270, 271, 271, 272, 272, 273, 273, 274, 274, 0, 0, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, + 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, + 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, + 317, 318, 319, 320, 321, 322, 263, 263, 323, 324, 325, 0, 326, 327, 264, + 264, 328, 328, 329, 6, 330, 6, 6, 6, 331, 332, 333, 0, 334, 335, 336, + 336, 336, 336, 337, 6, 6, 6, 263, 263, 338, 339, 0, 0, 340, 341, 264, + 264, 342, 342, 0, 6, 6, 6, 263, 263, 343, 344, 345, 126, 346, 347, 264, + 264, 348, 348, 130, 6, 6, 6, 0, 0, 349, 350, 351, 0, 352, 353, 354, 354, + 355, 355, 356, 6, 6, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 21, 21, 21, 21, + 21, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, + 6, 3, 3, 21, 21, 21, 21, 21, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 18, 18, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 18, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 21, 21, 21, 21, 21, 0, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 357, 101, 0, 0, 358, 359, 360, 361, 362, 363, 5, + 5, 5, 5, 5, 101, 357, 26, 22, 23, 358, 359, 360, 361, 362, 363, 5, 5, 5, + 5, 5, 0, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 6, 6, 6, 6, + 25, 6, 6, 6, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 120, 5, 5, 5, 5, 120, 5, 5, 20, + 120, 120, 120, 20, 20, 120, 120, 120, 20, 5, 120, 5, 5, 364, 120, 120, + 120, 120, 120, 5, 5, 5, 5, 5, 5, 120, 5, 365, 5, 120, 5, 366, 367, 120, + 120, 364, 20, 120, 120, 368, 120, 20, 55, 55, 55, 55, 20, 5, 5, 20, 20, + 120, 120, 5, 5, 5, 5, 5, 120, 20, 20, 20, 20, 5, 5, 5, 5, 369, 5, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 370, 
370, 370, + 370, 370, 370, 370, 370, 370, 370, 370, 370, 370, 370, 370, 370, 371, + 371, 371, 371, 371, 371, 371, 371, 371, 371, 371, 371, 371, 371, 371, + 371, 242, 242, 242, 30, 31, 242, 242, 242, 242, 27, 5, 5, 0, 0, 0, 0, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, + 22, 23, 358, 359, 360, 361, 362, 363, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 26, 22, 23, 358, 359, 360, 361, 362, 363, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 26, 22, 23, 358, 359, 360, 361, 362, 363, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 373, 373, 373, 373, 373, 373, 373, 373, 373, + 373, 373, 373, 373, 373, 373, 373, 373, 373, 373, 373, 373, 373, 373, + 373, 373, 373, 357, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 26, 22, 23, + 358, 359, 360, 361, 362, 363, 27, 357, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 26, 22, 23, 358, 359, 360, 361, 362, + 363, 27, 26, 22, 23, 358, 359, 360, 361, 362, 363, 27, 26, 22, 23, 358, + 359, 360, 361, 362, 363, 27, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 135, 135, 135, 135, 135, 135, 135, + 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, + 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, + 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, 0, 136, 136, + 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, + 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 
136, + 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, + 136, 136, 136, 0, 30, 31, 374, 375, 376, 377, 378, 30, 31, 30, 31, 30, + 31, 379, 380, 381, 382, 20, 30, 31, 20, 30, 31, 20, 20, 20, 20, 20, 101, + 101, 383, 383, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 20, 5, 5, 5, 5, + 5, 5, 30, 31, 30, 31, 25, 25, 25, 30, 31, 0, 0, 0, 0, 0, 5, 5, 5, 5, 27, + 5, 5, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 0, 384, 0, 0, 0, + 0, 0, 384, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 102, 5, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 25, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, + 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, + 55, 0, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 0, 55, + 55, 55, 55, 55, 55, 55, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 385, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 0, 0, 0, 0, 2, 5, 5, 5, 5, 102, 55, 242, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 25, 25, 25, 25, 18, 18, 5, 102, 102, 102, 102, 102, 5, 5, + 242, 242, 242, 102, 55, 5, 5, 5, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 0, 0, 25, 25, 6, 6, 102, 102, 55, 5, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 5, 
102, 102, 102, + 55, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 5, + 5, 27, 27, 27, 27, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 27, 27, 27, 27, 27, 27, 27, 27, 5, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 55, 55, 55, 55, 55, 386, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 386, 55, 55, 386, 55, 55, 55, 386, 55, 386, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 386, 55, 55, 55, 55, 55, 55, 55, 386, 55, 386, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 386, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, + 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, 386, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 386, 55, 386, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 386, 55, 386, 386, 386, 55, 55, 55, 55, 55, + 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 386, 386, 386, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 386, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 386, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 386, 386, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 386, 386, 386, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, + 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 386, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 102, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 102, 102, 102, 102, 102, 102, 5, + 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 102, 5, 5, 5, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 55, 25, 6, 6, 6, + 5, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 5, 102, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, + 31, 30, 31, 30, 31, 101, 101, 25, 25, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 242, 242, 242, 242, 242, 242, 242, 242, 242, + 242, 25, 25, 
5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 6, 6, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 20, 20, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 101, 20, 20, 20, + 20, 20, 20, 20, 20, 30, 31, 30, 31, 387, 30, 31, 30, 31, 30, 31, 30, 31, + 30, 31, 102, 6, 6, 30, 31, 388, 20, 55, 30, 31, 30, 31, 20, 20, 30, 31, + 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, 30, 31, + 389, 390, 391, 392, 389, 20, 393, 394, 395, 396, 30, 31, 30, 31, 30, 31, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 101, 101, 20, 55, 55, 55, 55, + 55, 55, 55, 25, 55, 55, 55, 25, 55, 55, 55, 55, 25, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 18, 18, 25, 25, 18, 5, 5, 5, 5, 0, 0, 0, 0, 27, 27, 27, 27, 27, 27, 5, 5, + 5, 5, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 18, 18, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 55, 55, 55, 55, + 55, 55, 
5, 5, 5, 55, 5, 55, 55, 25, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, 25, 25, 25, 25, 25, 25, 25, + 5, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 18, + 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 0, 0, 0, 25, 25, 25, 18, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 25, 18, 18, 25, 25, 25, 25, 18, 18, 25, 18, 18, 18, 18, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 102, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 0, 0, 0, 0, 5, 5, 55, 55, 55, 55, 55, 25, 102, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 55, 55, 55, 55, 55, + 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 25, 25, 25, 25, 25, 25, 18, 18, 25, 25, 18, 18, + 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 25, 55, 55, 55, 55, 55, + 55, 55, 55, 25, 18, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 5, + 5, 5, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 102, 55, 55, 55, 55, 55, 55, 5, 5, 5, 55, 18, 25, 18, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, 55, 25, 25, 25, 55, 55, 25, 25, + 55, 55, 55, 55, 55, 25, 25, 55, 25, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 102, 5, 5, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 18, 25, 25, 18, 18, 5, 5, 55, 102, 102, 18, + 
25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 0, 0, 55, 55, + 55, 55, 55, 55, 0, 0, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 0, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 397, 20, 20, 20, + 20, 20, 20, 20, 6, 101, 101, 101, 101, 20, 20, 20, 20, 20, 20, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, + 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, + 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, + 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 18, 18, + 25, 18, 18, 25, 18, 18, 5, 18, 25, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, + 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 386, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 
0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 478, 479, 480, 481, 482, 483, 484, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 485, 486, 487, 488, 489, 0, 0, 0, 0, 0, 55, 25, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 0, 55, 0, 55, 55, 0, 55, 55, 0, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 490, 490, 490, 490, 490, 490, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 490, 490, 5, 5, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 5, 5, 5, 18, 18, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 18, 18, 18, 5, 5, 6, 0, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 0, 5, 5, 5, 5, 0, 0, 0, 0, 490, 55, 490, 55, 490, 0, 490, 55, + 490, 55, 490, 55, 490, 55, 490, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 21, 0, 5, 5, 5, 5, 5, 5, 6, 5, 5, + 5, 5, 5, 5, 6, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 6, 5, 5, 5, 5, 5, + 5, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 5, 5, 5, 6, 18, 6, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 55, 55, 
55, 55, 55, 55, + 55, 55, 55, 55, 102, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 491, 491, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 55, 55, 55, 55, 55, + 55, 0, 0, 55, 55, 55, 55, 55, 55, 0, 0, 55, 55, 55, 55, 55, 55, 0, 0, 55, + 55, 55, 0, 0, 0, 5, 5, 5, 6, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 21, 21, 21, 5, 5, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 0, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, + 0, 0, 5, 5, 5, 0, 0, 0, 0, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 0, 0, 0, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 242, 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, 242, 
242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 27, 27, 27, 27, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 27, 27, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 5, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 25, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 25, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 0, 0, 0, + 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 27, 27, 27, + 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 242, 55, 55, 55, 55, 55, 55, 55, + 55, 242, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 5, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 55, 55, 55, + 55, 55, 55, 55, 55, 5, 242, 242, 242, 242, 242, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 492, 492, 492, 492, 492, 492, 492, 492, + 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, + 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, + 492, 492, 492, 492, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, + 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, + 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, + 493, 493, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 0, 0, 0, 0, 0, 0, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, + 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, + 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 492, 0, 0, 0, 0, + 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, + 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, 493, + 493, 493, 493, 493, 493, 493, 493, 493, 0, 0, 0, 0, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, + 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, + 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, + 55, 0, 0, 55, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 0, 0, 0, 55, + 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 0, 5, 27, 27, 27, 27, 27, 27, 27, 27, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 5, 5, 27, 27, 27, 27, 27, 27, 27, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 0, 55, 55, 0, 0, 0, 0, 0, 27, 27, 27, 27, 27, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 27, + 27, 27, 27, 27, 27, 0, 0, 0, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, + 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, + 0, 27, 27, 55, 55, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 0, 0, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 55, 25, 25, 25, + 0, 25, 25, 0, 0, 0, 0, 0, 25, 25, 25, 25, 55, 55, 55, 55, 0, 55, 55, 55, + 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 25, 25, 25, 0, 0, + 0, 0, 25, 26, 22, 23, 358, 27, 27, 27, 27, 27, 0, 0, 0, 0, 0, 0, 0, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 27, 27, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 27, + 27, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 5, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 25, 25, 0, 0, 0, 0, 27, 27, 27, 27, 27, + 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 5, 5, 5, 5, 5, + 5, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 0, 0, 27, 27, 27, 27, 27, 27, 27, 27, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, + 0, 27, 27, 27, 27, 27, 27, 
27, 27, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 27, 27, 27, 27, 27, 27, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 115, 115, 115, 115, 115, + 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, + 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, + 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, + 115, 115, 115, 115, 0, 0, 0, 0, 0, 0, 0, 27, 27, 27, 27, 27, 27, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, 25, + 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, 22, + 23, 358, 359, 360, 361, 362, 363, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 0, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 55, 0, 0, + 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 27, 27, 27, 27, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 18, 25, 18, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 26, 22, 23, 358, 359, 360, 361, 362, + 363, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 18, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 18, 18, 18, 25, 25, 25, 25, 18, 18, + 25, 25, 5, 5, 21, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 0, 0, 0, 0, 0, 0, 25, 25, 25, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 25, 25, 25, 25, 25, 18, 25, 25, + 25, 25, 25, 25, 25, 25, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 5, 5, 5, + 5, 55, 18, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, 5, 5, 55, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 25, 25, 18, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 18, 18, + 18, 25, 25, 25, 25, 25, 25, 25, 25, 25, 18, 18, 55, 55, 55, 55, 5, 5, 5, + 5, 25, 25, 25, 25, 5, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 55, 5, + 55, 5, 5, 5, 0, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 18, 18, 18, 25, 25, 25, 18, 18, 25, 18, 25, 25, 5, 5, 5, + 5, 5, 5, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, + 55, 55, 55, 55, 55, 0, 55, 0, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 5, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 25, 18, 18, 18, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 25, 25, 18, 18, 0, 55, 55, + 55, 55, 55, 55, 55, 55, 0, 0, 55, 55, 0, 0, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, + 55, 55, 55, 55, 
55, 0, 55, 55, 0, 55, 55, 55, 55, 55, 0, 25, 25, 55, 18, + 18, 25, 18, 18, 18, 18, 0, 0, 18, 18, 0, 0, 18, 18, 18, 0, 0, 55, 0, 0, + 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 18, 18, 0, 0, 25, 25, + 25, 25, 25, 25, 25, 0, 0, 0, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 18, 18, 18, 25, 25, 25, 25, 25, 25, 25, 25, 18, 18, 25, 25, 25, 18, + 25, 55, 55, 55, 55, 5, 5, 5, 5, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 5, 0, 5, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 18, 18, 18, 25, 25, 25, 25, 25, 25, 18, 25, 18, 18, 18, + 18, 25, 25, 18, 25, 25, 55, 55, 5, 55, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 18, 18, 18, 25, 25, 25, 25, 0, 0, + 18, 18, 18, 18, 25, 25, 18, 25, 25, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 55, 55, 55, 55, 25, 25, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 18, 18, + 18, 25, 25, 25, 25, 25, 25, 25, 25, 18, 18, 25, 18, 25, 25, 5, 5, 5, 55, + 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, + 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, + 18, 25, 18, 18, 25, 25, 25, 25, 25, 25, 18, 25, 0, 0, 0, 0, 0, 0, 0, 0, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 0, 0, 25, 25, 25, 18, 18, 25, 25, 25, 25, 18, 25, 25, 25, + 25, 25, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 27, 27, 5, 5, 5, + 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 18, 18, 18, 25, 25, 25, 25, 25, 25, 25, 25, 25, 18, 25, 25, + 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 27, 27, 27, 27, 27, 27, 27, 27, 27, 0, 0, 0, 0, 0, 0, + 
0, 0, 0, 0, 0, 0, 55, 55, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 25, 25, 25, 25, 25, 25, 18, 55, 25, 25, 25, 25, 5, 5, 5, 5, 5, 5, + 5, 5, 25, 0, 0, 0, 0, 0, 0, 0, 0, 55, 25, 25, 25, 25, 25, 25, 18, 18, 25, + 25, 25, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 0, 0, 55, 55, 55, 55, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 18, 25, 25, 5, 5, 5, 55, 5, 5, 5, 5, 5, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 18, 25, 25, 25, 25, 25, 25, 25, 0, 25, 25, 25, 25, 25, 25, 18, + 25, 55, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 0, 0, 0, 5, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 0, 18, 25, 25, 25, 25, 25, 25, 25, 18, + 25, 25, 18, 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 
0, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 25, 25, 25, 25, 25, 25, 0, 0, 0, 25, 0, 25, 25, 0, 25, 25, 25, 25, 25, + 25, 25, 55, 25, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 0, 55, 55, 0, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 18, 18, 18, 18, 18, 0, 25, + 25, 0, 18, 18, 25, 18, 25, 55, 0, 0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, 25, 18, + 18, 5, 5, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 
242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 0, + 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 
55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, + 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 25, 25, + 25, 25, 25, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 25, 25, 25, 25, 25, 25, 25, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 102, 102, 102, 102, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 0, 27, 27, 27, 27, 27, 27, 27, 0, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, + 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 
19, 19, 19, 19, + 19, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 55, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 102, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 
5, 25, 25, 5, 21, 21, 21, + 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 18, 18, 25, 25, 25, 5, 5, 5, 18, 18, 18, + 18, 18, 18, 21, 21, 21, 21, 21, 21, 21, 21, 25, 25, 25, 25, 25, 25, 25, + 25, 5, 5, 25, 25, 25, 25, 25, 25, 25, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 25, 25, 25, 25, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 25, 25, 25, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 0, 0, 0, 0, 0, 0, 0, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 20, 20, 20, 20, 20, 20, 20, 0, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 120, 0, 120, + 120, 0, 0, 120, 0, 0, 120, 120, 0, 0, 120, 120, 120, 120, 0, 120, 120, + 120, 120, 120, 120, 120, 120, 20, 20, 20, 20, 0, 20, 0, 20, 20, 20, 20, + 20, 20, 20, 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 120, 120, 0, 120, 120, 120, 120, 0, 0, 120, 
120, 120, 120, 120, 120, + 120, 120, 0, 120, 120, 120, 120, 120, 120, 120, 0, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 120, 120, 0, 120, 120, 120, 120, 0, 120, 120, 120, 120, 120, + 0, 120, 0, 0, 0, 120, 120, 120, 120, 120, 120, 120, 0, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 5, 
20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 5, 20, 20, 20, 20, 20, 20, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 5, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 5, 20, 20, 20, 20, + 20, 20, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 5, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 5, 20, 20, 20, 20, 20, 20, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 5, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 5, 20, 20, + 20, 20, 20, 20, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 5, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 5, 20, 20, 20, 20, 20, 20, 120, 20, 0, 0, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 16, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 5, 5, 5, 5, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 5, 5, 5, 5, 5, 5, 5, 5, 25, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 25, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 0, 25, 
25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, + 25, 25, 25, 25, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 0, 0, 25, 25, 25, 25, 25, 25, 25, 0, 25, 25, 0, 25, 25, + 25, 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 494, 494, 494, 494, 494, 494, 494, 494, 494, 494, 494, + 494, 494, 494, 494, 494, 494, 494, 494, 494, 494, 494, 494, 494, 494, + 494, 494, 494, 494, 494, 494, 494, 494, 494, 495, 495, 495, 495, 495, + 495, 495, 495, 495, 495, 495, 495, 495, 495, 495, 495, 495, 495, 495, + 495, 495, 495, 495, 495, 495, 495, 495, 495, 495, 495, 495, 495, 495, + 495, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 5, 27, 27, 27, 5, 27, + 27, 27, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 0, 55, 55, 0, 55, 0, 0, 55, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 0, 55, 55, 55, 55, 0, 55, 0, 55, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, + 55, 0, 55, 0, 55, 0, 55, 55, 55, 0, 55, 55, 0, 55, 0, 0, 55, 0, 55, 0, + 55, 0, 55, 0, 55, 0, 55, 55, 0, 55, 0, 0, 55, 55, 55, 55, 0, 55, 55, 55, + 55, 55, 55, 55, 0, 55, 55, 55, 55, 0, 55, 55, 55, 55, 0, 55, 0, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 55, 55, 55, 0, 55, 55, 55, + 55, 55, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, + 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 357, 357, 26, 22, 23, 358, 359, + 360, 361, 362, 363, 27, 27, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 496, 496, + 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, + 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 5, 5, 5, 5, 5, 5, 496, + 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, + 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 5, 5, 0, 0, 0, 0, + 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, + 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 496, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, + 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, + 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, + 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 5, + 5, 0, 0, 0, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, + 0, 0, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, + 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, + 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 386, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, +}; + +/* Returns the numeric value as double for Unicode characters + * having this property, -1.0 otherwise. 
+ */ +double numba_PyUnicode_ToNumeric(Py_UCS4 ch) +{ + switch (ch) { + case 0x0F33: + return (double) -1.0/2.0; + case 0x0030: + case 0x0660: + case 0x06F0: + case 0x07C0: + case 0x0966: + case 0x09E6: + case 0x0A66: + case 0x0AE6: + case 0x0B66: + case 0x0BE6: + case 0x0C66: + case 0x0C78: + case 0x0CE6: + case 0x0D66: + case 0x0DE6: + case 0x0E50: + case 0x0ED0: + case 0x0F20: + case 0x1040: + case 0x1090: + case 0x17E0: + case 0x17F0: + case 0x1810: + case 0x1946: + case 0x19D0: + case 0x1A80: + case 0x1A90: + case 0x1B50: + case 0x1BB0: + case 0x1C40: + case 0x1C50: + case 0x2070: + case 0x2080: + case 0x2189: + case 0x24EA: + case 0x24FF: + case 0x3007: + case 0x96F6: + case 0xA620: + case 0xA6EF: + case 0xA8D0: + case 0xA900: + case 0xA9D0: + case 0xA9F0: + case 0xAA50: + case 0xABF0: + case 0xF9B2: + case 0xFF10: + case 0x1018A: + case 0x104A0: + case 0x10D30: + case 0x11066: + case 0x110F0: + case 0x11136: + case 0x111D0: + case 0x112F0: + case 0x11450: + case 0x114D0: + case 0x11650: + case 0x116C0: + case 0x11730: + case 0x118E0: + case 0x11C50: + case 0x11D50: + case 0x11DA0: + case 0x16A60: + case 0x16B50: + case 0x16E80: + case 0x1D2E0: + case 0x1D7CE: + case 0x1D7D8: + case 0x1D7E2: + case 0x1D7EC: + case 0x1D7F6: + case 0x1E950: + case 0x1F100: + case 0x1F101: + case 0x1F10B: + case 0x1F10C: + return (double) 0.0; + case 0x0031: + case 0x00B9: + case 0x0661: + case 0x06F1: + case 0x07C1: + case 0x0967: + case 0x09E7: + case 0x0A67: + case 0x0AE7: + case 0x0B67: + case 0x0BE7: + case 0x0C67: + case 0x0C79: + case 0x0C7C: + case 0x0CE7: + case 0x0D67: + case 0x0DE7: + case 0x0E51: + case 0x0ED1: + case 0x0F21: + case 0x1041: + case 0x1091: + case 0x1369: + case 0x17E1: + case 0x17F1: + case 0x1811: + case 0x1947: + case 0x19D1: + case 0x19DA: + case 0x1A81: + case 0x1A91: + case 0x1B51: + case 0x1BB1: + case 0x1C41: + case 0x1C51: + case 0x2081: + case 0x215F: + case 0x2160: + case 0x2170: + case 0x2460: + case 0x2474: + case 0x2488: + case 0x24F5: + 
case 0x2776: + case 0x2780: + case 0x278A: + case 0x3021: + case 0x3192: + case 0x3220: + case 0x3280: + case 0x4E00: + case 0x58F1: + case 0x58F9: + case 0x5E7A: + case 0x5F0C: + case 0xA621: + case 0xA6E6: + case 0xA8D1: + case 0xA901: + case 0xA9D1: + case 0xA9F1: + case 0xAA51: + case 0xABF1: + case 0xFF11: + case 0x10107: + case 0x10142: + case 0x10158: + case 0x10159: + case 0x1015A: + case 0x102E1: + case 0x10320: + case 0x103D1: + case 0x104A1: + case 0x10858: + case 0x10879: + case 0x108A7: + case 0x108FB: + case 0x10916: + case 0x109C0: + case 0x10A40: + case 0x10A7D: + case 0x10A9D: + case 0x10AEB: + case 0x10B58: + case 0x10B78: + case 0x10BA9: + case 0x10CFA: + case 0x10D31: + case 0x10E60: + case 0x10F1D: + case 0x10F51: + case 0x11052: + case 0x11067: + case 0x110F1: + case 0x11137: + case 0x111D1: + case 0x111E1: + case 0x112F1: + case 0x11451: + case 0x114D1: + case 0x11651: + case 0x116C1: + case 0x11731: + case 0x118E1: + case 0x11C51: + case 0x11C5A: + case 0x11D51: + case 0x11DA1: + case 0x12415: + case 0x1241E: + case 0x1242C: + case 0x12434: + case 0x1244F: + case 0x12458: + case 0x16A61: + case 0x16B51: + case 0x16E81: + case 0x16E94: + case 0x1D2E1: + case 0x1D360: + case 0x1D372: + case 0x1D377: + case 0x1D7CF: + case 0x1D7D9: + case 0x1D7E3: + case 0x1D7ED: + case 0x1D7F7: + case 0x1E8C7: + case 0x1E951: + case 0x1EC71: + case 0x1ECA3: + case 0x1ECB1: + case 0x1F102: + case 0x2092A: + return (double) 1.0; + case 0x0D5C: + case 0x2152: + return (double) 1.0/10.0; + case 0x109F6: + return (double) 1.0/12.0; + case 0x09F4: + case 0x0B75: + case 0x0D76: + case 0xA833: + return (double) 1.0/16.0; + case 0x0D58: + return (double) 1.0/160.0; + case 0x00BD: + case 0x0B73: + case 0x0D74: + case 0x0F2A: + case 0x2CFD: + case 0xA831: + case 0x10141: + case 0x10175: + case 0x10176: + case 0x109BD: + case 0x10A48: + case 0x10E7B: + case 0x10F26: + case 0x12464: + case 0x1ECAE: + return (double) 1.0/2.0; + case 0x0D5B: + return (double) 1.0/20.0; + 
case 0x2153: + case 0x10E7D: + case 0x1245A: + case 0x1245D: + case 0x12465: + return (double) 1.0/3.0; + case 0x00BC: + case 0x09F7: + case 0x0B72: + case 0x0D73: + case 0xA830: + case 0x10140: + case 0x1018B: + case 0x10E7C: + case 0x12460: + case 0x12462: + case 0x12463: + case 0x1ECAD: + return (double) 1.0/4.0; + case 0x0D59: + return (double) 1.0/40.0; + case 0x0D5E: + case 0x2155: + return (double) 1.0/5.0; + case 0x2159: + case 0x12461: + return (double) 1.0/6.0; + case 0x2150: + return (double) 1.0/7.0; + case 0x09F5: + case 0x0B76: + case 0x0D77: + case 0x215B: + case 0xA834: + case 0x1245F: + return (double) 1.0/8.0; + case 0x2151: + return (double) 1.0/9.0; + case 0x0BF0: + case 0x0D70: + case 0x1372: + case 0x2169: + case 0x2179: + case 0x2469: + case 0x247D: + case 0x2491: + case 0x24FE: + case 0x277F: + case 0x2789: + case 0x2793: + case 0x3038: + case 0x3229: + case 0x3248: + case 0x3289: + case 0x4EC0: + case 0x5341: + case 0x62FE: + case 0xF973: + case 0xF9FD: + case 0x10110: + case 0x10149: + case 0x10150: + case 0x10157: + case 0x10160: + case 0x10161: + case 0x10162: + case 0x10163: + case 0x10164: + case 0x102EA: + case 0x10322: + case 0x103D3: + case 0x1085B: + case 0x1087E: + case 0x108AD: + case 0x108FD: + case 0x10917: + case 0x109C9: + case 0x10A44: + case 0x10A9E: + case 0x10AED: + case 0x10B5C: + case 0x10B7C: + case 0x10BAD: + case 0x10CFC: + case 0x10E69: + case 0x10F22: + case 0x10F52: + case 0x1105B: + case 0x111EA: + case 0x1173A: + case 0x118EA: + case 0x11C63: + case 0x16B5B: + case 0x16E8A: + case 0x1D2EA: + case 0x1D369: + case 0x1EC7A: + return (double) 10.0; + case 0x109FF: + return (double) 10.0/12.0; + case 0x0BF1: + case 0x0D71: + case 0x137B: + case 0x216D: + case 0x217D: + case 0x4F70: + case 0x767E: + case 0x964C: + case 0x10119: + case 0x1014B: + case 0x10152: + case 0x1016A: + case 0x102F3: + case 0x103D5: + case 0x1085D: + case 0x108AF: + case 0x108FF: + case 0x10919: + case 0x109D2: + case 0x10A46: + case 0x10AEF: + 
case 0x10B5E: + case 0x10B7E: + case 0x10BAF: + case 0x10CFE: + case 0x10E72: + case 0x10F25: + case 0x10F54: + case 0x11064: + case 0x111F3: + case 0x11C6C: + case 0x16B5C: + case 0x1EC83: + return (double) 100.0; + case 0x0BF2: + case 0x0D72: + case 0x216F: + case 0x217F: + case 0x2180: + case 0x4EDF: + case 0x5343: + case 0x9621: + case 0x10122: + case 0x1014D: + case 0x10154: + case 0x10171: + case 0x1085E: + case 0x109DB: + case 0x10A47: + case 0x10B5F: + case 0x10B7F: + case 0x10CFF: + case 0x11065: + case 0x111F4: + case 0x1EC8C: + return (double) 1000.0; + case 0x137C: + case 0x2182: + case 0x4E07: + case 0x842C: + case 0x1012B: + case 0x10155: + case 0x1085F: + case 0x109E4: + case 0x16B5D: + case 0x1EC95: + case 0x1ECB3: + return (double) 10000.0; + case 0x2188: + case 0x109ED: + case 0x1EC9E: + case 0x1ECA0: + case 0x1ECB4: + return (double) 100000.0; + case 0x16B5E: + return (double) 1000000.0; + case 0x1ECA1: + return (double) 10000000.0; + case 0x4EBF: + case 0x5104: + case 0x16B5F: + return (double) 100000000.0; + case 0x16B60: + return (double) 10000000000.0; + case 0x5146: + case 0x16B61: + return (double) 1000000000000.0; + case 0x216A: + case 0x217A: + case 0x246A: + case 0x247E: + case 0x2492: + case 0x24EB: + case 0x16E8B: + case 0x1D2EB: + return (double) 11.0; + case 0x109BC: + return (double) 11.0/12.0; + case 0x0F2F: + return (double) 11.0/2.0; + case 0x216B: + case 0x217B: + case 0x246B: + case 0x247F: + case 0x2493: + case 0x24EC: + case 0x16E8C: + case 0x1D2EC: + return (double) 12.0; + case 0x246C: + case 0x2480: + case 0x2494: + case 0x24ED: + case 0x16E8D: + case 0x1D2ED: + return (double) 13.0; + case 0x0F30: + return (double) 13.0/2.0; + case 0x246D: + case 0x2481: + case 0x2495: + case 0x24EE: + case 0x16E8E: + case 0x1D2EE: + return (double) 14.0; + case 0x246E: + case 0x2482: + case 0x2496: + case 0x24EF: + case 0x16E8F: + case 0x1D2EF: + return (double) 15.0; + case 0x0F31: + return (double) 15.0/2.0; + case 0x09F9: + case 
0x246F: + case 0x2483: + case 0x2497: + case 0x24F0: + case 0x16E90: + case 0x1D2F0: + return (double) 16.0; + case 0x16EE: + case 0x2470: + case 0x2484: + case 0x2498: + case 0x24F1: + case 0x16E91: + case 0x1D2F1: + return (double) 17.0; + case 0x0F32: + return (double) 17.0/2.0; + case 0x16EF: + case 0x2471: + case 0x2485: + case 0x2499: + case 0x24F2: + case 0x16E92: + case 0x1D2F2: + return (double) 18.0; + case 0x16F0: + case 0x2472: + case 0x2486: + case 0x249A: + case 0x24F3: + case 0x16E93: + case 0x1D2F3: + return (double) 19.0; + case 0x0032: + case 0x00B2: + case 0x0662: + case 0x06F2: + case 0x07C2: + case 0x0968: + case 0x09E8: + case 0x0A68: + case 0x0AE8: + case 0x0B68: + case 0x0BE8: + case 0x0C68: + case 0x0C7A: + case 0x0C7D: + case 0x0CE8: + case 0x0D68: + case 0x0DE8: + case 0x0E52: + case 0x0ED2: + case 0x0F22: + case 0x1042: + case 0x1092: + case 0x136A: + case 0x17E2: + case 0x17F2: + case 0x1812: + case 0x1948: + case 0x19D2: + case 0x1A82: + case 0x1A92: + case 0x1B52: + case 0x1BB2: + case 0x1C42: + case 0x1C52: + case 0x2082: + case 0x2161: + case 0x2171: + case 0x2461: + case 0x2475: + case 0x2489: + case 0x24F6: + case 0x2777: + case 0x2781: + case 0x278B: + case 0x3022: + case 0x3193: + case 0x3221: + case 0x3281: + case 0x3483: + case 0x4E8C: + case 0x5169: + case 0x5F0D: + case 0x5F10: + case 0x8CAE: + case 0x8CB3: + case 0x8D30: + case 0xA622: + case 0xA6E7: + case 0xA8D2: + case 0xA902: + case 0xA9D2: + case 0xA9F2: + case 0xAA52: + case 0xABF2: + case 0xF978: + case 0xFF12: + case 0x10108: + case 0x1015B: + case 0x1015C: + case 0x1015D: + case 0x1015E: + case 0x102E2: + case 0x103D2: + case 0x104A2: + case 0x10859: + case 0x1087A: + case 0x108A8: + case 0x1091A: + case 0x109C1: + case 0x10A41: + case 0x10B59: + case 0x10B79: + case 0x10BAA: + case 0x10D32: + case 0x10E61: + case 0x10F1E: + case 0x11053: + case 0x11068: + case 0x110F2: + case 0x11138: + case 0x111D2: + case 0x111E2: + case 0x112F2: + case 0x11452: + case 0x114D2: 
+ case 0x11652: + case 0x116C2: + case 0x11732: + case 0x118E2: + case 0x11C52: + case 0x11C5B: + case 0x11D52: + case 0x11DA2: + case 0x12400: + case 0x12416: + case 0x1241F: + case 0x12423: + case 0x1242D: + case 0x12435: + case 0x1244A: + case 0x12450: + case 0x12456: + case 0x12459: + case 0x16A62: + case 0x16B52: + case 0x16E82: + case 0x16E95: + case 0x1D2E2: + case 0x1D361: + case 0x1D373: + case 0x1D7D0: + case 0x1D7DA: + case 0x1D7E4: + case 0x1D7EE: + case 0x1D7F8: + case 0x1E8C8: + case 0x1E952: + case 0x1EC72: + case 0x1ECA4: + case 0x1ECB2: + case 0x1F103: + case 0x22390: + return (double) 2.0; + case 0x109F7: + return (double) 2.0/12.0; + case 0x2154: + case 0x10177: + case 0x10E7E: + case 0x1245B: + case 0x1245E: + case 0x12466: + return (double) 2.0/3.0; + case 0x2156: + return (double) 2.0/5.0; + case 0x1373: + case 0x2473: + case 0x2487: + case 0x249B: + case 0x24F4: + case 0x3039: + case 0x3249: + case 0x5344: + case 0x5EFF: + case 0x10111: + case 0x102EB: + case 0x103D4: + case 0x1085C: + case 0x1087F: + case 0x108AE: + case 0x108FE: + case 0x10918: + case 0x109CA: + case 0x10A45: + case 0x10A9F: + case 0x10AEE: + case 0x10B5D: + case 0x10B7D: + case 0x10BAE: + case 0x10E6A: + case 0x10F23: + case 0x10F53: + case 0x1105C: + case 0x111EB: + case 0x1173B: + case 0x118EB: + case 0x11C64: + case 0x1D36A: + case 0x1EC7B: + return (double) 20.0; + case 0x1011A: + case 0x102F4: + case 0x109D3: + case 0x10E73: + case 0x1EC84: + return (double) 200.0; + case 0x10123: + case 0x109DC: + case 0x1EC8D: + return (double) 2000.0; + case 0x1012C: + case 0x109E5: + case 0x1EC96: + return (double) 20000.0; + case 0x109EE: + case 0x1EC9F: + return (double) 200000.0; + case 0x1ECA2: + return (double) 20000000.0; + case 0x3251: + return (double) 21.0; + case 0x12432: + return (double) 216000.0; + case 0x3252: + return (double) 22.0; + case 0x3253: + return (double) 23.0; + case 0x3254: + return (double) 24.0; + case 0x3255: + return (double) 25.0; + case 0x3256: + 
return (double) 26.0; + case 0x3257: + return (double) 27.0; + case 0x3258: + return (double) 28.0; + case 0x3259: + return (double) 29.0; + case 0x0033: + case 0x00B3: + case 0x0663: + case 0x06F3: + case 0x07C3: + case 0x0969: + case 0x09E9: + case 0x0A69: + case 0x0AE9: + case 0x0B69: + case 0x0BE9: + case 0x0C69: + case 0x0C7B: + case 0x0C7E: + case 0x0CE9: + case 0x0D69: + case 0x0DE9: + case 0x0E53: + case 0x0ED3: + case 0x0F23: + case 0x1043: + case 0x1093: + case 0x136B: + case 0x17E3: + case 0x17F3: + case 0x1813: + case 0x1949: + case 0x19D3: + case 0x1A83: + case 0x1A93: + case 0x1B53: + case 0x1BB3: + case 0x1C43: + case 0x1C53: + case 0x2083: + case 0x2162: + case 0x2172: + case 0x2462: + case 0x2476: + case 0x248A: + case 0x24F7: + case 0x2778: + case 0x2782: + case 0x278C: + case 0x3023: + case 0x3194: + case 0x3222: + case 0x3282: + case 0x4E09: + case 0x4EE8: + case 0x53C1: + case 0x53C2: + case 0x53C3: + case 0x53C4: + case 0x5F0E: + case 0xA623: + case 0xA6E8: + case 0xA8D3: + case 0xA903: + case 0xA9D3: + case 0xA9F3: + case 0xAA53: + case 0xABF3: + case 0xF96B: + case 0xFF13: + case 0x10109: + case 0x102E3: + case 0x104A3: + case 0x1085A: + case 0x1087B: + case 0x108A9: + case 0x1091B: + case 0x109C2: + case 0x10A42: + case 0x10B5A: + case 0x10B7A: + case 0x10BAB: + case 0x10D33: + case 0x10E62: + case 0x10F1F: + case 0x11054: + case 0x11069: + case 0x110F3: + case 0x11139: + case 0x111D3: + case 0x111E3: + case 0x112F3: + case 0x11453: + case 0x114D3: + case 0x11653: + case 0x116C3: + case 0x11733: + case 0x118E3: + case 0x11C53: + case 0x11C5C: + case 0x11D53: + case 0x11DA3: + case 0x12401: + case 0x12408: + case 0x12417: + case 0x12420: + case 0x12424: + case 0x12425: + case 0x1242E: + case 0x1242F: + case 0x12436: + case 0x12437: + case 0x1243A: + case 0x1243B: + case 0x1244B: + case 0x12451: + case 0x12457: + case 0x16A63: + case 0x16B53: + case 0x16E83: + case 0x16E96: + case 0x1D2E3: + case 0x1D362: + case 0x1D374: + case 0x1D7D1: + 
case 0x1D7DB: + case 0x1D7E5: + case 0x1D7EF: + case 0x1D7F9: + case 0x1E8C9: + case 0x1E953: + case 0x1EC73: + case 0x1ECA5: + case 0x1F104: + case 0x20AFD: + case 0x20B19: + case 0x22998: + case 0x23B1B: + return (double) 3.0; + case 0x109F8: + return (double) 3.0/12.0; + case 0x09F6: + case 0x0B77: + case 0x0D78: + case 0xA835: + return (double) 3.0/16.0; + case 0x0F2B: + return (double) 3.0/2.0; + case 0x0D5D: + return (double) 3.0/20.0; + case 0x00BE: + case 0x09F8: + case 0x0B74: + case 0x0D75: + case 0xA832: + case 0x10178: + case 0x1ECAF: + return (double) 3.0/4.0; + case 0x2157: + return (double) 3.0/5.0; + case 0x215C: + return (double) 3.0/8.0; + case 0x0D5A: + return (double) 3.0/80.0; + case 0x1374: + case 0x303A: + case 0x324A: + case 0x325A: + case 0x5345: + case 0x10112: + case 0x10165: + case 0x102EC: + case 0x109CB: + case 0x10E6B: + case 0x10F24: + case 0x1105D: + case 0x111EC: + case 0x118EC: + case 0x11C65: + case 0x1D36B: + case 0x1EC7C: + case 0x20983: + return (double) 30.0; + case 0x1011B: + case 0x1016B: + case 0x102F5: + case 0x109D4: + case 0x10E74: + case 0x1EC85: + return (double) 300.0; + case 0x10124: + case 0x109DD: + case 0x1EC8E: + return (double) 3000.0; + case 0x1012D: + case 0x109E6: + case 0x1EC97: + return (double) 30000.0; + case 0x109EF: + return (double) 300000.0; + case 0x325B: + return (double) 31.0; + case 0x325C: + return (double) 32.0; + case 0x325D: + return (double) 33.0; + case 0x325E: + return (double) 34.0; + case 0x325F: + return (double) 35.0; + case 0x32B1: + return (double) 36.0; + case 0x32B2: + return (double) 37.0; + case 0x32B3: + return (double) 38.0; + case 0x32B4: + return (double) 39.0; + case 0x0034: + case 0x0664: + case 0x06F4: + case 0x07C4: + case 0x096A: + case 0x09EA: + case 0x0A6A: + case 0x0AEA: + case 0x0B6A: + case 0x0BEA: + case 0x0C6A: + case 0x0CEA: + case 0x0D6A: + case 0x0DEA: + case 0x0E54: + case 0x0ED4: + case 0x0F24: + case 0x1044: + case 0x1094: + case 0x136C: + case 0x17E4: + 
case 0x17F4: + case 0x1814: + case 0x194A: + case 0x19D4: + case 0x1A84: + case 0x1A94: + case 0x1B54: + case 0x1BB4: + case 0x1C44: + case 0x1C54: + case 0x2074: + case 0x2084: + case 0x2163: + case 0x2173: + case 0x2463: + case 0x2477: + case 0x248B: + case 0x24F8: + case 0x2779: + case 0x2783: + case 0x278D: + case 0x3024: + case 0x3195: + case 0x3223: + case 0x3283: + case 0x4E96: + case 0x56DB: + case 0x8086: + case 0xA624: + case 0xA6E9: + case 0xA8D4: + case 0xA904: + case 0xA9D4: + case 0xA9F4: + case 0xAA54: + case 0xABF4: + case 0xFF14: + case 0x1010A: + case 0x102E4: + case 0x104A4: + case 0x1087C: + case 0x108AA: + case 0x108AB: + case 0x109C3: + case 0x10A43: + case 0x10B5B: + case 0x10B7B: + case 0x10BAC: + case 0x10D34: + case 0x10E63: + case 0x10F20: + case 0x11055: + case 0x1106A: + case 0x110F4: + case 0x1113A: + case 0x111D4: + case 0x111E4: + case 0x112F4: + case 0x11454: + case 0x114D4: + case 0x11654: + case 0x116C4: + case 0x11734: + case 0x118E4: + case 0x11C54: + case 0x11C5D: + case 0x11D54: + case 0x11DA4: + case 0x12402: + case 0x12409: + case 0x1240F: + case 0x12418: + case 0x12421: + case 0x12426: + case 0x12430: + case 0x12438: + case 0x1243C: + case 0x1243D: + case 0x1243E: + case 0x1243F: + case 0x1244C: + case 0x12452: + case 0x12453: + case 0x12469: + case 0x16A64: + case 0x16B54: + case 0x16E84: + case 0x1D2E4: + case 0x1D363: + case 0x1D375: + case 0x1D7D2: + case 0x1D7DC: + case 0x1D7E6: + case 0x1D7F0: + case 0x1D7FA: + case 0x1E8CA: + case 0x1E954: + case 0x1EC74: + case 0x1ECA6: + case 0x1F105: + case 0x20064: + case 0x200E2: + case 0x2626D: + return (double) 4.0; + case 0x109F9: + return (double) 4.0/12.0; + case 0x2158: + return (double) 4.0/5.0; + case 0x1375: + case 0x324B: + case 0x32B5: + case 0x534C: + case 0x10113: + case 0x102ED: + case 0x109CC: + case 0x10E6C: + case 0x1105E: + case 0x111ED: + case 0x118ED: + case 0x11C66: + case 0x12467: + case 0x1D36C: + case 0x1EC7D: + case 0x2098C: + case 0x2099C: + return 
(double) 40.0; + case 0x1011C: + case 0x102F6: + case 0x109D5: + case 0x10E75: + case 0x1EC86: + return (double) 400.0; + case 0x10125: + case 0x109DE: + case 0x1EC8F: + return (double) 4000.0; + case 0x1012E: + case 0x109E7: + case 0x1EC98: + return (double) 40000.0; + case 0x109F0: + return (double) 400000.0; + case 0x32B6: + return (double) 41.0; + case 0x32B7: + return (double) 42.0; + case 0x32B8: + return (double) 43.0; + case 0x12433: + return (double) 432000.0; + case 0x32B9: + return (double) 44.0; + case 0x32BA: + return (double) 45.0; + case 0x32BB: + return (double) 46.0; + case 0x32BC: + return (double) 47.0; + case 0x32BD: + return (double) 48.0; + case 0x32BE: + return (double) 49.0; + case 0x0035: + case 0x0665: + case 0x06F5: + case 0x07C5: + case 0x096B: + case 0x09EB: + case 0x0A6B: + case 0x0AEB: + case 0x0B6B: + case 0x0BEB: + case 0x0C6B: + case 0x0CEB: + case 0x0D6B: + case 0x0DEB: + case 0x0E55: + case 0x0ED5: + case 0x0F25: + case 0x1045: + case 0x1095: + case 0x136D: + case 0x17E5: + case 0x17F5: + case 0x1815: + case 0x194B: + case 0x19D5: + case 0x1A85: + case 0x1A95: + case 0x1B55: + case 0x1BB5: + case 0x1C45: + case 0x1C55: + case 0x2075: + case 0x2085: + case 0x2164: + case 0x2174: + case 0x2464: + case 0x2478: + case 0x248C: + case 0x24F9: + case 0x277A: + case 0x2784: + case 0x278E: + case 0x3025: + case 0x3224: + case 0x3284: + case 0x3405: + case 0x382A: + case 0x4E94: + case 0x4F0D: + case 0xA625: + case 0xA6EA: + case 0xA8D5: + case 0xA905: + case 0xA9D5: + case 0xA9F5: + case 0xAA55: + case 0xABF5: + case 0xFF15: + case 0x1010B: + case 0x10143: + case 0x10148: + case 0x1014F: + case 0x1015F: + case 0x10173: + case 0x102E5: + case 0x10321: + case 0x104A5: + case 0x1087D: + case 0x108AC: + case 0x108FC: + case 0x109C4: + case 0x10AEC: + case 0x10CFB: + case 0x10D35: + case 0x10E64: + case 0x10F21: + case 0x11056: + case 0x1106B: + case 0x110F5: + case 0x1113B: + case 0x111D5: + case 0x111E5: + case 0x112F5: + case 0x11455: + 
case 0x114D5: + case 0x11655: + case 0x116C5: + case 0x11735: + case 0x118E5: + case 0x11C55: + case 0x11C5E: + case 0x11D55: + case 0x11DA5: + case 0x12403: + case 0x1240A: + case 0x12410: + case 0x12419: + case 0x12422: + case 0x12427: + case 0x12431: + case 0x12439: + case 0x1244D: + case 0x12454: + case 0x12455: + case 0x1246A: + case 0x16A65: + case 0x16B55: + case 0x16E85: + case 0x1D2E5: + case 0x1D364: + case 0x1D376: + case 0x1D378: + case 0x1D7D3: + case 0x1D7DD: + case 0x1D7E7: + case 0x1D7F1: + case 0x1D7FB: + case 0x1E8CB: + case 0x1E955: + case 0x1EC75: + case 0x1ECA7: + case 0x1F106: + case 0x20121: + return (double) 5.0; + case 0x109FA: + return (double) 5.0/12.0; + case 0x0F2C: + return (double) 5.0/2.0; + case 0x215A: + case 0x1245C: + return (double) 5.0/6.0; + case 0x215D: + return (double) 5.0/8.0; + case 0x1376: + case 0x216C: + case 0x217C: + case 0x2186: + case 0x324C: + case 0x32BF: + case 0x10114: + case 0x10144: + case 0x1014A: + case 0x10151: + case 0x10166: + case 0x10167: + case 0x10168: + case 0x10169: + case 0x10174: + case 0x102EE: + case 0x10323: + case 0x109CD: + case 0x10A7E: + case 0x10CFD: + case 0x10E6D: + case 0x1105F: + case 0x111EE: + case 0x118EE: + case 0x11C67: + case 0x12468: + case 0x1D36D: + case 0x1EC7E: + return (double) 50.0; + case 0x216E: + case 0x217E: + case 0x1011D: + case 0x10145: + case 0x1014C: + case 0x10153: + case 0x1016C: + case 0x1016D: + case 0x1016E: + case 0x1016F: + case 0x10170: + case 0x102F7: + case 0x109D6: + case 0x10E76: + case 0x1EC87: + return (double) 500.0; + case 0x2181: + case 0x10126: + case 0x10146: + case 0x1014E: + case 0x10172: + case 0x109DF: + case 0x1EC90: + return (double) 5000.0; + case 0x2187: + case 0x1012F: + case 0x10147: + case 0x10156: + case 0x109E8: + case 0x1EC99: + return (double) 50000.0; + case 0x109F1: + return (double) 500000.0; + case 0x0036: + case 0x0666: + case 0x06F6: + case 0x07C6: + case 0x096C: + case 0x09EC: + case 0x0A6C: + case 0x0AEC: + case 0x0B6C: + 
case 0x0BEC: + case 0x0C6C: + case 0x0CEC: + case 0x0D6C: + case 0x0DEC: + case 0x0E56: + case 0x0ED6: + case 0x0F26: + case 0x1046: + case 0x1096: + case 0x136E: + case 0x17E6: + case 0x17F6: + case 0x1816: + case 0x194C: + case 0x19D6: + case 0x1A86: + case 0x1A96: + case 0x1B56: + case 0x1BB6: + case 0x1C46: + case 0x1C56: + case 0x2076: + case 0x2086: + case 0x2165: + case 0x2175: + case 0x2185: + case 0x2465: + case 0x2479: + case 0x248D: + case 0x24FA: + case 0x277B: + case 0x2785: + case 0x278F: + case 0x3026: + case 0x3225: + case 0x3285: + case 0x516D: + case 0x9646: + case 0x9678: + case 0xA626: + case 0xA6EB: + case 0xA8D6: + case 0xA906: + case 0xA9D6: + case 0xA9F6: + case 0xAA56: + case 0xABF6: + case 0xF9D1: + case 0xF9D3: + case 0xFF16: + case 0x1010C: + case 0x102E6: + case 0x104A6: + case 0x109C5: + case 0x10D36: + case 0x10E65: + case 0x11057: + case 0x1106C: + case 0x110F6: + case 0x1113C: + case 0x111D6: + case 0x111E6: + case 0x112F6: + case 0x11456: + case 0x114D6: + case 0x11656: + case 0x116C6: + case 0x11736: + case 0x118E6: + case 0x11C56: + case 0x11C5F: + case 0x11D56: + case 0x11DA6: + case 0x12404: + case 0x1240B: + case 0x12411: + case 0x1241A: + case 0x12428: + case 0x12440: + case 0x1244E: + case 0x1246B: + case 0x16A66: + case 0x16B56: + case 0x16E86: + case 0x1D2E6: + case 0x1D365: + case 0x1D7D4: + case 0x1D7DE: + case 0x1D7E8: + case 0x1D7F2: + case 0x1D7FC: + case 0x1E8CC: + case 0x1E956: + case 0x1EC76: + case 0x1ECA8: + case 0x1F107: + case 0x20AEA: + return (double) 6.0; + case 0x109FB: + return (double) 6.0/12.0; + case 0x1377: + case 0x324D: + case 0x10115: + case 0x102EF: + case 0x109CE: + case 0x10E6E: + case 0x11060: + case 0x111EF: + case 0x118EF: + case 0x11C68: + case 0x1D36E: + case 0x1EC7F: + return (double) 60.0; + case 0x1011E: + case 0x102F8: + case 0x109D7: + case 0x10E77: + case 0x1EC88: + return (double) 600.0; + case 0x10127: + case 0x109E0: + case 0x1EC91: + return (double) 6000.0; + case 0x10130: + case 
0x109E9: + case 0x1EC9A: + return (double) 60000.0; + case 0x109F2: + return (double) 600000.0; + case 0x0037: + case 0x0667: + case 0x06F7: + case 0x07C7: + case 0x096D: + case 0x09ED: + case 0x0A6D: + case 0x0AED: + case 0x0B6D: + case 0x0BED: + case 0x0C6D: + case 0x0CED: + case 0x0D6D: + case 0x0DED: + case 0x0E57: + case 0x0ED7: + case 0x0F27: + case 0x1047: + case 0x1097: + case 0x136F: + case 0x17E7: + case 0x17F7: + case 0x1817: + case 0x194D: + case 0x19D7: + case 0x1A87: + case 0x1A97: + case 0x1B57: + case 0x1BB7: + case 0x1C47: + case 0x1C57: + case 0x2077: + case 0x2087: + case 0x2166: + case 0x2176: + case 0x2466: + case 0x247A: + case 0x248E: + case 0x24FB: + case 0x277C: + case 0x2786: + case 0x2790: + case 0x3027: + case 0x3226: + case 0x3286: + case 0x3B4D: + case 0x4E03: + case 0x67D2: + case 0x6F06: + case 0xA627: + case 0xA6EC: + case 0xA8D7: + case 0xA907: + case 0xA9D7: + case 0xA9F7: + case 0xAA57: + case 0xABF7: + case 0xFF17: + case 0x1010D: + case 0x102E7: + case 0x104A7: + case 0x109C6: + case 0x10D37: + case 0x10E66: + case 0x11058: + case 0x1106D: + case 0x110F7: + case 0x1113D: + case 0x111D7: + case 0x111E7: + case 0x112F7: + case 0x11457: + case 0x114D7: + case 0x11657: + case 0x116C7: + case 0x11737: + case 0x118E7: + case 0x11C57: + case 0x11C60: + case 0x11D57: + case 0x11DA7: + case 0x12405: + case 0x1240C: + case 0x12412: + case 0x1241B: + case 0x12429: + case 0x12441: + case 0x12442: + case 0x12443: + case 0x1246C: + case 0x16A67: + case 0x16B57: + case 0x16E87: + case 0x1D2E7: + case 0x1D366: + case 0x1D7D5: + case 0x1D7DF: + case 0x1D7E9: + case 0x1D7F3: + case 0x1D7FD: + case 0x1E8CD: + case 0x1E957: + case 0x1EC77: + case 0x1ECA9: + case 0x1F108: + case 0x20001: + return (double) 7.0; + case 0x109FC: + return (double) 7.0/12.0; + case 0x0F2D: + return (double) 7.0/2.0; + case 0x215E: + return (double) 7.0/8.0; + case 0x1378: + case 0x324E: + case 0x10116: + case 0x102F0: + case 0x109CF: + case 0x10E6F: + case 0x11061: + 
case 0x111F0: + case 0x118F0: + case 0x11C69: + case 0x1D36F: + case 0x1EC80: + return (double) 70.0; + case 0x1011F: + case 0x102F9: + case 0x109D8: + case 0x10E78: + case 0x1EC89: + return (double) 700.0; + case 0x10128: + case 0x109E1: + case 0x1EC92: + return (double) 7000.0; + case 0x10131: + case 0x109EA: + case 0x1EC9B: + return (double) 70000.0; + case 0x109F3: + return (double) 700000.0; + case 0x0038: + case 0x0668: + case 0x06F8: + case 0x07C8: + case 0x096E: + case 0x09EE: + case 0x0A6E: + case 0x0AEE: + case 0x0B6E: + case 0x0BEE: + case 0x0C6E: + case 0x0CEE: + case 0x0D6E: + case 0x0DEE: + case 0x0E58: + case 0x0ED8: + case 0x0F28: + case 0x1048: + case 0x1098: + case 0x1370: + case 0x17E8: + case 0x17F8: + case 0x1818: + case 0x194E: + case 0x19D8: + case 0x1A88: + case 0x1A98: + case 0x1B58: + case 0x1BB8: + case 0x1C48: + case 0x1C58: + case 0x2078: + case 0x2088: + case 0x2167: + case 0x2177: + case 0x2467: + case 0x247B: + case 0x248F: + case 0x24FC: + case 0x277D: + case 0x2787: + case 0x2791: + case 0x3028: + case 0x3227: + case 0x3287: + case 0x516B: + case 0x634C: + case 0xA628: + case 0xA6ED: + case 0xA8D8: + case 0xA908: + case 0xA9D8: + case 0xA9F8: + case 0xAA58: + case 0xABF8: + case 0xFF18: + case 0x1010E: + case 0x102E8: + case 0x104A8: + case 0x109C7: + case 0x10D38: + case 0x10E67: + case 0x11059: + case 0x1106E: + case 0x110F8: + case 0x1113E: + case 0x111D8: + case 0x111E8: + case 0x112F8: + case 0x11458: + case 0x114D8: + case 0x11658: + case 0x116C8: + case 0x11738: + case 0x118E8: + case 0x11C58: + case 0x11C61: + case 0x11D58: + case 0x11DA8: + case 0x12406: + case 0x1240D: + case 0x12413: + case 0x1241C: + case 0x1242A: + case 0x12444: + case 0x12445: + case 0x1246D: + case 0x16A68: + case 0x16B58: + case 0x16E88: + case 0x1D2E8: + case 0x1D367: + case 0x1D7D6: + case 0x1D7E0: + case 0x1D7EA: + case 0x1D7F4: + case 0x1D7FE: + case 0x1E8CE: + case 0x1E958: + case 0x1EC78: + case 0x1ECAA: + case 0x1F109: + return (double) 8.0; 
+ case 0x109FD: + return (double) 8.0/12.0; + case 0x1379: + case 0x324F: + case 0x10117: + case 0x102F1: + case 0x10E70: + case 0x11062: + case 0x111F1: + case 0x118F1: + case 0x11C6A: + case 0x1D370: + case 0x1EC81: + return (double) 80.0; + case 0x10120: + case 0x102FA: + case 0x109D9: + case 0x10E79: + case 0x1EC8A: + return (double) 800.0; + case 0x10129: + case 0x109E2: + case 0x1EC93: + return (double) 8000.0; + case 0x10132: + case 0x109EB: + case 0x1EC9C: + return (double) 80000.0; + case 0x109F4: + return (double) 800000.0; + case 0x0039: + case 0x0669: + case 0x06F9: + case 0x07C9: + case 0x096F: + case 0x09EF: + case 0x0A6F: + case 0x0AEF: + case 0x0B6F: + case 0x0BEF: + case 0x0C6F: + case 0x0CEF: + case 0x0D6F: + case 0x0DEF: + case 0x0E59: + case 0x0ED9: + case 0x0F29: + case 0x1049: + case 0x1099: + case 0x1371: + case 0x17E9: + case 0x17F9: + case 0x1819: + case 0x194F: + case 0x19D9: + case 0x1A89: + case 0x1A99: + case 0x1B59: + case 0x1BB9: + case 0x1C49: + case 0x1C59: + case 0x2079: + case 0x2089: + case 0x2168: + case 0x2178: + case 0x2468: + case 0x247C: + case 0x2490: + case 0x24FD: + case 0x277E: + case 0x2788: + case 0x2792: + case 0x3029: + case 0x3228: + case 0x3288: + case 0x4E5D: + case 0x5EFE: + case 0x7396: + case 0xA629: + case 0xA6EE: + case 0xA8D9: + case 0xA909: + case 0xA9D9: + case 0xA9F9: + case 0xAA59: + case 0xABF9: + case 0xFF19: + case 0x1010F: + case 0x102E9: + case 0x104A9: + case 0x109C8: + case 0x10D39: + case 0x10E68: + case 0x1105A: + case 0x1106F: + case 0x110F9: + case 0x1113F: + case 0x111D9: + case 0x111E9: + case 0x112F9: + case 0x11459: + case 0x114D9: + case 0x11659: + case 0x116C9: + case 0x11739: + case 0x118E9: + case 0x11C59: + case 0x11C62: + case 0x11D59: + case 0x11DA9: + case 0x12407: + case 0x1240E: + case 0x12414: + case 0x1241D: + case 0x1242B: + case 0x12446: + case 0x12447: + case 0x12448: + case 0x12449: + case 0x1246E: + case 0x16A69: + case 0x16B59: + case 0x16E89: + case 0x1D2E9: + case 
0x1D368: + case 0x1D7D7: + case 0x1D7E1: + case 0x1D7EB: + case 0x1D7F5: + case 0x1D7FF: + case 0x1E8CF: + case 0x1E959: + case 0x1EC79: + case 0x1ECAB: + case 0x1F10A: + case 0x2F890: + return (double) 9.0; + case 0x109FE: + return (double) 9.0/12.0; + case 0x0F2E: + return (double) 9.0/2.0; + case 0x137A: + case 0x10118: + case 0x102F2: + case 0x10341: + case 0x10E71: + case 0x11063: + case 0x111F2: + case 0x118F2: + case 0x11C6B: + case 0x1D371: + case 0x1EC82: + return (double) 90.0; + case 0x10121: + case 0x102FB: + case 0x1034A: + case 0x109DA: + case 0x10E7A: + case 0x1EC8B: + return (double) 900.0; + case 0x1012A: + case 0x109E3: + case 0x1EC94: + return (double) 9000.0; + case 0x10133: + case 0x109EC: + case 0x1EC9D: + return (double) 90000.0; + case 0x109F5: + return (double) 900000.0; + } + return -1.0; +} + +/* Returns 1 for Unicode characters having the bidirectional + * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. + */ +int numba_PyUnicode_IsWhitespace(const Py_UCS4 ch) +{ + switch (ch) { + case 0x0009: + case 0x000A: + case 0x000B: + case 0x000C: + case 0x000D: + case 0x001C: + case 0x001D: + case 0x001E: + case 0x001F: + case 0x0020: + case 0x0085: + case 0x00A0: + case 0x1680: + case 0x2000: + case 0x2001: + case 0x2002: + case 0x2003: + case 0x2004: + case 0x2005: + case 0x2006: + case 0x2007: + case 0x2008: + case 0x2009: + case 0x200A: + case 0x2028: + case 0x2029: + case 0x202F: + case 0x205F: + case 0x3000: + return 1; + } + return 0; +} + +/* Returns 1 for Unicode characters having the line break + * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional + * type 'B', 0 otherwise. 
+ */ +int numba_PyUnicode_IsLinebreak(const Py_UCS4 ch) +{ + switch (ch) { + case 0x000A: + case 0x000B: + case 0x000C: + case 0x000D: + case 0x001C: + case 0x001D: + case 0x001E: + case 0x0085: + case 0x2028: + case 0x2029: + return 1; + } + return 0; +} + +#endif /* _UNICODETYPE_DB_H */ diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/_version.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..06d32884336b3b88567c02acaad1345637175f9a --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/_version.py @@ -0,0 +1,238 @@ +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. 
def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False):
    """Run the first launchable executable from *commands* with *args*.

    Each candidate in *commands* is tried in order.  A candidate that cannot
    be found (``ENOENT``) is skipped; any other launch error aborts.

    Returns the stripped standard output of the command (decoded to ``str``
    on Python 3), or ``None`` when no candidate could be started or the
    command exited with a non-zero status.
    """
    assert isinstance(commands, list)
    p = None
    for c in commands:
        try:
            # remember shell=False, so use git.cmd on windows, not just git
            p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE,
                                 stderr=(subprocess.PIPE if hide_stderr
                                         else None))
            break
        except EnvironmentError as e:
            if e.errno == errno.ENOENT:
                continue
            if verbose:
                print("unable to run %s" % args[0])
                print(e)
            return None
    else:
        # The loop finished without ``break``: every candidate was missing.
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None
    stdout = p.communicate()[0].strip()
    if sys.version_info[0] >= 3:
        stdout = stdout.decode()
    if p.returncode != 0:
        if verbose:
            print("unable to run %s (error)" % args[0])
        return None
    return stdout


def versions_from_parentdir(parentdir_prefix, root, verbose=False):
    """Derive the version from the name of the directory *root*.

    Source tarballs conventionally unpack into a directory that includes
    both the project name and a version string.

    Returns ``{"version": ..., "full": ""}`` when the basename of *root*
    starts with *parentdir_prefix*, otherwise ``None``.
    """
    dirname = os.path.basename(root)
    if not dirname.startswith(parentdir_prefix):
        if verbose:
            print("guessing rootdir is '%s', but '%s' doesn't start with "
                  "prefix '%s'" % (root, dirname, parentdir_prefix))
        return None
    return {"version": dirname[len(parentdir_prefix):], "full": ""}


def git_get_keywords(versionfile_abs):
    """Extract the ``git_refnames`` / ``git_full`` values from a version file.

    The code embedded in _version.py can just fetch the value of these
    keywords.  When used from setup.py, we don't want to import _version.py,
    so we do it with a regexp instead.  This function is not used from
    _version.py.

    Returns a dict with the keys ``"refnames"`` and/or ``"full"`` for the
    assignments that were found; the dict is empty when the file cannot be
    read.
    """
    keywords = {}
    try:
        # The context manager closes the file even if reading fails midway.
        with open(versionfile_abs, "r") as f:
            for line in f:
                stripped = line.strip()
                if stripped.startswith("git_refnames ="):
                    mo = re.search(r'=\s*"(.*)"', line)
                    if mo:
                        keywords["refnames"] = mo.group(1)
                if stripped.startswith("git_full ="):
                    mo = re.search(r'=\s*"(.*)"', line)
                    if mo:
                        keywords["full"] = mo.group(1)
    except EnvironmentError:
        # Best effort: an unreadable file simply yields what was collected.
        pass
    return keywords


def git_versions_from_keywords(keywords, tag_prefix, verbose=False):
    """Compute the version from expanded git-archive keywords.

    Returns ``{}`` when *keywords* is empty or still holds the unexpanded
    ``$Format...$`` placeholder.  Otherwise returns a dict with ``"version"``
    (the first sorted tag starting with *tag_prefix*, prefix removed, or
    ``"0+unknown"`` when no tag qualifies) and ``"full"`` (the stripped
    ``keywords["full"]`` value).
    """
    if not keywords:
        return {}  # keyword-finding function failed to find keywords
    refnames = keywords["refnames"].strip()
    if refnames.startswith("$Format"):
        if verbose:
            print("keywords are unexpanded, not using")
        return {}  # unexpanded, so not in an unpacked git-archive tarball
    refs = {r.strip() for r in refnames.strip("()").split(",")}
    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
    TAG = "tag: "
    tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
    if not tags:
        # Either we're using git < 1.8.3, or there really are no tags. We use
        # a heuristic: assume all version tags have a digit. The old git %d
        # expansion behaves like git log --decorate=short and strips out the
        # refs/heads/ and refs/tags/ prefixes that would let us distinguish
        # between branches and tags. By ignoring refnames without digits, we
        # filter out many common branch names like "release" and
        # "stabilization", as well as "HEAD" and "master".
        tags = {r for r in refs if re.search(r'\d', r)}
        if verbose:
            print("discarding '%s', no digits" % ",".join(refs-tags))
    if verbose:
        print("likely tags: %s" % ",".join(sorted(tags)))
    for ref in sorted(tags):
        # sorting will prefer e.g. "2.0" over "2.0rc1"
        if ref.startswith(tag_prefix):
            r = ref[len(tag_prefix):]
            if verbose:
                print("picking %s" % r)
            return {"version": r,
                    "full": keywords["full"].strip()}
    # no suitable tags, so version is "0+unknown", but full hex is still there
    if verbose:
        print("no suitable tags, using unknown + full revision id")
    return {"version": "0+unknown",
            "full": keywords["full"].strip()}


def git_parse_vcs_describe(git_describe, tag_prefix, verbose=False):
    """Turn ``git describe`` output into a ``(version, dirty)`` pair.

    *git_describe* has the form ``TAG-NUM-gHEX[-dirty]`` or ``HEX[-dirty]``;
    TAG might have hyphens.

    The returned version is ``TAG[+NUM.gHEX[.dirty]]``, a ``"0+untagged..."``
    or ``"0+unparseable..."`` placeholder, or ``None`` when the tag does not
    start with *tag_prefix*.  ``dirty`` is True when the input ended in
    ``-dirty``.
    """
    # dirty
    dirty = git_describe.endswith("-dirty")
    if dirty:
        git_describe = git_describe[:git_describe.rindex("-dirty")]
    dirty_suffix = ".dirty" if dirty else ""

    # now we have TAG-NUM-gHEX or HEX

    if "-" not in git_describe:  # just HEX
        return "0+untagged.g"+git_describe+dirty_suffix, dirty

    # just TAG-NUM-gHEX
    mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
    if not mo:
        # unparseable. Maybe git-describe is misbehaving?
        return "0+unparseable"+dirty_suffix, dirty

    # tag
    full_tag = mo.group(1)
    if not full_tag.startswith(tag_prefix):
        if verbose:
            fmt = "tag '%s' doesn't start with prefix '%s'"
            print(fmt % (full_tag, tag_prefix))
        return None, dirty
    tag = full_tag[len(tag_prefix):]

    # distance: number of commits since tag
    distance = int(mo.group(2))

    # commit: short hex revision ID
    commit = mo.group(3)

    # now build up version string, with post-release "local version
    # identifier". Our goal: TAG[+NUM.gHEX[.dirty]] . Note that if you get a
    # tagged build and then dirty it, you'll get TAG+0.gHEX.dirty . So you
    # can always test version.endswith(".dirty").
    version = tag
    if distance or dirty:
        version += "+%d.g%s" % (distance, commit) + dirty_suffix

    return version, dirty


def git_versions_from_vcs(tag_prefix, root, verbose=False):
    """Compute the version by running ``git`` in the source tree at *root*.

    This only gets called if the git-archive 'subst' keywords were *not*
    expanded, and _version.py hasn't already been rewritten with a short
    version string, meaning we're inside a checked out source tree.

    Returns ``{}`` when *root* has no ``.git`` entry or a git command fails,
    otherwise ``{"version": ..., "full": FULLHEX[.dirty]}``.
    """
    if not os.path.exists(os.path.join(root, ".git")):
        if verbose:
            print("no .git in %s" % root)
        return {}  # get_versions() will try next method

    GITS = ["git"]
    if sys.platform == "win32":
        GITS = ["git.cmd", "git.exe"]
    # if there is a tag, this yields TAG-NUM-gHEX[-dirty]
    # if there are no tags, this yields HEX[-dirty] (no NUM)
    stdout = run_command(GITS, ["describe", "--tags", "--dirty",
                                "--always", "--long"],
                         cwd=root)
    # --long was added in git-1.5.5
    if stdout is None:
        return {}  # try next method
    # NOTE(review): ``version`` is None when the described tag lacks
    # tag_prefix; the dict is still returned in that case.
    version, dirty = git_parse_vcs_describe(stdout, tag_prefix, verbose)

    # build "full", which is FULLHEX[.dirty]
    stdout = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
    if stdout is None:
        return {}
    full = stdout.strip()
    if dirty:
        full += ".dirty"

    return {"version": version, "full": full}
+ for i in versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return default + + return (git_versions_from_vcs(tag_prefix, root, verbose) + or versions_from_parentdir(parentdir_prefix, root, verbose) + or default) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/capsulethunk.h b/cv/3d_detection/centerpoint/pytorch/numba/numba/capsulethunk.h new file mode 100644 index 0000000000000000000000000000000000000000..4bdf5b41facebcee3a5c264904c038c31fb330da --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/capsulethunk.h @@ -0,0 +1,108 @@ +/** + + This is a modified version of capsulethunk.h for use in llvmpy + +**/ + +#ifndef __CAPSULETHUNK_H +#define __CAPSULETHUNK_H + +#if ( (PY_VERSION_HEX < 0x02070000) \ + || ((PY_VERSION_HEX >= 0x03000000) \ + && (PY_VERSION_HEX < 0x03010000)) ) + +//#define Assert(X) do_assert(!!(X), #X, __FILE__, __LINE__) +#define Assert(X) + +static +void do_assert(int cond, const char * msg, const char *file, unsigned line){ + if (!cond) { + fprintf(stderr, "Assertion failed %s:%d\n%s\n", file, line, msg); + exit(1); + } +} + +typedef void (*PyCapsule_Destructor)(PyObject *); + +struct FakePyCapsule_Desc { + const char *name; + void *context; + PyCapsule_Destructor dtor; + PyObject *parent; + + FakePyCapsule_Desc() : name(0), context(0), dtor(0) {} +}; + +static +FakePyCapsule_Desc* get_pycobj_desc(PyObject *p){ + void *desc = ((PyCObject*)p)->desc; + Assert(desc && "No desc in PyCObject"); + return static_cast(desc); +} + +static +void pycobject_pycapsule_dtor(void *p, void *desc){ + Assert(desc); + Assert(p); + FakePyCapsule_Desc *fpc_desc = static_cast(desc); + Assert(fpc_desc->parent); + Assert(PyCObject_Check(fpc_desc->parent)); + fpc_desc->dtor(static_cast(fpc_desc->parent)); + delete fpc_desc; +} + +static +PyObject* PyCapsule_New(void* ptr, const char *name, PyCapsule_Destructor dtor) +{ + FakePyCapsule_Desc *desc = new FakePyCapsule_Desc; + desc->name = name; + desc->context 
= NULL; + desc->dtor = dtor; + PyObject *p = PyCObject_FromVoidPtrAndDesc(ptr, desc, + pycobject_pycapsule_dtor); + desc->parent = p; + return p; +} + +static +int PyCapsule_CheckExact(PyObject *p) +{ + return PyCObject_Check(p); +} + +static +void* PyCapsule_GetPointer(PyObject *p, const char *name) +{ + Assert(PyCapsule_CheckExact(p)); + if (strcmp(get_pycobj_desc(p)->name, name) != 0) { + PyErr_SetString(PyExc_ValueError, "Invalid PyCapsule object"); + } + return PyCObject_AsVoidPtr(p); +} + +static +void* PyCapsule_GetContext(PyObject *p) +{ + Assert(p); + Assert(PyCapsule_CheckExact(p)); + return get_pycobj_desc(p)->context; +} + +static +int PyCapsule_SetContext(PyObject *p, void *context) +{ + Assert(PyCapsule_CheckExact(p)); + get_pycobj_desc(p)->context = context; + return 0; +} + +static +const char * PyCapsule_GetName(PyObject *p) +{ +// Assert(PyCapsule_CheckExact(p)); + return get_pycobj_desc(p)->name; +} + +#endif /* #if PY_VERSION_HEX < 0x02070000 */ + +#endif /* __CAPSULETHUNK_H */ diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/__init__.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0ff54a4d6820fae84f4bda25d4cd577c1389fea8 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/__init__.py @@ -0,0 +1,23 @@ +""" +Utilities for getting information about Numba C extensions +""" + +import os + + +def get_extension_libs(): + """Return the .c files in the `numba.cext` directory. + """ + libs = [] + base = get_path() + for fn in os.listdir(base): + if fn.endswith('.c'): + fn = os.path.join(base, fn) + libs.append(fn) + return libs + + +def get_path(): + """Returns the path to the directory for `numba.cext`. 
+ """ + return os.path.abspath(os.path.join(os.path.dirname(__file__))) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/cext.h b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/cext.h new file mode 100644 index 0000000000000000000000000000000000000000..88188dc8530c07ba28341aff4b73ca0dddd087d2 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/cext.h @@ -0,0 +1,18 @@ +#ifndef NUMBA_EXTENSION_HELPER_H_ +#define NUMBA_EXTENSION_HELPER_H_ + +#include "Python.h" +#include "../_numba_common.h" + +/* Define all runtime-required symbols in this C module, but do not + export them outside the shared library if possible. */ +#define NUMBA_EXPORT_FUNC(_rettype) VISIBILITY_HIDDEN _rettype +#define NUMBA_EXPORT_DATA(_vartype) VISIBILITY_HIDDEN _vartype + +NUMBA_EXPORT_FUNC(Py_ssize_t) +aligned_size(Py_ssize_t sz); + +#include "dictobject.h" +#include "listobject.h" + +#endif // end NUMBA_EXTENSION_HELPER_H_ diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/dictobject.c b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/dictobject.c new file mode 100644 index 0000000000000000000000000000000000000000..37db7593b7be0556b2e8ada64a408aa17e827045 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/dictobject.c @@ -0,0 +1,1191 @@ +/* The following is adapted from CPython3.7. +The exact commit is: + +- https://github.com/python/cpython/blob/44467e8ea4cea390b0718702291b4cfe8ddd67ed/Objects/dictobject.c + + +*/ + +/* Dictionary object implementation using a hash table */ + +/* The distribution includes a separate file, Objects/dictnotes.txt, + describing explorations into dictionary design and optimization. + It covers typical dictionary use patterns, the parameters for + tuning dictionaries, and several ideas for possible optimizations. +*/ + +/* PyDictKeysObject + +This implements the dictionary's hashtable. + +As of Python 3.6, this is compact and ordered. 
Basic idea is described here: +* https://mail.python.org/pipermail/python-dev/2012-December/123028.html +* https://morepypy.blogspot.com/2015/01/faster-more-memory-efficient-and-more.html + +layout: + ++---------------+ +| dk_refcnt | +| dk_size | +| dk_lookup | +| dk_usable | +| dk_nentries | ++---------------+ +| dk_indices | +| | ++---------------+ +| dk_entries | +| | ++---------------+ + +dk_indices is the actual hashtable. It holds an index into entries, or DKIX_EMPTY(-1) +or DKIX_DUMMY(-2). +Size of indices is dk_size. The type of each index in indices varies with dk_size: + +* int8 for dk_size <= 128 +* int16 for 256 <= dk_size <= 2**15 +* int32 for 2**16 <= dk_size <= 2**31 +* int64 for 2**32 <= dk_size + +dk_entries is an array of PyDictKeyEntry. Its size is USABLE_FRACTION(dk_size). +DK_ENTRIES(dk) can be used to get a pointer to the entries. + +NOTE: Since negative values are used for DKIX_EMPTY and DKIX_DUMMY, the type of a +dk_indices entry is a signed integer, and int16 is used for a table whose +dk_size == 256. +*/ + + +/* +The DictObject can be in one of two forms. + +Either: + A combined table: + ma_values == NULL, dk_refcnt == 1. + Values are stored in the me_value field of the PyDictKeysObject. +Or: + + (Numba dev notes: split table logic is removed) + + A split table: + ma_values != NULL, dk_refcnt >= 1 + Values are stored in the ma_values array. + Only string (unicode) keys are allowed. + All dicts sharing the same keys must have the same insertion order. + +There are four kinds of slots in the table (slot is index, and +DK_ENTRIES(keys)[index] if index >= 0): + +1. Unused. index == DKIX_EMPTY + Does not hold an active (key, value) pair now and never did. Unused can + transition to Active upon key insertion. This is each slot's initial state. + +2. Active. index >= 0, me_key != NULL and me_value != NULL + Holds an active (key, value) pair. Active can transition to Dummy or + Pending upon key deletion (for combined and split tables respectively).
+ This is the only case in which me_value != NULL. + +3. Dummy. index == DKIX_DUMMY (combined only) + Previously held an active (key, value) pair, but that was deleted and an + active pair has not yet overwritten the slot. Dummy can transition to + Active upon key insertion. Dummy slots cannot be made Unused again + else the probe sequence in case of collision would have no way to know + they were once active. + +4. Pending. index >= 0, key != NULL, and value == NULL (split only) + Not yet inserted in split-table. +*/ + +/* +Preserving insertion order + +It's simple for a combined table. Since dk_entries is mostly append only, we can +get insertion order by just iterating dk_entries. + +One exception is .popitem(). It removes the last item in dk_entries and decrements +dk_nentries to achieve amortized O(1). Since DKIX_DUMMY entries remain in +dk_indices, we can't increment dk_usable even though dk_nentries is +decremented. + +In a split table, inserting into a pending entry is allowed only for dk_entries[ix] +where ix == mp->ma_used. Inserting at any other index, or deleting an item, +converts the dict to a combined table. +*/ + + +/* D_MINSIZE (adapted from PyDict_MINSIZE) + * is the starting size for any new dict. + * 8 allows dicts with no more than 5 active entries; experiments suggested + * this suffices for the majority of dicts (consisting mostly of usually-small + * dicts created to pass keyword arguments). + * Making this 8, rather than 4, reduces the number of resizes for most + * dictionaries, without any significant extra memory use.
+ */ +#define D_MINSIZE 8 + +#include "dictobject.h" + + +#if defined(_MSC_VER) +# if _MSC_VER <= 1900 /* Visual Studio 2014 */ + typedef __int8 int8_t; + typedef __int16 int16_t; + typedef __int32 int32_t; + typedef __int64 int64_t; +# endif + /* Use _alloca() to dynamically allocate on the stack on MSVC */ + #define STACK_ALLOC(Type, Name, Size) Type * const Name = _alloca(Size); +#else + #define STACK_ALLOC(Type, Name, Size) Type Name[Size]; +#endif + + +/*[clinic input] +class dict "PyDictObject *" "&PyDict_Type" +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=f157a5a0ce9589d6]*/ + + +/* +To ensure the lookup algorithm terminates, there must be at least one Unused +slot (NULL key) in the table. +To avoid slowing down lookups on a near-full table, we resize the table when +it's USABLE_FRACTION (currently two-thirds) full. +*/ + +#define PERTURB_SHIFT 5 + +/* +Major subtleties ahead: Most hash schemes depend on having a "good" hash +function, in the sense of simulating randomness. Python doesn't: its most +important hash functions (for ints) are very regular in common +cases: + + >>>[hash(i) for i in range(4)] + [0, 1, 2, 3] + +This isn't necessarily bad! To the contrary, in a table of size 2**i, taking +the low-order i bits as the initial table index is extremely fast, and there +are no collisions at all for dicts indexed by a contiguous range of ints. So +this gives better-than-random behavior in common cases, and that's very +desirable. + +OTOH, when collisions occur, the tendency to fill contiguous slices of the +hash table makes a good collision resolution strategy crucial. Taking only +the last i bits of the hash code is also vulnerable: for example, consider +the list [i << 16 for i in range(20000)] as a set of keys. Since ints are +their own hash codes, and this fits in a dict of size 2**15, the last 15 bits + of every hash code are all 0: they *all* map to the same table index. 
+ +But catering to unusual cases should not slow the usual ones, so we just take +the last i bits anyway. It's up to collision resolution to do the rest. If +we *usually* find the key we're looking for on the first try (and, it turns +out, we usually do -- the table load factor is kept under 2/3, so the odds +are solidly in our favor), then it makes best sense to keep the initial index +computation dirt cheap. + +The first half of collision resolution is to visit table indices via this +recurrence: + + j = ((5*j) + 1) mod 2**i + +For any initial j in range(2**i), repeating that 2**i times generates each +int in range(2**i) exactly once (see any text on random-number generation for +proof). By itself, this doesn't help much: like linear probing (setting +j += 1, or j -= 1, on each loop trip), it scans the table entries in a fixed +order. This would be bad, except that's not the only thing we do, and it's +actually *good* in the common cases where hash keys are consecutive. In an +example that's really too small to make this entirely clear, for a table of +size 2**3 the order of indices is: + + 0 -> 1 -> 6 -> 7 -> 4 -> 5 -> 2 -> 3 -> 0 [and here it's repeating] + +If two things come in at index 5, the first place we look after is index 2, +not 6, so if another comes in at index 6 the collision at 5 didn't hurt it. +Linear probing is deadly in this case because there the fixed probe order +is the *same* as the order consecutive keys are likely to arrive. But it's +extremely unlikely hash codes will follow a 5*j+1 recurrence by accident, +and certain that consecutive hash codes do not. + +The other half of the strategy is to get the other bits of the hash code +into play. 
This is done by initializing a (unsigned) vrbl "perturb" to the +full hash code, and changing the recurrence to: + + perturb >>= PERTURB_SHIFT; + j = (5*j) + 1 + perturb; + use j % 2**i as the next table index; + +Now the probe sequence depends (eventually) on every bit in the hash code, +and the pseudo-scrambling property of recurring on 5*j+1 is more valuable, +because it quickly magnifies small differences in the bits that didn't affect +the initial index. Note that because perturb is unsigned, if the recurrence +is executed often enough perturb eventually becomes and remains 0. At that +point (very rarely reached) the recurrence is on (just) 5*j+1 again, and +that's certain to find an empty slot eventually (since it generates every int +in range(2**i), and we make sure there's always at least one empty slot). + +Selecting a good value for PERTURB_SHIFT is a balancing act. You want it +small so that the high bits of the hash code continue to affect the probe +sequence across iterations; but you want it large so that in really bad cases +the high-order hash bits have an effect on early iterations. 5 was "the +best" in minimizing total collisions across experiments Tim Peters ran (on +both normal and pathological cases), but 4 and 6 weren't significantly worse. + +Historical: Reimer Behrends contributed the idea of using a polynomial-based +approach, using repeated multiplication by x in GF(2**n) where an irreducible +polynomial for each table size was chosen such that x was a primitive root. +Christian Tismer later extended that to use division by x instead, as an +efficient way to get the high bits of the hash code into play. 
This scheme +also gave excellent collision statistics, but was more expensive: two +if-tests were required inside the loop; computing "the next" index took about +the same number of operations but without as much potential parallelism +(e.g., computing 5*j can go on at the same time as computing 1+perturb in the +above, and then shifting perturb can be done while the table index is being +masked); and the PyDictObject struct required a member to hold the table's +polynomial. In Tim's experiments the current scheme ran faster, produced +equally good collision statistics, needed less code & used less memory. + +*/ + +#define DKIX_EMPTY (-1) +#define DKIX_DUMMY (-2) /* Used internally */ +#define DKIX_ERROR (-3) + +typedef enum { + OK = 0, + OK_REPLACED = 1, + ERR_NO_MEMORY = -1, + ERR_DICT_MUTATED = -2, + ERR_ITER_EXHAUSTED = -3, + ERR_DICT_EMPTY = -4, + ERR_CMP_FAILED = -5, +} Status; + + +#ifndef NDEBUG +static +int mem_cmp_zeros(void *obj, size_t n){ + int diff = 0; + char *mem = obj; + char *it; + for (it = mem; it < mem + n; ++it) { + if (*it != 0) diff += 1; + } + return diff; +} +#endif + +#define D_MASK(dk) ((dk)->size-1) +#define D_GROWTH_RATE(d) ((d)->used*3) + +static int +ix_size(Py_ssize_t size) { + if ( size < 0xff ) return 1; + if ( size < 0xffff ) return 2; + if ( size < 0xffffffff ) return 4; + return sizeof(int64_t); +} + +#ifndef NDEBUG +/* NOTE: This function is only used in assert()s */ +/* Align pointer *ptr* to pointer size */ +static void* +aligned_pointer(void *ptr) { + return (void*)aligned_size((size_t)ptr); +} +#endif + +/* lookup indices. 
returns DKIX_EMPTY, DKIX_DUMMY, or ix >=0 */ +static Py_ssize_t +get_index(NB_DictKeys *dk, Py_ssize_t i) +{ + Py_ssize_t s = dk->size; + Py_ssize_t ix; + + if (s <= 0xff) { + int8_t *indices = (int8_t*)(dk->indices); + assert (i < dk->size); + ix = indices[i]; + } + else if (s <= 0xffff) { + int16_t *indices = (int16_t*)(dk->indices); + ix = indices[i]; + } +#if SIZEOF_VOID_P > 4 + else if (s > 0xffffffff) { + int64_t *indices = (int64_t*)(dk->indices); + ix = indices[i]; + } +#endif + else { + int32_t *indices = (int32_t*)(dk->indices); + ix = indices[i]; + } + assert(ix >= DKIX_DUMMY); + return ix; +} + +/* write to indices. */ +static void +set_index(NB_DictKeys *dk, Py_ssize_t i, Py_ssize_t ix) +{ + Py_ssize_t s = dk->size; + + assert(ix >= DKIX_DUMMY); + + if (s <= 0xff) { + int8_t *indices = (int8_t*)(dk->indices); + assert(ix <= 0x7f); + indices[i] = (char)ix; + } + else if (s <= 0xffff) { + int16_t *indices = (int16_t*)(dk->indices); + assert(ix <= 0x7fff); + indices[i] = (int16_t)ix; + } +#if SIZEOF_VOID_P > 4 + else if (s > 0xffffffff) { + int64_t *indices = (int64_t*)(dk->indices); + indices[i] = ix; + } +#endif + else { + int32_t *indices = (int32_t*)(dk->indices); + assert(ix <= 0x7fffffff); + indices[i] = (int32_t)ix; + } +} + + +/* USABLE_FRACTION is the maximum dictionary load. + * Increasing this ratio makes dictionaries more dense resulting in more + * collisions. Decreasing it improves sparseness at the expense of spreading + * indices over more cache lines and at the cost of total memory consumed. + * + * USABLE_FRACTION must obey the following: + * (0 < USABLE_FRACTION(n) < n) for all n >= 2 + * + * USABLE_FRACTION should be quick to calculate. + * Fractions around 1/2 to 2/3 seem to work well in practice. + */ +#define USABLE_FRACTION(n) (((n) << 1)/3) + +/* Alternative fraction that is otherwise close enough to 2n/3 to make + * little difference. 8 * 2/3 == 8 * 5/8 == 5. 16 * 2/3 == 16 * 5/8 == 10. + * 32 * 2/3 = 21, 32 * 5/8 = 20. 
+ * Its advantage is that it is faster to compute on machines with slow division. + * #define USABLE_FRACTION(n) (((n) >> 1) + ((n) >> 2) - ((n) >> 3)) + */ + +/* GROWTH_RATE. Growth rate upon hitting maximum load. + * Currently set to used*3. + * This means that dicts double in size when growing without deletions, + * but have more head room when the number of deletions is on a par with the + * number of insertions. See also bpo-17563 and bpo-33205. + * + * GROWTH_RATE was set to used*4 up to version 3.2. + * GROWTH_RATE was set to used*2 in version 3.3.0 + * GROWTH_RATE was set to used*2 + capacity/2 in 3.4.0-3.6.0. + */ +#define GROWTH_RATE(d) ((d)->ma_used*3) + + +static NB_DictEntry* +get_entry(NB_DictKeys *dk, Py_ssize_t idx) { + Py_ssize_t offset; + char *ptr; + + assert (idx < dk->size); + offset = idx * dk->entry_size; + ptr = dk->indices + dk->entry_offset + offset; + return (NB_DictEntry*)ptr; +} + +static void +zero_key(NB_DictKeys *dk, char *data){ + memset(data, 0, dk->key_size); +} + +static void +zero_val(NB_DictKeys *dk, char *data){ + memset(data, 0, dk->val_size); +} + +static void +copy_key(NB_DictKeys *dk, char *dst, const char *src){ + memcpy(dst, src, dk->key_size); +} + +static void +copy_val(NB_DictKeys *dk, char *dst, const char *src){ + memcpy(dst, src, dk->val_size); +} + +/* Returns -1 for error; 0 for not equal; 1 for equal */ +static int +key_equal(NB_DictKeys *dk, const char *lhs, const char *rhs) { + if ( dk->methods.key_equal ) { + return dk->methods.key_equal(lhs, rhs); + } else { + return memcmp(lhs, rhs, dk->key_size) == 0; + } +} + +static char * +entry_get_key(NB_DictKeys *dk, NB_DictEntry* entry) { + char * out = entry->keyvalue; + assert (out == aligned_pointer(out)); + return out; +} + +static char * +entry_get_val(NB_DictKeys *dk, NB_DictEntry* entry) { + char * out = entry_get_key(dk, entry) + aligned_size(dk->key_size); + assert (out == aligned_pointer(out)); + return out; +} + +static void +dk_incref_key(NB_DictKeys 
*dk, const char *key) { + if ( dk->methods.key_incref ) { + dk->methods.key_incref(key); + } +} + +static void +dk_decref_key(NB_DictKeys *dk, const char *key) { + if ( dk->methods.key_decref ) { + dk->methods.key_decref(key); + } +} + +static void +dk_incref_val(NB_DictKeys *dk, const char *val) { + if ( dk->methods.value_incref ) { + dk->methods.value_incref(val); + } +} + +static void +dk_decref_val(NB_DictKeys *dk, const char *val) { + if ( dk->methods.value_decref ) { + dk->methods.value_decref(val); + } +} + + +void +numba_dictkeys_free(NB_DictKeys *dk) { + /* Clear all references from the entries */ + Py_ssize_t i; + NB_DictEntry *ep; + + for (i = 0; i < dk->nentries; i++) { + ep = get_entry(dk, i); + if (ep->hash != DKIX_EMPTY) { + dk_decref_key(dk, entry_get_key(dk, ep)); + dk_decref_val(dk, entry_get_val(dk, ep)); + } + } + /* Deallocate */ + free(dk); +} + +void +numba_dict_free(NB_Dict *d) { + numba_dictkeys_free(d->keys); + free(d); +} + +Py_ssize_t +numba_dict_length(NB_Dict *d) { + return d->used; +} + +/* Allocate new dictionary keys + +Adapted from CPython's new_keys_object(). 
+*/ +int +numba_dictkeys_new(NB_DictKeys **out, Py_ssize_t size, Py_ssize_t key_size, Py_ssize_t val_size) { + Py_ssize_t usable = USABLE_FRACTION(size); + Py_ssize_t index_size = ix_size(size); + Py_ssize_t entry_size = aligned_size(sizeof(NB_DictEntry) + aligned_size(key_size) + aligned_size(val_size)); + Py_ssize_t entry_offset = aligned_size(index_size * size); + Py_ssize_t alloc_size = sizeof(NB_DictKeys) + entry_offset + entry_size * usable; + + NB_DictKeys *dk = malloc(aligned_size(alloc_size)); + if (!dk) return ERR_NO_MEMORY; + + assert ( size >= D_MINSIZE ); + + dk->size = size; + dk->usable = usable; + dk->nentries = 0; + dk->key_size = key_size; + dk->val_size = val_size; + dk->entry_offset = entry_offset; + dk->entry_size = entry_size; + + assert (aligned_pointer(dk->indices) == dk->indices ); + /* Ensure that the method table is all nulls */ + memset(&dk->methods, 0x00, sizeof(type_based_methods_table)); + /* Ensure hash is (-1) for empty entry */ + memset(dk->indices, 0xff, entry_offset + entry_size * usable); + + *out = dk; + return OK; +} + + +/* Allocate new dictionary */ +int +numba_dict_new(NB_Dict **out, Py_ssize_t size, Py_ssize_t key_size, Py_ssize_t val_size) { + NB_DictKeys* dk; + NB_Dict *d; + int status = numba_dictkeys_new(&dk, size, key_size, val_size); + if (status != OK) return status; + + d = malloc(sizeof(NB_Dict)); + if (!d) { + numba_dictkeys_free(dk); + return ERR_NO_MEMORY; + } + + d->used = 0; + d->keys = dk; + *out = d; + return OK; +} + +/* +Adapted from CPython lookdict_index(). 
+ +Search index of hash table from offset of entry table +*/ +static Py_ssize_t +lookdict_index(NB_DictKeys *dk, Py_hash_t hash, Py_ssize_t index) +{ + size_t mask = D_MASK(dk); + size_t perturb = (size_t)hash; + size_t i = (size_t)hash & mask; + + for (;;) { + Py_ssize_t ix = get_index(dk, i); + if (ix == index) { + return i; + } + if (ix == DKIX_EMPTY) { + return DKIX_EMPTY; + } + perturb >>= PERTURB_SHIFT; + i = mask & (i*5 + perturb + 1); + } + assert(0 && "unreachable"); +} + +/* + +Adapted from the CPython3.7 lookdict(). + +The basic lookup function used by all operations. +This is based on Algorithm D from Knuth Vol. 3, Sec. 6.4. +Open addressing is preferred over chaining since the link overhead for +chaining would be substantial (100% with typical malloc overhead). + +The initial probe index is computed as hash mod the table size. Subsequent +probe indices are computed as explained earlier. + +All arithmetic on hash should ignore overflow. + +The details in this version are due to Tim Peters, building on many past +contributions by Reimer Behrends, Jyrki Alakuijala, Vladimir Marangozov and +Christian Tismer. + +lookdict() is general-purpose, and may return DKIX_ERROR if (and only if) a +comparison raises an exception. +lookdict_unicode() below is specialized to string keys, comparison of which can +never raise an exception; that function can never return DKIX_ERROR when key +is string. Otherwise, it falls back to lookdict(). +lookdict_unicode_nodummy is further specialized for string keys that cannot be +the value. +For both, when the key isn't found a DKIX_EMPTY is returned. 
+*/ +Py_ssize_t +numba_dict_lookup(NB_Dict *d, const char *key_bytes, Py_hash_t hash, char *oldval_bytes) +{ + NB_DictKeys *dk = d->keys; + size_t mask = D_MASK(dk); + size_t perturb = hash; + size_t i = (size_t)hash & mask; + + for (;;) { + Py_ssize_t ix = get_index(dk, i); + if (ix == DKIX_EMPTY) { + zero_val(dk, oldval_bytes); + return ix; + } + if (ix >= 0) { + NB_DictEntry *ep = get_entry(dk, ix); + const char *startkey = NULL; + if (ep->hash == hash) { + int cmp; + + startkey = entry_get_key(dk, ep); + cmp = key_equal(dk, startkey, key_bytes); + if (cmp < 0) { + // error'ed in comparison + memset(oldval_bytes, 0, dk->val_size); + return DKIX_ERROR; + } + if (cmp > 0) { + // key is equal; retrieve the value. + copy_val(dk, oldval_bytes, entry_get_val(dk, ep)); + return ix; + } + } + } + perturb >>= PERTURB_SHIFT; + i = (i*5 + perturb + 1) & mask; + } + assert(0 && "unreachable"); +} + + +/* Internal function to find slot for an item from its hash + when it is known that the key is not present in the dict. + + The dict must be combined. */ +static Py_ssize_t +find_empty_slot(NB_DictKeys *dk, Py_hash_t hash){ + size_t mask; + size_t i; + Py_ssize_t ix; + size_t perturb; + + assert(dk != NULL); + + mask = D_MASK(dk); + i = hash & mask; + ix = get_index(dk, i); + for (perturb = hash; ix >= 0;) { + perturb >>= PERTURB_SHIFT; + i = (i*5 + perturb + 1) & mask; + ix = get_index(dk, i); + } + return i; +} + +static int +insertion_resize(NB_Dict *d) +{ + return numba_dict_resize(d, D_GROWTH_RATE(d)); +} + +int +numba_dict_insert( + NB_Dict *d, + const char *key_bytes, + Py_hash_t hash, + const char *val_bytes, + char *oldval_bytes + ) +{ + + NB_DictKeys *dk = d->keys; + + Py_ssize_t ix = numba_dict_lookup(d, key_bytes, hash, oldval_bytes); + if (ix == DKIX_ERROR) { + // exception in key comparison in lookup. 
+ return ERR_CMP_FAILED; + } + + if (ix == DKIX_EMPTY) { + /* Insert into new slot */ + Py_ssize_t hashpos; + NB_DictEntry *ep; + + if (dk->usable <= 0) { + /* Need to resize */ + if (insertion_resize(d) != OK) + return ERR_NO_MEMORY; + else + dk = d->keys; // reload + } + hashpos = find_empty_slot(dk, hash); + ep = get_entry(dk, dk->nentries); + set_index(dk, hashpos, dk->nentries); + copy_key(dk, entry_get_key(dk, ep), key_bytes); + assert ( hash != -1 ); + ep->hash = hash; + copy_val(dk, entry_get_val(dk, ep), val_bytes); + + /* incref */ + dk_incref_key(dk, key_bytes); + dk_incref_val(dk, val_bytes); + + d->used += 1; + dk->usable -= 1; + dk->nentries += 1; + assert (dk->usable >= 0); + return OK; + } else { + /* Replace existing value in the slot at ix */ + /* decref old value */ + dk_decref_val(dk, oldval_bytes); + // Replace the previous value + copy_val(dk, entry_get_val(dk, get_entry(dk, ix)), val_bytes); + + /* incref */ + dk_incref_val(dk, val_bytes); + return OK_REPLACED; + } +} + +/* +Adapted from build_indices(). +Internal routine used by dictresize() to build a hashtable of entries. +*/ +void +build_indices(NB_DictKeys *keys, Py_ssize_t n) { + size_t mask = (size_t)D_MASK(keys); + Py_ssize_t ix; + for (ix = 0; ix != n; ix++) { + size_t perturb; + Py_hash_t hash = get_entry(keys, ix)->hash; + size_t i = hash & mask; + for (perturb = hash; get_index(keys, i) != DKIX_EMPTY;) { + perturb >>= PERTURB_SHIFT; + i = mask & (i*5 + perturb + 1); + } + set_index(keys, i, ix); + } +} + +/* + +Adapted from CPython dictresize(). + +Restructure the table by allocating a new table and reinserting all +items again. When entries have been deleted, the new table may +actually be smaller than the old one. +If a table is split (its keys and hashes are shared, its values are not), +then the values are temporarily copied into the table, it is resized as +a combined table, then the me_value slots in the old table are NULLed out. 
+After resizing a table is always combined, +but can be resplit by make_keys_shared(). +*/ +int +numba_dict_resize(NB_Dict *d, Py_ssize_t minsize) { + Py_ssize_t newsize, numentries; + NB_DictKeys *oldkeys; + int status; + + /* Find the smallest table size > minused. */ + for (newsize = D_MINSIZE; + newsize < minsize && newsize > 0; + newsize <<= 1) + ; + if (newsize <= 0) { + return ERR_NO_MEMORY; + } + oldkeys = d->keys; + + /* NOTE: Current odict checks mp->ma_keys to detect resize happen. + * So we can't reuse oldkeys even if oldkeys->dk_size == newsize. + * TODO: Try reusing oldkeys when reimplement odict. + */ + + /* Allocate a new table. */ + status = numba_dictkeys_new( + &d->keys, newsize, oldkeys->key_size, oldkeys->val_size + ); + if (status != OK) { + d->keys = oldkeys; + return status; + } + // New table must be large enough. + assert(d->keys->usable >= d->used); + // Copy method table + memcpy(&d->keys->methods, &oldkeys->methods, sizeof(type_based_methods_table)); + + numentries = d->used; + + if (oldkeys->nentries == numentries) { + NB_DictEntry *oldentries, *newentries; + + oldentries = get_entry(oldkeys, 0); + newentries = get_entry(d->keys, 0); + memcpy(newentries, oldentries, numentries * oldkeys->entry_size); + // to avoid decref + memset(oldentries, 0xff, numentries * oldkeys->entry_size); + } + else { + Py_ssize_t i; + size_t epi = 0; + for (i=0; ihash == (-1) hash means it is empty + + Here, we skip until a non empty entry is encountered. 
+ */ + while( get_entry(oldkeys, epi)->hash == DKIX_EMPTY ) { + assert( mem_cmp_zeros(entry_get_val(oldkeys, get_entry(oldkeys, epi)), oldkeys->val_size) == 0 ); + epi += 1; + } + memcpy( + get_entry(d->keys, i), + get_entry(oldkeys, epi), + oldkeys->entry_size + ); + get_entry(oldkeys, epi)->hash = DKIX_EMPTY; // to avoid decref + epi += 1; + + } + + } + numba_dictkeys_free(oldkeys); + + build_indices(d->keys, numentries); + d->keys->usable -= numentries; + d->keys->nentries = numentries; + return OK; +} + +/* + Adapted from CPython delitem_common + */ +int +numba_dict_delitem(NB_Dict *d, Py_hash_t hash, Py_ssize_t ix) +{ + Py_ssize_t hashpos; + NB_DictEntry *ep; + NB_DictKeys *dk = d->keys; + + hashpos = lookdict_index(dk, hash, ix); + assert(hashpos >= 0); + + d->used -= 1; + ep = get_entry(dk, ix); + set_index(dk, hashpos, DKIX_DUMMY); + + /* decref */ + dk_decref_key(dk, entry_get_key(dk, ep)); + dk_decref_val(dk, entry_get_val(dk, ep)); + + /* zero the entries */ + zero_key(dk, entry_get_key(dk, ep)); + zero_val(dk, entry_get_val(dk, ep)); + ep->hash = DKIX_EMPTY; // to mark it as empty; + + return OK; +} + + +/** + * Adapted from dict_popitem + * + */ +int +numba_dict_popitem(NB_Dict *d, char *key_bytes, char *val_bytes) +{ + Py_ssize_t i, j; + char *key_ptr, *val_ptr; + NB_DictEntry *ep = NULL; + + if (d->used == 0) { + return ERR_DICT_EMPTY; + } + + /* Pop last item */ + i = d->keys->nentries - 1; + while (i >= 0 && (ep = get_entry(d->keys, i))->hash == DKIX_EMPTY ) { + i--; + } + assert(i >= 0); + + j = lookdict_index(d->keys, ep->hash, i); + assert(j >= 0); + assert(get_index(d->keys, j) == i); + set_index(d->keys, j, DKIX_DUMMY); + + key_ptr = entry_get_key(d->keys, ep); + val_ptr = entry_get_val(d->keys, ep); + + copy_key(d->keys, key_bytes, key_ptr); + copy_val(d->keys, val_bytes, val_ptr); + + zero_key(d->keys, key_ptr); + zero_val(d->keys, val_ptr); + + /* We can't dk_usable++ since there is DKIX_DUMMY in indices */ + d->keys->nentries = i; + 
d->used--; + + return OK; +} + +void +numba_dict_dump(NB_Dict *d) { + long long i, j, k; + long long size, n; + char *cp; + NB_DictEntry *ep; + NB_DictKeys *dk = d->keys; + + n = d->used; + size = dk->nentries; + + printf("Dict dump\n"); + printf(" key_size = %lld\n", (long long)d->keys->key_size); + printf(" val_size = %lld\n", (long long)d->keys->val_size); + + for (i = 0, j = 0; i < size; i++) { + ep = get_entry(dk, i); + if (ep->hash != DKIX_EMPTY) { + long long hash = ep->hash; + printf(" key="); + for (cp=entry_get_key(dk, ep), k=0; k < d->keys->key_size; ++k, ++cp){ + printf("%02x ", ((int)*cp) & 0xff); + } + printf(" hash=%llu value=", hash); + for (cp=entry_get_val(dk, ep), k=0; k < d->keys->val_size; ++k, ++cp){ + printf("%02x ", ((int)*cp) & 0xff); + } + printf("\n"); + j++; + } + } + printf("j = %lld; n = %lld\n", j, n); + assert(j == n); +} + +size_t +numba_dict_iter_sizeof() { + return sizeof(NB_DictIter); +} + +void +numba_dict_iter(NB_DictIter *it, NB_Dict *d) { + it->parent = d; + it->parent_keys = d->keys; + it->size = d->used; + it->pos = 0; +} + +int +numba_dict_iter_next(NB_DictIter *it, const char **key_ptr, const char **val_ptr) { + /* Detect dictionary mutation during iteration */ + NB_DictKeys *dk; + if (it->parent->keys != it->parent_keys || + it->parent->used != it->size) { + return ERR_DICT_MUTATED; + } + dk = it->parent_keys; + while ( it->pos < dk->nentries ) { + NB_DictEntry *ep = get_entry(dk, it->pos++); + if ( ep->hash != DKIX_EMPTY ) { + *key_ptr = entry_get_key(dk, ep); + *val_ptr = entry_get_val(dk, ep); + return OK; + } + } + return ERR_ITER_EXHAUSTED; +} + +int +numba_dict_insert_ez( + NB_Dict *d, + const char *key_bytes, + Py_hash_t hash, + const char *val_bytes + ) +{ + STACK_ALLOC(char, old, d->keys->val_size); + return numba_dict_insert(d, key_bytes, hash, val_bytes, old); +} + +int +numba_dict_new_minsize(NB_Dict **out, Py_ssize_t key_size, Py_ssize_t val_size) +{ + return numba_dict_new(out, D_MINSIZE, key_size, 
val_size); +} + +void +numba_dict_set_method_table(NB_Dict *d, type_based_methods_table *methods) +{ + memcpy(&d->keys->methods, methods, sizeof(type_based_methods_table)); +} + + +#define CHECK(CASE) { \ + if ( !(CASE) ) { \ + printf("'%s' failed file %s:%d\n", #CASE, __FILE__, __LINE__); \ + return 1; \ + } \ +} + +int +numba_test_dict(void) { + NB_Dict *d; + int status; + Py_ssize_t ix; + Py_ssize_t usable; + Py_ssize_t it_count; + const char *it_key, *it_val; + NB_DictIter iter; + +#if defined(_MSC_VER) + /* So that VS2008 compiler is happy */ + char *got_key, *got_value; + got_key = _alloca(4); + got_value = _alloca(8); +#else + char got_key[4]; + char got_value[8]; +#endif + puts("test_dict"); + + status = numba_dict_new(&d, D_MINSIZE, 4, 8); + CHECK(status == OK); + CHECK(d->keys->size == D_MINSIZE); + CHECK(d->keys->key_size == 4); + CHECK(d->keys->val_size == 8); + CHECK(ix_size(d->keys->size) == 1); + printf("aligned_size(index_size * size) = %d\n", (int)(aligned_size(ix_size(d->keys->size) * d->keys->size))); + + printf("d %p\n", d); + printf("d->usable = %u\n", (int)d->keys->usable); + usable = d->keys->usable; + printf("d[0] %d\n", (int)((char*)get_entry(d->keys, 0) - (char*)d->keys)); + CHECK ((char*)get_entry(d->keys, 0) - (char*)d->keys->indices == d->keys->entry_offset); + printf("d[1] %d\n", (int)((char*)get_entry(d->keys, 1) - (char*)d->keys)); + CHECK ((char*)get_entry(d->keys, 1) - (char*)d->keys->indices == d->keys->entry_offset + d->keys->entry_size); + + ix = numba_dict_lookup(d, "bef", 0xbeef, got_value); + printf("ix = %d\n", (int)ix); + CHECK (ix == DKIX_EMPTY); + + // insert 1st key + status = numba_dict_insert(d, "bef", 0xbeef, "1234567", got_value); + CHECK (status == OK); + CHECK (d->used == 1); + CHECK (d->keys->usable == usable - d->used); + + // insert same key + status = numba_dict_insert(d, "bef", 0xbeef, "1234567", got_value); + CHECK (status == OK_REPLACED); + printf("got_value %s\n", got_value); + CHECK (d->used == 1); + CHECK 
(d->keys->usable == usable - d->used); + + // insert 2nd key + status = numba_dict_insert(d, "beg", 0xbeef, "1234568", got_value); + CHECK (status == OK); + CHECK (d->used == 2); + CHECK (d->keys->usable == usable - d->used); + + // insert 3rd key + status = numba_dict_insert(d, "beh", 0xcafe, "1234569", got_value); + CHECK (status == OK); + CHECK (d->used == 3); + CHECK (d->keys->usable == usable - d->used); + + // replace key "bef"'s value + status = numba_dict_insert(d, "bef", 0xbeef, "7654321", got_value); + CHECK (status == OK_REPLACED); + CHECK (d->used == 3); + CHECK (d->keys->usable == usable - d->used); + + // insert 4th key + status = numba_dict_insert(d, "bei", 0xcafe, "0_0_0_1", got_value); + CHECK (status == OK); + CHECK (d->used == 4); + CHECK (d->keys->usable == usable - d->used); + + // insert 5th key + status = numba_dict_insert(d, "bej", 0xcafe, "0_0_0_2", got_value); + CHECK (status == OK); + CHECK (d->used == 5); + CHECK (d->keys->usable == usable - d->used); + + // insert 6th key & triggers resize + status = numba_dict_insert(d, "bek", 0xcafe, "0_0_0_3", got_value); + CHECK (status == OK); + CHECK (d->used == 6); + CHECK (d->keys->usable == USABLE_FRACTION(d->keys->size) - d->used); + + // Dump + numba_dict_dump(d); + + // Make sure everything are still in there + ix = numba_dict_lookup(d, "bef", 0xbeef, got_value); + CHECK (ix >= 0); + CHECK (memcpy(got_value, "7654321", d->keys->val_size)); + + ix = numba_dict_lookup(d, "beg", 0xbeef, got_value); + CHECK (ix >= 0); + CHECK (memcpy(got_value, "1234567", d->keys->val_size)); + + ix = numba_dict_lookup(d, "beh", 0xcafe, got_value); + printf("ix = %d\n", (int)ix); + CHECK (ix >= 0); + CHECK (memcpy(got_value, "1234569", d->keys->val_size)); + + ix = numba_dict_lookup(d, "bei", 0xcafe, got_value); + CHECK (ix >= 0); + CHECK (memcpy(got_value, "0_0_0_1", d->keys->val_size)); + + ix = numba_dict_lookup(d, "bej", 0xcafe, got_value); + CHECK (ix >= 0); + CHECK (memcpy(got_value, "0_0_0_2", 
d->keys->val_size)); + + ix = numba_dict_lookup(d, "bek", 0xcafe, got_value); + CHECK (ix >= 0); + CHECK (memcpy(got_value, "0_0_0_3", d->keys->val_size)); + + // Test delete + ix = numba_dict_lookup(d, "beg", 0xbeef, got_value); + status = numba_dict_delitem(d, 0xbeef, ix); + CHECK (status == OK); + + ix = numba_dict_lookup(d, "beg", 0xbeef, got_value); + CHECK (ix == DKIX_EMPTY); // not found + + ix = numba_dict_lookup(d, "bef", 0xbeef, got_value); + CHECK (ix >= 0); + ix = numba_dict_lookup(d, "beh", 0xcafe, got_value); + CHECK (ix >= 0); + + + // Test popitem + // They are always the last item + status = numba_dict_popitem(d, got_key, got_value); + CHECK(status == OK); + CHECK(memcmp("bek", got_key, d->keys->key_size) == 0); + CHECK(memcmp("0_0_0_3", got_value, d->keys->val_size) == 0); + + status = numba_dict_popitem(d, got_key, got_value); + CHECK(status == OK); + CHECK(memcmp("bej", got_key, d->keys->key_size) == 0); + CHECK(memcmp("0_0_0_2", got_value, d->keys->val_size) == 0); + + // Test iterator + CHECK( d->used > 0 ); + numba_dict_iter(&iter, d); + it_count = 0; + while ( (status = numba_dict_iter_next(&iter, &it_key, &it_val)) == OK) { + it_count += 1; // valid items + CHECK(it_key != NULL); + CHECK(it_val != NULL); + } + + CHECK(status == ERR_ITER_EXHAUSTED); + CHECK(d->used == it_count); + + numba_dict_free(d); + return 0; + +} + +#undef CHECK diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/dictobject.h b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/dictobject.h new file mode 100644 index 0000000000000000000000000000000000000000..74b555a028293c2ab21b8adca3b40d0268bae635 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/dictobject.h @@ -0,0 +1,222 @@ +/* Adapted from CPython3.7 Objects/dict-common.h */ +#include "Python.h" +#include "../_pymodule.h" +#include "cext.h" + +#ifndef NUMBA_DICT_COMMON_H +#define NUMBA_DICT_COMMON_H + +typedef struct { + /* Uses Py_ssize_t instead of Py_hash_t to guarantee 
word size alignment */ + Py_ssize_t hash; + char keyvalue[]; +} NB_DictEntry; + + +typedef int (*dict_key_comparator_t)(const char *lhs, const char *rhs); +typedef void (*dict_refcount_op_t)(const void*); + + +typedef struct { + dict_key_comparator_t key_equal; + dict_refcount_op_t key_incref; + dict_refcount_op_t key_decref; + dict_refcount_op_t value_incref; + dict_refcount_op_t value_decref; +} type_based_methods_table; + + +typedef struct { + /* hash table size */ + Py_ssize_t size; + /* Usable size of the hash table. + Also, size of the entries */ + Py_ssize_t usable; + /* hash table used entries */ + Py_ssize_t nentries; + /* Entry info + - key_size is the sizeof key type + - val_size is the sizeof value type + - entry_size is key_size + val_size + alignment + */ + Py_ssize_t key_size, val_size, entry_size; + /* Byte offset from indices to the first entry. */ + Py_ssize_t entry_offset; + + /* Method table for type-dependent operations. */ + type_based_methods_table methods; + + /* hash table */ + char indices[]; +} NB_DictKeys; + + +typedef struct { + /* num of elements in the hashtable */ + Py_ssize_t used; + NB_DictKeys *keys; +} NB_Dict; + + +typedef struct { + /* parent dictionary */ + NB_Dict *parent; + /* parent keys object */ + NB_DictKeys *parent_keys; + /* dict size */ + Py_ssize_t size; + /* iterator position; indicates the next position to read */ + Py_ssize_t pos; +} NB_DictIter; + + + +/* A test function for the dict +Returns 0 for OK; 1 for failure. +*/ +NUMBA_EXPORT_FUNC(int) +numba_test_dict(void); + +/* Allocate a new dict +Parameters +- NB_Dict **out + Output for the new dictionary. +- Py_ssize_t size + Hashtable size. Must be power of two. +- Py_ssize_t key_size + Size of a key entry. +- Py_ssize_t val_size + Size of a value entry. 
+*/ +NUMBA_EXPORT_FUNC(int) +numba_dict_new(NB_Dict **out, Py_ssize_t size, Py_ssize_t key_size, Py_ssize_t val_size); + +/* Free a dict */ +NUMBA_EXPORT_FUNC(void) +numba_dict_free(NB_Dict *d); + +/* Returns length of a dict */ +NUMBA_EXPORT_FUNC(Py_ssize_t) +numba_dict_length(NB_Dict *d); + +/* Allocates a new dict at the minimal size +See numba_dict_new(). +*/ +NUMBA_EXPORT_FUNC(int) +numba_dict_new_minsize(NB_Dict **out, Py_ssize_t key_size, Py_ssize_t val_size); + +/* Set the method table for type specific operations +*/ +NUMBA_EXPORT_FUNC(void) +numba_dict_set_method_table(NB_Dict *d, type_based_methods_table *methods); + +/* Lookup a key + +Parameters +- NB_Dict *d + The dictionary object. +- const char *key_bytes + The key as a byte buffer. +- Py_hash_t hash + The precomputed hash of the key. +- char *oldval_bytes + An output parameter to store the associated value if the key is found. + Must point to memory of sufficient size to store the value. +*/ +NUMBA_EXPORT_FUNC(Py_ssize_t) +numba_dict_lookup(NB_Dict *d, const char *key_bytes, Py_hash_t hash, char *oldval_bytes); + +/* Resize the dict to at least *minsize*. +*/ +NUMBA_EXPORT_FUNC(int) +numba_dict_resize(NB_Dict *d, Py_ssize_t minsize); + +/* Insert to the dict + +Parameters +- NB_Dict *d + The dictionary object. +- const char *key_bytes + The key as a byte buffer. +- Py_hash_t hash + The precomputed hash of key. +- const char *val_bytes + The value as a byte buffer. +- char *oldval_bytes + An output buffer to store the replaced value. + Must point to memory of sufficient size to store the value. + +Returns +- < 0 for error +- 0 for ok +- 1 for ok and oldval_bytes has a copy of the replaced value. 
+*/ +NUMBA_EXPORT_FUNC(int) +numba_dict_insert(NB_Dict *d, const char *key_bytes, Py_hash_t hash, const char *val_bytes, char *oldval_bytes); + +/* Same as numba_dict_insert() but oldval_bytes is not needed */ +NUMBA_EXPORT_FUNC(int) +numba_dict_insert_ez(NB_Dict *d, const char *key_bytes, Py_hash_t hash, const char *val_bytes); + +/* Delete an entry from the dict +Parameters +- NB_Dict *d + The dictionary +- Py_hash_t hash + Precomputed hash of the key to be deleted +- Py_ssize_t ix + Precomputed entry index of the key to be deleted. + Usually results of numba_dict_lookup(). +*/ +NUMBA_EXPORT_FUNC(int) +numba_dict_delitem(NB_Dict *d, Py_hash_t hash, Py_ssize_t ix); + +/* Remove an item from the dict +Parameters +- NB_Dict *d + The dictionary +- char *key_bytes + Output. The key as a byte buffer +- char *val_bytes + Output. The value as a byte buffer +*/ +NUMBA_EXPORT_FUNC(int) +numba_dict_popitem(NB_Dict *d, char *key_bytes, char *val_bytes); + +/* Returns the sizeof a dictionary iterator +*/ +NUMBA_EXPORT_FUNC(size_t) +numba_dict_iter_sizeof(void); + +/* Fill a NB_DictIter for a dictionary to begin iteration +Parameters +- NB_DictIter *it + Output. Must point to memory of size at least `numba_dict_iter_sizeof()`. +- NB_Dict *d + The dictionary to be iterated. +*/ +NUMBA_EXPORT_FUNC(void) +numba_dict_iter(NB_DictIter *it, NB_Dict *d); + +/* Advance the iterator +Parameters +- NB_DictIter *it + The iterator +- const char **key_ptr + Output pointer for the key. Points to data in the dictionary. +- const char **val_ptr + Output pointer for the value. Points to data in the dictionary. + +Returns +- 0 for success; valid key_ptr and val_ptr +- ERR_ITER_EXHAUSTED for end of iterator. +- ERR_DICT_MUTATED for detected dictionary mutation. 
+*/
+NUMBA_EXPORT_FUNC(int)
+numba_dict_iter_next(NB_DictIter *it, const char **key_ptr, const char **val_ptr);
+
+
+/* NOTE(review): undocumented here; the name suggests a debugging dump of
+ * the dict. Confirm against dictobject.c. */
+NUMBA_EXPORT_FUNC(void)
+numba_dict_dump(NB_Dict *);
+
+#endif
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/listobject.c b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/listobject.c
new file mode 100644
index 0000000000000000000000000000000000000000..9de03f5ff1d61b73bf6e4a340d51388bc09f2bc9
--- /dev/null
+++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/listobject.c
@@ -0,0 +1,977 @@
+#include "listobject.h"
+
+/* This implements the C component of the Numba typed list. It is loosely
+ * inspired by the list implementation of the cpython list with some parts
+ * taken from the cpython slice implementation. The exact commit-id of the
+ * relevant files are:
+ *
+ * https://github.com/python/cpython/blob/51ddab8dae056867f3595ab3400bffc93f67c8d4/Objects/listobject.c
+ * https://github.com/python/cpython/blob/51ddab8dae056867f3595ab3400bffc93f67c8d4/Objects/sliceobject.c
+ *
+ * Algorithmically, this list is very similar to the cpython implementation so
+ * it should have the same performance (Big-O) characteristics for accessing,
+ * adding and removing elements/items. Specifically, it implements the same
+ * algorithms for list overallocation and growth. However, it never deals with
+ * PyObject types and instead must be typed with a type-size. As a result, the
+ * typed-list is type homogeneous and in contrast to the cpython version can
+ * not store a mixture of arbitrarily typed objects. Reference counting via the
+ * Numba Runtime (NRT) is supported and incrementing and decrementing functions
+ * are stored as part of the struct and can be set up from the compiler level.
+ *
+ * Importantly, only a very limited subset of the cpython c functions have been
+ * ported over and the rest have been implemented (in Python) at the compiler
+ * level using the c functions provided.
Additionally, initialization of, and
+ * iteration over, a ListIter is provided.
+ *
+ * The following functions are implemented for the list:
+ *
+ * - Check valid index             valid_index
+ * - Creation                      numba_list_new
+ * - Deletion                      numba_list_free
+ * - Accessing the length          numba_list_length
+ * - Appending to the list         numba_list_append
+ * - Getting an item               numba_list_getitem
+ * - Setting an item               numba_list_setitem
+ * - Resizing the list             numba_list_resize
+ * - Deleting an item              numba_list_delitem
+ * - Deleting a slice              numba_list_delete_slice
+ *
+ * As you can see, only a single function for slices is implemented. The rest
+ * is all done entirely at the compiler level which then calls the c functions
+ * to mutate the list accordingly. Since slicing allows for replace, insert and
+ * delete operations over multiple items, we can simply implement those using
+ * the basic functions above.
+ *
+ * The following additional functions are implemented for the list, these are
+ * needed to make the list work within Numba.
+ *
+ * - Accessing the allocation      numba_list_allocated
+ * - Copying an item               copy_item
+ * - Calling incref on item        list_incref_item
+ * - Calling decref on item        list_decref_item
+ * - Set method table              numba_list_set_method_table
+ *
+ * The following functions are implemented for the iterator:
+ *
+ * - Size of the iterator          numba_list_iter_sizeof
+ * - Initialization of iter        numba_list_iter
+ * - Get next item from iter       numba_list_iter_next
+ *
+ * Two methods are provided to query and set the 'is_mutable':
+ *
+ * - Query                         numba_list_is_mutable
+ * - Set                           numba_list_set_is_mutable
+ *
+ * Lastly a set of pure C level tests are provided which come in handy when
+ * needing to use valgrind and friends.
+ *
+ */
+
+
+/* Return status for the list functions.
+ */
+typedef enum {
+    /* success */
+    LIST_OK = 0,
+    /* index failed the valid_index() check (setitem/getitem/delitem) */
+    LIST_ERR_INDEX = -1,
+    /* malloc/realloc failed or the requested allocation is too large */
+    LIST_ERR_NO_MEMORY = -2,
+    /* list size changed while an iterator was active (numba_list_iter_next) */
+    LIST_ERR_MUTATED = -3,
+    /* iterator has no more items (numba_list_iter_next) */
+    LIST_ERR_ITER_EXHAUSTED = -4,
+    /* a mutating call was made on a list whose is_mutable flag is 0 */
+    LIST_ERR_IMMUTABLE = -5,
+} ListStatus;
+
+/* Copy an item from a list.
+ *
+ * lp: a list
+ * dst: destination pointer
+ * src: source pointer
+ *
+ * Copies exactly lp->item_size bytes with memcpy, so dst and src must not
+ * overlap. No reference counting is performed here.
+ */
+static void
+copy_item(NB_List *lp, char *dst, const char *src){
+    memcpy(dst, src, lp->item_size);
+}
+
+/* Increment a reference to an item in a list.
+ *
+ * lp: a list
+ * item: the item to increment the reference for
+ *
+ * This is a no-op when no item_incref callback is set in the method table.
+ */
+static void
+list_incref_item(NB_List *lp, const char *item){
+    if (lp->methods.item_incref) {
+        lp->methods.item_incref(item);
+    }
+}
+
+/* Decrement a reference to an item in a list.
+ *
+ * lp: a list
+ * item: the item to decrement the reference for
+ *
+ * This is a no-op when no item_decref callback is set in the method table.
+ */
+static void
+list_decref_item(NB_List *lp, const char *item){
+    if (lp->methods.item_decref) {
+        lp->methods.item_decref(item);
+    }
+}
+
+/* Setup the method table for a list.
+ *
+ * This function is used from the compiler level to initialize the internal
+ * method table.
+ *
+ * lp: a list
+ * methods: the methods table to set up
+ *
+ * The table is copied by value into the list, so the list does not keep a
+ * pointer to `methods`.
+ */
+void
+numba_list_set_method_table(NB_List *lp, list_type_based_methods_table *methods)
+{
+    memcpy(&lp->methods, methods, sizeof(list_type_based_methods_table));
+}
+
+/* Check if a list index is valid.
+ *
+ * i: the index to check
+ * limit: the size of a list
+ *
+ * Returns non-zero if 0 <= i < limit, zero otherwise.
+ *
+ * Adapted from CPython's valid_index().
+ *
+ * FIXME: need to find a way to inline this, even for Python 2.7 on Windows
+ */
+static int
+valid_index(Py_ssize_t i, Py_ssize_t limit){
+    /* The cast to size_t lets us use just a single comparison
+       to check whether i is in the range: 0 <= i < limit.
+       (A negative i becomes a huge unsigned value and fails the test.)
+
+       See:  Section 14.2 "Bounds Checking" in the Agner Fog
+       optimization manual found at:
+       https://www.agner.org/optimize/optimizing_cpp.pdf
+    */
+    return (size_t) i < (size_t) limit;
+}
+
+/* Initialize a new list.
+ *
+ * out: pointer to hold an initialized list
+ * item_size: the size in bytes of the items in the list
+ * allocated: preallocation of the list in items
+ *
+ * This will allocate sufficient memory to hold the list structure and any
+ * items if requested (allocated != 0). See listobject.h for more information
+ * on the NB_List struct.
+ *
+ * Returns LIST_OK on success, in which case *out points to the new list.
+ * Returns LIST_ERR_NO_MEMORY if `allocated` is negative, if the requested
+ * number of bytes would overflow, or if malloc fails; *out is left untouched
+ * in that case.
+ */
+int
+numba_list_new(NB_List **out, Py_ssize_t item_size, Py_ssize_t allocated){
+    NB_List *lp;
+    char *items;
+    // reject a negative preallocation and a byte count that cannot be
+    // represented; this mirrors the overflow check in numba_list_resize
+    if (allocated < 0 ||
+        (item_size > 0 &&
+         (size_t)allocated > (size_t)PY_SSIZE_T_MAX / (size_t)item_size)) {
+        return LIST_ERR_NO_MEMORY;
+    }
+    // allocate memory to hold the struct
+    lp = malloc(aligned_size(sizeof(NB_List)));
+    if (lp == NULL) {
+        return LIST_ERR_NO_MEMORY;
+    }
+    // set up members
+    lp->size = 0;
+    lp->item_size = item_size;
+    lp->allocated = allocated;
+    lp->is_mutable = 1;
+    // set method table to zero, i.e. no incref/decref callbacks yet
+    memset(&lp->methods, 0x00, sizeof(list_type_based_methods_table));
+    // allocate memory to hold items, if requested
+    if (allocated != 0) {
+        items = malloc(aligned_size(lp->item_size * allocated));
+        // allocated was definitely not zero, if malloc returns NULL
+        // this is definitely an error
+        if (items == NULL) {
+            // free previously allocated struct to avoid leaking memory
+            free(lp);
+            return LIST_ERR_NO_MEMORY;
+        }
+        lp->items = items;
+    }
+    else {
+        // be explicit
+        lp->items = NULL;
+    }
+    *out = lp;
+    return LIST_OK;
+}
+
+/* Free the memory associated with a list.
+ *
+ * lp: a list
+ *
+ * If an item_decref callback is set, it is called once for each of the
+ * lp->size stored items before the memory is released.
+ */
+void
+numba_list_free(NB_List *lp) {
+    // decref all items, if needed
+    Py_ssize_t i;
+    if (lp->methods.item_decref) {
+        for (i = 0; i < lp->size; i++) {
+            char *item = lp->items + lp->item_size * i;
+            list_decref_item(lp, item);
+        }
+    }
+    // free items and list
+    if (lp->items != NULL) {
+        free(lp->items);
+    }
+    free(lp);
+}
+
+/* Return the base pointer of the list items.
+ *
+ * The pointer may be NULL and may change after a call that resizes the list.
+ */
+char *
+numba_list_base_ptr(NB_List *lp)
+{
+    return lp->items;
+}
+
+/* Return the address of the list size.
+ *
+ * The address of lp->size is returned as an integer (Py_ssize_t), not as a
+ * pointer.
+ */
+Py_ssize_t
+numba_list_size_address(NB_List *lp)
+{
+    return (Py_ssize_t)&lp->size;
+}
+
+
+/* Return the length of a list.
+ *
+ * lp: a list
+ */
+Py_ssize_t
+numba_list_length(NB_List *lp) {
+    return lp->size;
+}
+
+/* Return the current allocation of a list.
+ *
+ * lp: a list
+ *
+ * The value is the number of item slots allocated, not a number of bytes.
+ */
+Py_ssize_t
+numba_list_allocated(NB_List *lp) {
+    return lp->allocated;
+}
+
+/* Return the mutability status of the list
+ *
+ * lp: a list
+ *
+ */
+int
+numba_list_is_mutable(NB_List *lp){
+    return lp->is_mutable;
+}
+
+/* Set the is_mutable attribute
+ *
+ * lp: a list
+ * is_mutable: an int, 0(False) or 1(True)
+ *
+ */
+void
+numba_list_set_is_mutable(NB_List *lp, int is_mutable){
+    lp->is_mutable = is_mutable;
+}
+
+/* Set an item in a list.
+ *
+ * lp: a list
+ * index: the index of the item to set (must be in range 0 <= index < len(list))
+ * item: the item to set
+ *
+ * This assumes there is already an element at the given index that will be
+ * overwritten and thereby have its reference decremented. DO NOT use this to
+ * write to an unassigned location.
+ *
+ * Returns LIST_OK on success, LIST_ERR_IMMUTABLE if the list is immutable and
+ * LIST_ERR_INDEX if the index is out of range.
+ */
+int
+numba_list_setitem(NB_List *lp, Py_ssize_t index, const char *item) {
+    char *loc;
+    // check for mutability
+    if (!lp->is_mutable) {
+        return LIST_ERR_IMMUTABLE;
+    }
+    // check index is valid
+    // FIXME: this can be (and probably is) checked at the compiler level
+    if (!valid_index(index, lp->size)) {
+        return LIST_ERR_INDEX;
+    }
+    // set item at desired location
+    loc = lp->items + lp-> item_size * index;
+    // release the old item first, then copy in the new one and take a
+    // reference on the stored copy
+    list_decref_item(lp, loc);
+    copy_item(lp, loc, item);
+    list_incref_item(lp, loc);
+    return LIST_OK;
+}
+
+/* Get an item from a list.
+ *
+ * lp: a list
+ * index: the index of the item to get (must be in range 0 <= index < len(list))
+ * out: a pointer to hold the item
+ *
+ * The item bytes are copied into `out`; this function does not incref the
+ * item. Returns LIST_OK on success and LIST_ERR_INDEX if the index is out of
+ * range.
+ */
+int
+numba_list_getitem(NB_List *lp, Py_ssize_t index, char *out) {
+    char *loc;
+    // check index is valid
+    // FIXME: this can be (and probably is) checked at the compiler level
+    if (!valid_index(index, lp->size)) {
+        return LIST_ERR_INDEX;
+    }
+    // get item at desired location
+    loc = lp->items + lp->item_size * index;
+    copy_item(lp, out, loc);
+    return LIST_OK;
+}
+
+/* Append an item to the end of a list.
+ *
+ * lp: a list
+ * item: the item to append.
+ *
+ * Returns LIST_OK on success, LIST_ERR_IMMUTABLE if the list is immutable, or
+ * the error status of numba_list_resize() if growing the list fails.
+ */
+int
+numba_list_append(NB_List *lp, const char *item) {
+    char *loc;
+    // check for mutability
+    if (!lp->is_mutable) {
+        return LIST_ERR_IMMUTABLE;
+    }
+    // resize by one, will change list size
+    int result = numba_list_resize(lp, lp->size + 1);
+    if(result < LIST_OK) {
+        return result;
+    }
+    // insert item at index: original size before resize
+    loc = lp->items + lp->item_size * (lp->size - 1);
+    copy_item(lp, loc, item);
+    list_incref_item(lp, loc);
+    return LIST_OK;
+}
+
+/* Resize a list.
+ *
+ * lp: a list
+ * newsize: the desired new size of the list.
+ *
+ * This will increase or decrease the size of the list, including reallocating
+ * the required memory and increasing the total allocation (additional free
+ * space to hold new items).
+ *
+ *
+ * Adapted from CPython's list_resize().
+ *
+ * Ensure lp->items has room for at least newsize elements, and set
+ * lp->size to newsize.  If newsize > lp->size on entry, the content
+ * of the new slots at exit is undefined heap trash; it's the caller's
+ * responsibility to overwrite them with sane values.
+ * The number of allocated elements may grow, shrink, or stay the same.
+ * Failure is impossible if newsize <= lp->allocated on entry, although
+ * that partly relies on an assumption that the system realloc() never
+ * fails when passed a number of bytes <= the number of bytes last
+ * allocated (the C standard doesn't guarantee this, but it's hard to
+ * imagine a realloc implementation where it wouldn't be true).
+ * Note that lp->items may change, and even if newsize is less
+ * than lp->size on entry.
+ *
+ * When the new allocation is zero bytes the item buffer is released with
+ * free() and lp->items is set to NULL, instead of relying on the
+ * implementation-defined behaviour of realloc(ptr, 0).
+ *
+ * Returns LIST_OK on success, LIST_ERR_IMMUTABLE if the list is immutable and
+ * LIST_ERR_NO_MEMORY if the new allocation is too large or realloc fails (the
+ * list is left unchanged in that case).
+ */
+int
+numba_list_resize(NB_List *lp, Py_ssize_t newsize) {
+    char * items;
+    size_t new_allocated, num_allocated_bytes;
+    // check for mutability
+    if (!lp->is_mutable) {
+        return LIST_ERR_IMMUTABLE;
+    }
+    /* Bypass realloc() when a previous overallocation is large enough
+       to accommodate the newsize.  If the newsize falls lower than half
+       the allocated size, then proceed with the realloc() to shrink the list.
+    */
+    if (lp->allocated >= newsize && newsize >= (lp->allocated >> 1)) {
+        assert(lp->items != NULL || newsize == 0);
+        lp->size = newsize;
+        return LIST_OK;
+    }
+    /* This over-allocates proportional to the list size, making room
+     * for additional growth.  The over-allocation is mild, but is
+     * enough to give linear-time amortized behavior over a long
+     * sequence of appends() in the presence of a poorly-performing
+     * system realloc().
+     * The growth pattern is:  0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
+     * Note: new_allocated won't overflow because the largest possible value
+     *       is PY_SSIZE_T_MAX * (9 / 8) + 6 which always fits in a size_t.
+     */
+    new_allocated = (size_t)newsize + (newsize >> 3) + (newsize < 9 ? 3 : 6);
+    if (new_allocated > (size_t)PY_SSIZE_T_MAX / lp->item_size) {
+        return LIST_ERR_NO_MEMORY;
+    }
+
+    if (newsize == 0)
+        new_allocated = 0;
+    num_allocated_bytes = new_allocated * lp->item_size;
+    if (num_allocated_bytes == 0) {
+        // realloc(ptr, 0) is implementation-defined: it may or may not free
+        // the old buffer, so release it explicitly
+        free(lp->items);
+        items = NULL;
+    }
+    else {
+        items = realloc(lp->items, aligned_size(num_allocated_bytes));
+        if (items == NULL) {
+            // the original buffer is still valid and still owned by lp
+            return LIST_ERR_NO_MEMORY;
+        }
+    }
+    lp->items = items;
+    lp->size = newsize;
+    lp->allocated = (Py_ssize_t)new_allocated;
+    return LIST_OK;
+}
+
+/* Delete a single item.
+ *
+ * lp: a list
+ * index: the index of the item to delete
+ *        (must be in range 0 <= index < len(list))
+ *
+ * Returns LIST_OK on success, LIST_ERR_IMMUTABLE if the list is immutable,
+ * LIST_ERR_INDEX if the index is out of range, or the error status of
+ * numba_list_resize().
+ * */
+int
+numba_list_delitem(NB_List *lp, Py_ssize_t index) {
+    int result;
+    char *loc, *new_loc;
+    Py_ssize_t leftover_bytes;
+    // check for mutability
+    if (!lp->is_mutable) {
+        return LIST_ERR_IMMUTABLE;
+    }
+    // check index is valid
+    // FIXME: this can be (and probably is) checked at the compiler level
+    if (!valid_index(index, lp->size)) {
+        return LIST_ERR_INDEX;
+    }
+    // obtain item and decref if needed
+    loc = lp->items + lp->item_size * index;
+    list_decref_item(lp, loc);
+    if (index != lp->size - 1) {
+        // delitem from somewhere other than the end, incur the memory copy
+        leftover_bytes = (lp->size - 1 - index) * lp->item_size;
+        new_loc = lp->items + (lp->item_size * (index + 1));
+        // use memmove instead of memcpy since we may be dealing with
+        // overlapping regions of memory and the behaviour of memcpy is
+        // undefined in such situation (C99).
+        memmove(loc, new_loc, leftover_bytes);
+    }
+    // finally, shrink list by one
+    result = numba_list_resize(lp, lp->size - 1);
+    if(result < LIST_OK) {
+        // Since we are decreasing the size, this should never happen
+        return result;
+    }
+    return LIST_OK;
+
+}
+
+/* Delete a slice
+ *
+ * start: the start index of the slice
+ * stop: the stop index of the slice (not included)
+ * step: the step to take
+ *
+ * This function assumes that the start and stop were clipped appropriately.
+ * I.e. if step > 0 start >= 0 and stop <= len(l) and
+ *      if step < 0 start <= length and stop >= -1
+ * step != 0 and no Python negative indexing allowed.
+ *
+ * Returns LIST_OK on success (including the empty-slice no-op),
+ * LIST_ERR_IMMUTABLE if the list is immutable, or the error status of
+ * numba_list_resize().
+ *
+ * This code was copied and edited from the relevant section in
+ * list_ass_subscript from the cpython implementation, see the top of this file
+ * for the exact source
+ */
+int
+numba_list_delete_slice(NB_List *lp,
+                        Py_ssize_t start, Py_ssize_t stop, Py_ssize_t step) {
+    int result;
+    // indices and lengths are Py_ssize_t so that they are not truncated on
+    // platforms where int is narrower than Py_ssize_t
+    Py_ssize_t i, slicelength, new_length;
+    char *loc, *new_loc;
+    Py_ssize_t leftover_bytes, cur, lim;
+    // check for mutability
+    if (!lp->is_mutable) {
+        return LIST_ERR_IMMUTABLE;
+    }
+    // calculate the slicelength, taken from PySlice_AdjustIndices, see the top
+    // of this file for the exact source
+    if (step > 0) {
+        slicelength = start < stop ? (stop - start - 1) / step + 1 : 0;
+    } else {
+        slicelength = stop < start ? (start - stop - 1) / -step + 1 : 0;
+    }
+    if (slicelength <= 0){
+        return LIST_OK;
+    }
+    new_length = lp->size - slicelength;
+    // reverse step and indices, so that the code below only ever has to walk
+    // the list from low to high indices
+    if (step < 0) {
+        stop = start + 1;
+        start = stop + step * (slicelength - 1) - 1;
+        step = -step;
+    }
+    if (step == 1) {
+        // decref if needed
+        if (lp->methods.item_decref) {
+            for (i = start ; i < stop ; i++){
+                loc = lp->items + lp->item_size * i;
+                lp->methods.item_decref(loc);
+            }
+        }
+        // memmove items into place
+        leftover_bytes = (lp->size - stop) * lp->item_size;
+        loc = lp->items + lp->item_size * start;
+        new_loc = lp->items + lp->item_size * stop;
+        memmove(loc, new_loc, leftover_bytes);
+    }
+    else { // step != 1
+        /* drawing pictures might help understand these for
+         * loops. Basically, we memmove the parts of the
+         * list that are *not* part of the slice: step-1
+         * items for each item that is part of the slice,
+         * and then tail end of the list that was not
+         * covered by the slice
+         *
+         * */
+        for (cur = start, // index of item to be deleted
+             i = 0;       // counter of total items deleted so far
+             cur < stop;
+             cur += step,
+             i++) {
+            lim = step - 1; // number of leftover items after deletion of item
+            // clip limit, in case we are at the end of the slice, and there
+            // are now less than step-1 items to be moved
+            if (cur + step >= lp->size) {
+                lim = lp->size - cur - 1;
+            }
+            // decref item being removed
+            loc = lp->items + lp->item_size * cur;
+            list_decref_item(lp, loc);
+            /* memmove the aforementioned step-1 (or less) items
+             * dst : index of deleted item minus total deleted sofar
+             * src : index of deleted item plus one (next item)
+             */
+            memmove(lp->items + lp->item_size * (cur - i),
+                    lp->items + lp->item_size * (cur + 1),
+                    lim * lp->item_size);
+        }
+        // memmove tail of the list
+        cur = start + slicelength * step;
+        if (cur < lp->size) {
+            memmove(lp->items + lp->item_size * (cur - slicelength),
+                    lp->items + lp->item_size * cur,
+                    (lp->size - cur) * lp->item_size);
+        }
+    }
+    // resize to correct size
+    result = numba_list_resize(lp, new_length);
+    if(result < LIST_OK) {
+        // Since we are decreasing the size, this should never happen
+        return result;
+    }
+    return LIST_OK;
+}
+
+
+/* Return the size of the list iterator (NB_ListIter) struct.
+ */
+size_t
+numba_list_iter_sizeof(void) {
+    return sizeof(NB_ListIter);
+}
+
+/* Initialize a list iterator (NB_ListIter).
+ *
+ * it: an iterator
+ * lp: a list to iterate over
+ *
+ * The current size of the list is recorded in the iterator so that
+ * numba_list_iter_next() can detect a change of size.
+ */
+void
+numba_list_iter(NB_ListIter *it, NB_List *lp) {
+    // set members of iterator
+    it->parent = lp;
+    it->size = lp->size;
+    it->pos = 0;
+}
+
+/* Obtain the next item from a list iterator.
+ *
+ * it: an iterator
+ * item_ptr: pointer to hold the next item
+ *
+ * On LIST_OK, *item_ptr points into the list's own item buffer (no copy is
+ * made). Returns LIST_ERR_MUTATED if the list size differs from the size
+ * recorded by numba_list_iter() and LIST_ERR_ITER_EXHAUSTED at the end.
+ */
+int
+numba_list_iter_next(NB_ListIter *it, const char **item_ptr) {
+    NB_List *lp;
+    lp = it->parent;
+    /* FIXME: Detect list mutation during iteration */
+    // only a change in size is detected here
+    if (lp->size != it->size) {
+        return LIST_ERR_MUTATED;
+    }
+    // get next element
+    if (it->pos < lp->size) {
+        *item_ptr = lp->items + lp->item_size * it->pos++;
+        return LIST_OK;
+    }else{
+        return LIST_ERR_ITER_EXHAUSTED;
+    }
+}
+
+
+/* Test helper: if CASE is false, print the failing expression with file and
+ * line and make the enclosing function return -1. */
+#define CHECK(CASE) {                                                   \
+    if ( !(CASE) ) {                                                    \
+        printf("'%s' failed file %s:%d\n", #CASE, __FILE__, __LINE__);   \
+        return -1;                                                      \
+    }                                                                   \
+}
+
+/* Basic C based tests for the list.
+ *
+ * Returns 0 if all checks pass. The first failing CHECK prints the failing
+ * expression and returns -1 immediately; the list under test is not freed
+ * in that case.
+ */
+int
+numba_test_list(void) {
+    NB_List *lp = NULL;
+    int status, i;
+    Py_ssize_t it_count;
+    const char *it_item = NULL;
+    NB_ListIter iter;
+    char got_item[4] = "\x00\x00\x00\x00";
+    const char *test_items_1 = NULL, *test_items_2 = NULL;
+    char *test_items_3 = NULL;
+    puts("test_list");
+
+
+    // items are 4 bytes wide: three characters plus the terminating NUL of
+    // the string literals used below
+    status = numba_list_new(&lp, 4, 0);
+    CHECK(status == LIST_OK);
+    CHECK(lp->item_size == 4);
+    CHECK(lp->size == 0);
+    CHECK(lp->allocated == 0);
+    CHECK(lp->is_mutable == 1);
+
+    // flip and check the is_mutable bit
+    CHECK(numba_list_is_mutable(lp) == 1);
+    numba_list_set_is_mutable(lp, 0);
+    CHECK(numba_list_is_mutable(lp) == 0);
+    numba_list_set_is_mutable(lp, 1);
+    CHECK(numba_list_is_mutable(lp) == 1);
+
+    // append 1st item, this will cause a realloc
+    status = numba_list_append(lp, "abc");
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 1);
+    CHECK(lp->allocated == 4);
+    status = numba_list_getitem(lp, 0, got_item);
+    CHECK(status == LIST_OK);
+    CHECK(memcmp(got_item, "abc", 4) == 0);
+
+    // append 2nd item
+    status = numba_list_append(lp, "def");
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 2);
+    CHECK(lp->allocated == 4);
+    status = numba_list_getitem(lp, 1, got_item);
+    CHECK(status == LIST_OK);
+    CHECK(memcmp(got_item, "def", 4) == 0);
+
+    // append 3rd item
+    status = numba_list_append(lp, "ghi");
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 3);
+    CHECK(lp->allocated == 4);
+    status = numba_list_getitem(lp, 2, got_item);
+    CHECK(status == LIST_OK);
+    CHECK(memcmp(got_item, "ghi", 4) == 0);
+
+    // append 4th item
+    status = numba_list_append(lp, "jkl");
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 4);
+    CHECK(lp->allocated == 4);
+    status = numba_list_getitem(lp, 3, got_item);
+    CHECK(status == LIST_OK);
+    CHECK(memcmp(got_item, "jkl", 4) == 0);
+
+    // append 5th item, this will cause another realloc
+    status = numba_list_append(lp, "mno");
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 5);
+    CHECK(lp->allocated == 8);
+    status = numba_list_getitem(lp, 4, got_item);
+    CHECK(status == LIST_OK);
+    CHECK(memcmp(got_item, "mno", 4) == 0);
+
+    // overwrite 1st item
+    status = numba_list_setitem(lp, 0, "pqr");
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 5);
+    CHECK(lp->allocated == 8);
+    status = numba_list_getitem(lp, 0, got_item);
+    CHECK(status == LIST_OK);
+    CHECK(memcmp(got_item, "pqr", 4) == 0);
+
+    // get and del 1st item, check item shift
+    // (the status of getitem is overwritten by delitem and not checked)
+    status = numba_list_getitem(lp, 0, got_item);
+    status = numba_list_delitem(lp, 0);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 4);
+    CHECK(lp->allocated == 8);
+    CHECK(memcmp(got_item, "pqr", 4) == 0);
+    CHECK(memcmp(lp->items, "def\x00ghi\x00jkl\x00mno\x00", 16) == 0);
+
+    // get and del last (4th) item, no shift since only last item affected
+    status = numba_list_getitem(lp, 3, got_item);
+    status = numba_list_delitem(lp, 3);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 3);
+    CHECK(lp->allocated == 6); // this also shrinks the allocation
+    CHECK(memcmp(got_item, "mno", 4) == 0);
+    CHECK(memcmp(lp->items, "def\x00ghi\x00jkl\x00", 12) == 0);
+
+    // flip and check the is_mutable member
+    CHECK(numba_list_is_mutable(lp) == 1);
+    numba_list_set_is_mutable(lp, 0);
+    CHECK(numba_list_is_mutable(lp) == 0);
+
+    // ensure that any attempts to mutate an immutable list fail
+    CHECK(numba_list_setitem(lp, 0, "zzz") == LIST_ERR_IMMUTABLE);
+    CHECK(numba_list_append(lp, "zzz") == LIST_ERR_IMMUTABLE);
+    CHECK(numba_list_delitem(lp, 0) == LIST_ERR_IMMUTABLE);
+    CHECK(numba_list_resize(lp, 23) == LIST_ERR_IMMUTABLE);
+    CHECK(numba_list_delete_slice(lp, 0, 3, 1) == LIST_ERR_IMMUTABLE);
+
+    // ensure that all attempts to query/read from an immutable list succeed
+    CHECK(numba_list_length(lp) == 3);
+    status = numba_list_getitem(lp, 0, got_item);
+    CHECK(status == LIST_OK);
+    CHECK(memcmp(got_item, "def", 4) == 0);
+
+    // flip the is_mutable member back and check
+    numba_list_set_is_mutable(lp, 1);
+    CHECK(numba_list_is_mutable(lp) == 1);
+
+    // test iterator
+    CHECK(lp->size > 0);
+    numba_list_iter(&iter, lp);
+    it_count = 0;
+    CHECK(iter.parent == lp);
+    CHECK(iter.pos == it_count);
+
+    // current contents of list
+    test_items_1 = "def\x00ghi\x00jkl\x00";
+    while ( (status = numba_list_iter_next(&iter, &it_item)) == LIST_OK) {
+        it_count += 1;
+        CHECK(iter.pos == it_count); // check iterator position
+        CHECK(it_item != NULL); // quick check item is non-null
+        // go fishing in test_items_1
+        CHECK(memcmp((const char *)test_items_1 + ((it_count - 1) * 4), it_item, 4) == 0);
+    }
+
+    CHECK(status == LIST_ERR_ITER_EXHAUSTED);
+    CHECK(lp->size == it_count);
+
+    // free existing list
+    numba_list_free(lp);
+
+    // test growth upon append and shrink during delitem
+    status = numba_list_new(&lp, 1, 0);
+    CHECK(status == LIST_OK);
+    CHECK(lp->item_size == 1);
+    CHECK(lp->size == 0);
+    CHECK(lp->allocated == 0);
+
+    // first, grow the list
+    // Use exactly 17 elements, should go through the allocation pattern:
+    // 0, 4, 8, 16, 25
+    // NOTE(review): with item_size == 1 only the first byte of the int `i`
+    // is copied; that byte equals the value of i only on little-endian hosts.
+    for (i = 0; i < 17 ; i++) {
+        switch(i) {
+            // Check the allocation before
+            case 0:  CHECK(lp->allocated == 0); break;
+            case 4:  CHECK(lp->allocated == 4); break;
+            case 8:  CHECK(lp->allocated == 8); break;
+            case 16: CHECK(lp->allocated == 16); break;
+        }
+        status = numba_list_append(lp, (const char*)&i);
+        CHECK(status == LIST_OK);
+        switch(i) {
+            // Check that the growth happened accordingly
+            case 0:  CHECK(lp->allocated == 4); break;
+            case 4:  CHECK(lp->allocated == 8); break;
+            case 8:  CHECK(lp->allocated == 16); break;
+            case 16: CHECK(lp->allocated == 25); break;
+        }
+    }
+    CHECK(lp->size == 17);
+
+    // Check current contents of list
+    test_items_2 = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10";
+    CHECK(memcmp(lp->items, test_items_2, 17) == 0);
+
+    // Now, delete them again and check that list shrinks
+    for (i = 17; i > 0 ; i--) {
+        switch(i) {
+             // Check the allocation before delitem
+             case 17:  CHECK(lp->allocated == 25); break;
+             case 12:  CHECK(lp->allocated == 25); break;
+             case 9:   CHECK(lp->allocated == 18); break;
+             case 6:   CHECK(lp->allocated == 12); break;
+             case 4:   CHECK(lp->allocated == 8); break;
+             case 3:   CHECK(lp->allocated == 6); break;
+             case 2:   CHECK(lp->allocated == 5); break;
+             case 1:   CHECK(lp->allocated == 4); break;
+        }
+        status = numba_list_getitem(lp, i-1, got_item);
+        status = numba_list_delitem(lp, i-1);
+        CHECK(status == LIST_OK);
+        switch(i) {
+             // Check that the shrink happened accordingly
+             case 17:  CHECK(lp->allocated == 25); break;
+             case 12:  CHECK(lp->allocated == 18); break;
+             case 9:   CHECK(lp->allocated == 12); break;
+             case 6:   CHECK(lp->allocated == 8); break;
+             case 4:   CHECK(lp->allocated == 6); break;
+             case 3:   CHECK(lp->allocated == 5); break;
+             case 2:   CHECK(lp->allocated == 4); break;
+             case 1:   CHECK(lp->allocated == 0); break;
+        }
+    }
+    // free existing list
+    numba_list_free(lp);
+
+
+    // Setup list for testing delete_slice
+    status = numba_list_new(&lp, 1, 0);
+    CHECK(status == LIST_OK);
+    CHECK(lp->item_size == 1);
+    CHECK(lp->size == 0);
+    CHECK(lp->allocated == 0);
+    for (i = 0; i < 17 ; i++) {
+        status = numba_list_append(lp, (const char*)&i);
+        CHECK(status == LIST_OK);
+    }
+    CHECK(lp->size == 17);
+    test_items_3 = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10";
+    CHECK(memcmp(lp->items, test_items_3, 17) == 0);
+
+    // delete multiple elements from the middle
+    status = numba_list_delete_slice(lp, 2, 5, 1);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 14);
+    test_items_3 = "\x00\x01\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10";
+    CHECK(memcmp(lp->items, test_items_3, 14) == 0);
+
+    // delete single element from start
+    status = numba_list_delete_slice(lp, 0, 1, 1);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 13);
+    test_items_3 = "\x01\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10";
+    CHECK(memcmp(lp->items, test_items_3, 13) == 0);
+
+    // delete single element from end
+    status = numba_list_delete_slice(lp, 12, 13, 1);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 12);
+    test_items_3 = "\x01\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f";
+    CHECK(memcmp(lp->items, test_items_3, 12) == 0);
+
+    // delete single element from middle
+    status = numba_list_delete_slice(lp, 4, 5, 1);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 11);
+    test_items_3 = "\x01\x05\x06\x07\x09\x0a\x0b\x0c\x0d\x0e\x0f";
+    CHECK(memcmp(lp->items, test_items_3, 11) == 0);
+
+    // delete all elements except first and last
+    status = numba_list_delete_slice(lp, 1, 10, 1);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 2);
+    test_items_3 = "\x01\x0f";
+    CHECK(memcmp(lp->items, test_items_3, 2) == 0);
+
+    // delete all remaining elements
+    status = numba_list_delete_slice(lp, 0, lp->size, 1);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 0);
+    test_items_3 = "";
+    CHECK(memcmp(lp->items, test_items_3, 0) == 0);
+
+    // free existing list
+    numba_list_free(lp);
+
+    // Setup list for testing delete_slice with non unary step
+    status = numba_list_new(&lp, 1, 0);
+    CHECK(status == LIST_OK);
+    CHECK(lp->item_size == 1);
+    CHECK(lp->size == 0);
+    CHECK(lp->allocated == 0);
+    for (i = 0; i < 17 ; i++) {
+        status = numba_list_append(lp, (const char*)&i);
+        CHECK(status == LIST_OK);
+    }
+    CHECK(lp->size == 17);
+
+    // delete all items with even index (0, 2, ..., 16); the items that
+    // were at the odd indices remain
+    status = numba_list_delete_slice(lp, 0, 17, 2);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 8);
+    test_items_3 = "\x01\x03\x05\x07\x09\x0b\x0d\x0f";
+    CHECK(memcmp(lp->items, test_items_3, 8) == 0);
+
+    // delete with a step of 4, starting at index 1
+    status = numba_list_delete_slice(lp, 1, 8, 4);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 6);
+    test_items_3 = "\x01\x05\x07\x09\x0d\x0f";
+    CHECK(memcmp(lp->items, test_items_3, 6) == 0);
+
+    // delete with a step of 2, but finish before end of list
+    status = numba_list_delete_slice(lp, 0, 4, 2);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 4);
+    test_items_3 = "\x05\x09\x0d\x0f";
+    CHECK(memcmp(lp->items, test_items_3, 4) == 0);
+
+    // no-op on empty slice
+    status = numba_list_delete_slice(lp, 0, 0, 1);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 4);
+    test_items_3 = "\x05\x09\x0d\x0f";
+    CHECK(memcmp(lp->items, test_items_3, 4) == 0);
+
+    // no-op on empty slice, non-zero index
+    status = numba_list_delete_slice(lp, 2, 2, 1);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 4);
+    test_items_3 = "\x05\x09\x0d\x0f";
+    CHECK(memcmp(lp->items, test_items_3, 4) == 0);
+
+    // free list
+    numba_list_free(lp);
+
+    // Setup list for testing delete_slice with negative step
+    status = numba_list_new(&lp, 1, 0);
+    CHECK(status == LIST_OK);
+    CHECK(lp->item_size == 1);
+    CHECK(lp->size == 0);
+    CHECK(lp->allocated == 0);
+    for (i = 0; i < 17 ; i++) {
+        status = numba_list_append(lp, (const char*)&i);
+        CHECK(status == LIST_OK);
+    }
+    CHECK(lp->size == 17);
+
+    // delete all items using unary negative slice
+    status = numba_list_delete_slice(lp, 16, -1, -1);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 0);
+
+    // refill list
+    for (i = 0; i < 17 ; i++) {
+        status = numba_list_append(lp, (const char*)&i);
+        CHECK(status == LIST_OK);
+    }
+
+    // delete every other item using a negative step of 2
+    // need to start at index of last item (16) and
+    // go beyond first item, i.e. -1 in C
+    status = numba_list_delete_slice(lp, 16, -1, -2);
+    CHECK(status == LIST_OK);
+    CHECK(lp->size == 8);
+    test_items_3 = "\x01\x03\x05\x07\x09\x0b\x0d\x0f";
+    CHECK(memcmp(lp->items, test_items_3, 8) == 0);
+
+    // free list and return 0
+    numba_list_free(lp);
+    return 0;
+
+
+}
+
+#undef CHECK
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/listobject.h b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/listobject.h
new file mode 100644
index 0000000000000000000000000000000000000000..848ffe640a200f355d8df010670e22bff8ad6fb4
--- /dev/null
+++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/listobject.h
@@ -0,0 +1,135 @@
+/* Adapted from CPython3.7 Include/listobject.h
+ *
+ * The exact commit-id of the relevant file is:
+ *
+ * https://github.com/python/cpython/blob/51ddab8dae056867f3595ab3400bffc93f67c8d4/Include/listobject.h
+ *
+ *
+ * */
+
+#ifndef NUMBA_LIST_H
+#define NUMBA_LIST_H
+
+#include "Python.h"
+#include "cext.h"
+
+/* Signature of the reference counting callbacks; they receive a pointer to
+ * the item. */
+typedef void (*list_refcount_op_t)(const void*);
+
+/* Table of type-dependent operations; a NULL entry means "no operation". */
+typedef struct {
+    list_refcount_op_t item_incref;
+    list_refcount_op_t item_decref;
+} list_type_based_methods_table;
+
+/* This is the struct for the Numba typed list. It is largely inspired by the
+ * CPython list struct in listobject.h. In essence the list is a homogeneously
+ * typed container that can grow and shrink upon insertion and deletion. This
+ * means that appending an item to, or removing an item from, the end of the
+ * list has an O(1) amortized runtime. This matches the
+ * behaviour of the CPython list type and it will grow with the same
+ * increments.
+ *
+ * 'items' contains space for 'allocated' elements.  The number
+ * currently in use is 'size'. The size in bytes of the items stored in the
+ * list is given by 'item_size'.
+ *
+ * Invariants:
+ *     0 <= size <= allocated
+ *     len(list) == size
+ *     items == NULL implies size == allocated == 0
+ *
+ * FIXME: list.sort() temporarily sets allocated to -1 to detect mutations.
+ *
+ * Items must normally not be NULL, except during construction when
+ * the list is not yet visible outside the function that builds it.
+ *
+ * Additionally, this list has a boolean member 'is_mutable' that can be used
+ * to set a list as immutable. Two functions to query and set this member are
+ * provided. Any attempt to mutate an immutable list will result in a status
+ * of LIST_ERR_IMMUTABLE.
+ *
+ */
+typedef struct {
+    /* size of the list in items */
+    Py_ssize_t      size;
+    /* size of the list items in bytes */
+    Py_ssize_t      item_size;
+    /* total allocated slots in items */
+    Py_ssize_t      allocated;
+    /* is the list mutable */
+    int             is_mutable;
+    /* method table for type-dependent operations */
+    list_type_based_methods_table methods;
+    /* array/pointer for items. Interpretation is governed by item_size */
+    char  * items;
+} NB_List;
+
+
+/* Iterator state for walking a NB_List from the first item to the last. */
+typedef struct {
+    /* parent list */
+    NB_List         *parent;
+    /* list size */
+    Py_ssize_t       size;
+    /* iterator position; indicates the next position to read */
+    Py_ssize_t       pos;
+} NB_ListIter;
+
+/* The functions declared below are documented with their definitions in
+ * listobject.c. */
+
+NUMBA_EXPORT_FUNC(void)
+numba_list_set_method_table(NB_List *lp, list_type_based_methods_table *methods);
+
+NUMBA_EXPORT_FUNC(int)
+numba_list_new(NB_List **out, Py_ssize_t item_size, Py_ssize_t allocated);
+
+NUMBA_EXPORT_FUNC(void)
+numba_list_free(NB_List *lp);
+
+NUMBA_EXPORT_FUNC(char *)
+numba_list_base_ptr(NB_List *lp);
+
+NUMBA_EXPORT_FUNC(Py_ssize_t)
+numba_list_size_address(NB_List *lp);
+
+NUMBA_EXPORT_FUNC(Py_ssize_t)
+numba_list_length(NB_List *lp);
+
+NUMBA_EXPORT_FUNC(Py_ssize_t)
+numba_list_allocated(NB_List *lp);
+
+NUMBA_EXPORT_FUNC(int)
+numba_list_is_mutable(NB_List *lp);
+
+NUMBA_EXPORT_FUNC(void)
+numba_list_set_is_mutable(NB_List *lp, int is_mutable);
+
+NUMBA_EXPORT_FUNC(int)
+numba_list_setitem(NB_List *lp, Py_ssize_t index, const char *item);
+
+NUMBA_EXPORT_FUNC(int)
+numba_list_getitem(NB_List *lp, Py_ssize_t index, char *out);
+
+NUMBA_EXPORT_FUNC(int)
+numba_list_append(NB_List *lp, const char *item);
+
+// FIXME: should this be public?
+NUMBA_EXPORT_FUNC(int)
+numba_list_resize(NB_List *lp, Py_ssize_t newsize);
+
+NUMBA_EXPORT_FUNC(int)
+numba_list_delitem(NB_List *lp, Py_ssize_t index);
+
+NUMBA_EXPORT_FUNC(int)
+numba_list_delete_slice(NB_List *lp,
+                        Py_ssize_t start, Py_ssize_t stop, Py_ssize_t step);
+
+NUMBA_EXPORT_FUNC(size_t)
+numba_list_iter_sizeof(void);
+
+NUMBA_EXPORT_FUNC(void)
+numba_list_iter(NB_ListIter *it, NB_List *l);
+
+NUMBA_EXPORT_FUNC(int)
+numba_list_iter_next(NB_ListIter *it, const char **item_ptr);
+
+NUMBA_EXPORT_FUNC(int)
+numba_test_list(void);
+
+#endif
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/utils.c b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/utils.c
new file mode 100644
index 0000000000000000000000000000000000000000..17f4992f40d1eed2573b8dbda7cbdaa0d2280f76
--- /dev/null
+++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/cext/utils.c
@@ -0,0 +1,8 @@
+#include "cext.h"
+
+/* Align size *sz* to pointer width
+ *
+ * Returns sz rounded up to the next multiple of sizeof(void*); a value that
+ * is already a multiple is returned unchanged.
+ */
+Py_ssize_t
+aligned_size(Py_ssize_t sz) {
+    Py_ssize_t alignment = sizeof(void*);
+    return sz + (alignment - sz % alignment) % alignment;
+}
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/cloudpickle/__init__.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/cloudpickle/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..231a09795af5e7903fe039fc8e129a0360e55510
--- /dev/null
+++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/cloudpickle/__init__.py
@@ -0,0 +1,12 @@
+from __future__ import absolute_import
+
+# NOTE: The following imports are adapted to use as a vendored subpackage.
+# from https://github.com/cloudpipe/cloudpickle/blob/d3279a0689b769d5315fc6ff00cd0f5897844526/cloudpickle/init.py +from .cloudpickle import * # noqa +from .cloudpickle_fast import CloudPickler, dumps, dump # noqa + +# Conform to the convention used by python serialization libraries, which +# expose their Pickler subclass at top-level under the "Pickler" name. +Pickler = CloudPickler + +__version__ = '1.6.0' diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/cloudpickle/cloudpickle.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/cloudpickle/cloudpickle.py new file mode 100644 index 0000000000000000000000000000000000000000..f7fd4dcbf4f9a8c559a1149ccd95856d2aedc2d2 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/cloudpickle/cloudpickle.py @@ -0,0 +1,848 @@ +""" +This class is defined to override standard pickle functionality + +The goals of it follow: +-Serialize lambdas and nested functions to compiled byte code +-Deal with main module correctly +-Deal with other non-serializable objects + +It does not include an unpickler, as standard python unpickling suffices. + +This module was extracted from the `cloud` package, developed by `PiCloud, Inc. +`_. + +Copyright (c) 2012, Regents of the University of California. +Copyright (c) 2009 `PiCloud, Inc. `_. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the University of California, Berkeley nor the + names of its contributors may be used to endorse or promote + products derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" +from __future__ import print_function + +import builtins +import dis +import opcode +import platform +import sys +import types +import weakref +import uuid +import threading +import typing +import warnings + +from .compat import pickle +from typing import Generic, Union, Tuple, Callable +from pickle import _getattribute +from importlib._bootstrap import _find_spec + +try: # pragma: no branch + import typing_extensions as _typing_extensions + from typing_extensions import Literal, Final +except ImportError: + _typing_extensions = Literal = Final = None + +if sys.version_info >= (3, 5, 3): + from typing import ClassVar +else: # pragma: no cover + ClassVar = None + +if sys.version_info >= (3, 8): + from types import CellType +else: + def f(): + a = 1 + + def g(): + return a + return g + CellType = type(f().__closure__[0]) + + +# cloudpickle is meant for inter process communication: we expect all +# communicating processes to run the same Python version hence we favor +# communication speed over 
compatibility: +DEFAULT_PROTOCOL = pickle.HIGHEST_PROTOCOL + +# Track the provenance of reconstructed dynamic classes to make it possible to +# reconstruct instances from the matching singleton class definition when +# appropriate and preserve the usual "isinstance" semantics of Python objects. +_DYNAMIC_CLASS_TRACKER_BY_CLASS = weakref.WeakKeyDictionary() +_DYNAMIC_CLASS_TRACKER_BY_ID = weakref.WeakValueDictionary() +_DYNAMIC_CLASS_TRACKER_LOCK = threading.Lock() +_DYNAMIC_CLASS_TRACKER_REUSING = weakref.WeakSet() + +PYPY = platform.python_implementation() == "PyPy" + +builtin_code_type = None +if PYPY: + # builtin-code objects only exist in pypy + builtin_code_type = type(float.__new__.__code__) + +_extract_code_globals_cache = weakref.WeakKeyDictionary() + + +def _get_or_create_tracker_id(class_def): + with _DYNAMIC_CLASS_TRACKER_LOCK: + class_tracker_id = _DYNAMIC_CLASS_TRACKER_BY_CLASS.get(class_def) + if class_tracker_id is None: + class_tracker_id = uuid.uuid4().hex + _DYNAMIC_CLASS_TRACKER_BY_CLASS[class_def] = class_tracker_id + _DYNAMIC_CLASS_TRACKER_BY_ID[class_tracker_id] = class_def + return class_tracker_id + + +def _lookup_class_or_track(class_tracker_id, class_def): + if class_tracker_id is not None: + with _DYNAMIC_CLASS_TRACKER_LOCK: + orig_class_def = class_def + class_def = _DYNAMIC_CLASS_TRACKER_BY_ID.setdefault( + class_tracker_id, class_def) + _DYNAMIC_CLASS_TRACKER_BY_CLASS[class_def] = class_tracker_id + # Check if we are reusing a previous class_def + if orig_class_def is not class_def: + # Remember the class_def is being reused + _DYNAMIC_CLASS_TRACKER_REUSING.add(class_def) + return class_def + + +def _whichmodule(obj, name): + """Find the module an object belongs to. + + This function differs from ``pickle.whichmodule`` in two ways: + - it does not mangle the cases where obj's module is __main__ and obj was + not found in any module. 
+ - Errors arising during module introspection are ignored, as those errors + are considered unwanted side effects. + """ + if sys.version_info[:2] < (3, 7) and isinstance(obj, typing.TypeVar): # pragma: no branch # noqa + # Workaround bug in old Python versions: prior to Python 3.7, + # T.__module__ would always be set to "typing" even when the TypeVar T + # would be defined in a different module. + # + # For such older Python versions, we ignore the __module__ attribute of + # TypeVar instances and instead exhaustively lookup those instances in + # all currently imported modules. + module_name = None + else: + module_name = getattr(obj, '__module__', None) + + if module_name is not None: + return module_name + # Protect the iteration by using a copy of sys.modules against dynamic + # modules that trigger imports of other modules upon calls to getattr or + # other threads importing at the same time. + for module_name, module in sys.modules.copy().items(): + # Some modules such as coverage can inject non-module objects inside + # sys.modules + if ( + module_name == '__main__' or + module is None or + not isinstance(module, types.ModuleType) + ): + continue + try: + if _getattribute(module, name)[0] is obj: + return module_name + except Exception: + pass + return None + + +def _is_importable(obj, name=None): + """Dispatcher utility to test the importability of various constructs.""" + if isinstance(obj, types.FunctionType): + return _lookup_module_and_qualname(obj, name=name) is not None + elif issubclass(type(obj), type): + return _lookup_module_and_qualname(obj, name=name) is not None + elif isinstance(obj, types.ModuleType): + # We assume that sys.modules is primarily used as a cache mechanism for + # the Python import machinery. Checking if a module has been added in + # sys.modules is therefore a cheap and simple heuristic to tell us whether + # we can assume that a given module could be imported by name in + # another Python process.
+ return obj.__name__ in sys.modules + else: + raise TypeError( + "cannot check importability of {} instances".format( + type(obj).__name__) + ) + + +def _lookup_module_and_qualname(obj, name=None): + if name is None: + name = getattr(obj, '__qualname__', None) + if name is None: # pragma: no cover + # This used to be needed for Python 2.7 support but is probably not + # needed anymore. However we keep the __name__ introspection in case + # users of cloudpickle rely on this old behavior for unknown reasons. + name = getattr(obj, '__name__', None) + + module_name = _whichmodule(obj, name) + + if module_name is None: + # In this case, obj.__module__ is None AND obj was not found in any + # imported module. obj is thus treated as dynamic. + return None + + if module_name == "__main__": + return None + + # Note: if module_name is in sys.modules, the corresponding module is + # assumed importable at unpickling time. See #357 + module = sys.modules.get(module_name, None) + if module is None: + # The main reason why obj's module would not be imported is that this + # module has been dynamically created, using for example + # types.ModuleType. The other possibility is that module was removed + # from sys.modules after obj was created/imported. But this case is not + # supported, as the standard pickle does not support it either. + return None + + try: + obj2, parent = _getattribute(module, name) + except AttributeError: + # obj was not found inside the module it points to + return None + if obj2 is not obj: + return None + return module, name + + +def _extract_code_globals(co): + """ + Find all globals names read or written to by codeblock co + """ + out_names = _extract_code_globals_cache.get(co) + if out_names is None: + names = co.co_names + out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} + + # Declaring a function inside another one using the "def ..." 
+ # syntax generates a constant code object corresponding to the + # nested function. As the nested function may itself need + # global variables, we need to introspect its code, extract its + # globals (look for code objects in its co_consts attribute) and + # add the result to code_globals + if co.co_consts: + for const in co.co_consts: + if isinstance(const, types.CodeType): + out_names |= _extract_code_globals(const) + + _extract_code_globals_cache[co] = out_names + + return out_names + + +def _find_imported_submodules(code, top_level_dependencies): + """ + Find currently imported submodules used by a function. + + Submodules used by a function need to be detected and referenced for the + function to work correctly at depickling time. Because submodules can be + referenced as attribute of their parent package (``package.submodule``), we + need a special introspection technique that does not rely on GLOBAL-related + opcodes to find references of them in a code object. + + Example: + ``` + import concurrent.futures + import cloudpickle + def func(): + x = concurrent.futures.ThreadPoolExecutor + if __name__ == '__main__': + cloudpickle.dumps(func) + ``` + The globals extracted by cloudpickle in the function's state include the + concurrent package, but not its submodule (here, concurrent.futures), which + is the module used by func. _find_imported_submodules will detect the usage + of concurrent.futures. Saving this module alongside with func will ensure + that calling func once depickled does not fail due to concurrent.futures + not being imported + """ + + subimports = [] + # check if any known dependency is an imported package + for x in top_level_dependencies: + if (isinstance(x, types.ModuleType) and + hasattr(x, '__package__') and x.__package__): + # check if the package has any currently loaded sub-imports + prefix = x.__name__ + '.'
+ # A concurrent thread could mutate sys.modules, + # make sure we iterate over a copy to avoid exceptions + for name in list(sys.modules): + # Older versions of pytest will add a "None" module to + # sys.modules. + if name is not None and name.startswith(prefix): + # check whether the function can address the sub-module + tokens = set(name[len(prefix):].split('.')) + if not tokens - set(code.co_names): + subimports.append(sys.modules[name]) + return subimports + + +def cell_set(cell, value): + """Set the value of a closure cell. + + The point of this function is to set the cell_contents attribute of a cell + after its creation. This operation is necessary in case the cell contains a + reference to the function the cell belongs to, as when calling the + function's constructor + ``f = types.FunctionType(code, globals, name, argdefs, closure)``, + closure will not be able to contain the yet-to-be-created f. + + In Python3.7, cell_contents is writeable, so setting the contents of a cell + can be done simply using + >>> cell.cell_contents = value + + In earlier Python3 versions, the cell_contents attribute of a cell is read + only, but this limitation can be worked around by leveraging the Python 3 + ``nonlocal`` keyword. + + In Python2 however, this attribute is read only, and there is no + ``nonlocal`` keyword. For this reason, we need to come up with more + complicated hacks to set this attribute. + + The chosen approach is to create a function with a STORE_DEREF opcode, + which sets the content of a closure variable. Typically: + + >>> def inner(value): + ... lambda: cell # the lambda makes cell a closure + ... cell = value # cell is a closure, so this triggers a STORE_DEREF + + (Note that in Python2, A STORE_DEREF can never be triggered from an inner + function. The function g for example here + >>> def f(var): + ... def g(): + ... var += 1 + ... 
return g + + will not modify the closure variable ``var`` in place, but instead try to + load a local variable var and increment it. As g does not assign the local + variable ``var`` any initial value, calling f(1)() will fail at runtime.) + + Our objective is to set the value of a given cell ``cell``. So we need to + somehow reference our ``cell`` object into the ``inner`` function so that + this object (and not the smoke cell of the lambda function) gets affected + by the STORE_DEREF operation. + + In inner, ``cell`` is referenced as a cell variable (an enclosing variable + that is referenced by the inner function). If we create a new function + cell_set with the exact same code as ``inner``, but with ``cell`` marked as + a free variable instead, the STORE_DEREF will be applied on its closure - + ``cell``, which we can specify explicitly during construction! The new + cell_set variable thus actually sets the contents of a specified cell! + + Note: we do not make use of the ``nonlocal`` keyword to set the contents of + a cell in early python3 versions to limit possible syntax errors in case + test and checker libraries decide to parse the whole file.
+ """ + + if sys.version_info[:2] >= (3, 7): # pragma: no branch + cell.cell_contents = value + else: + _cell_set = types.FunctionType( + _cell_set_template_code, {}, '_cell_set', (), (cell,),) + _cell_set(value) + + +def _make_cell_set_template_code(): + def _cell_set_factory(value): + lambda: cell + cell = value + + co = _cell_set_factory.__code__ + + _cell_set_template_code = types.CodeType( + co.co_argcount, + co.co_kwonlyargcount, # Python 3 only argument + co.co_nlocals, + co.co_stacksize, + co.co_flags, + co.co_code, + co.co_consts, + co.co_names, + co.co_varnames, + co.co_filename, + co.co_name, + co.co_firstlineno, + co.co_lnotab, + co.co_cellvars, # co_freevars is initialized with co_cellvars + (), # co_cellvars is made empty + ) + return _cell_set_template_code + + +if sys.version_info[:2] < (3, 7): + _cell_set_template_code = _make_cell_set_template_code() + +# relevant opcodes +STORE_GLOBAL = opcode.opmap['STORE_GLOBAL'] +DELETE_GLOBAL = opcode.opmap['DELETE_GLOBAL'] +LOAD_GLOBAL = opcode.opmap['LOAD_GLOBAL'] +GLOBAL_OPS = (STORE_GLOBAL, DELETE_GLOBAL, LOAD_GLOBAL) +HAVE_ARGUMENT = dis.HAVE_ARGUMENT +EXTENDED_ARG = dis.EXTENDED_ARG + + +_BUILTIN_TYPE_NAMES = {} +for k, v in types.__dict__.items(): + if type(v) is type: + _BUILTIN_TYPE_NAMES[v] = k + + +def _builtin_type(name): + if name == "ClassType": # pragma: no cover + # Backward compat to load pickle files generated with cloudpickle + # < 1.3 even if loading pickle files from older versions is not + # officially supported. + return type + return getattr(types, name) + + +def _walk_global_ops(code): + """ + Yield (opcode, argument number) tuples for all + global-referencing instructions in *code*. 
+ """ + for instr in dis.get_instructions(code): + op = instr.opcode + if op in GLOBAL_OPS: + yield op, instr.arg + + +def _extract_class_dict(cls): + """Retrieve a copy of the dict of a class without the inherited methods""" + clsdict = dict(cls.__dict__) # copy dict proxy to a dict + if len(cls.__bases__) == 1: + inherited_dict = cls.__bases__[0].__dict__ + else: + inherited_dict = {} + for base in reversed(cls.__bases__): + inherited_dict.update(base.__dict__) + to_remove = [] + for name, value in clsdict.items(): + try: + base_value = inherited_dict[name] + if value is base_value: + to_remove.append(name) + except KeyError: + pass + for name in to_remove: + clsdict.pop(name) + return clsdict + + +if sys.version_info[:2] < (3, 7): # pragma: no branch + def _is_parametrized_type_hint(obj): + # This is very cheap but might generate false positives. + # general typing Constructs + is_typing = getattr(obj, '__origin__', None) is not None + + # typing_extensions.Literal + is_litteral = getattr(obj, '__values__', None) is not None + + # typing_extensions.Final + is_final = getattr(obj, '__type__', None) is not None + + # typing.Union/Tuple for old Python 3.5 + is_union = getattr(obj, '__union_params__', None) is not None + is_tuple = getattr(obj, '__tuple_params__', None) is not None + is_callable = ( + getattr(obj, '__result__', None) is not None and + getattr(obj, '__args__', None) is not None + ) + return any((is_typing, is_litteral, is_final, is_union, is_tuple, + is_callable)) + + def _create_parametrized_type_hint(origin, args): + return origin[args] +else: + _is_parametrized_type_hint = None + _create_parametrized_type_hint = None + + +def parametrized_type_hint_getinitargs(obj): + # The distorted type check semantic for typing construct becomes: + # ``type(obj) is type(TypeHint)``, which means "obj is a + # parametrized TypeHint" + if type(obj) is type(Literal): # pragma: no branch + initargs = (Literal, obj.__values__) + elif type(obj) is type(Final): # 
pragma: no branch + initargs = (Final, obj.__type__) + elif type(obj) is type(ClassVar): + initargs = (ClassVar, obj.__type__) + elif type(obj) is type(Generic): + parameters = obj.__parameters__ + if len(obj.__parameters__) > 0: + # in early Python 3.5, __parameters__ was sometimes + # preferred to __args__ + initargs = (obj.__origin__, parameters) + + else: + initargs = (obj.__origin__, obj.__args__) + elif type(obj) is type(Union): + if sys.version_info < (3, 5, 3): # pragma: no cover + initargs = (Union, obj.__union_params__) + else: + initargs = (Union, obj.__args__) + elif type(obj) is type(Tuple): + if sys.version_info < (3, 5, 3): # pragma: no cover + initargs = (Tuple, obj.__tuple_params__) + else: + initargs = (Tuple, obj.__args__) + elif type(obj) is type(Callable): + if sys.version_info < (3, 5, 3): # pragma: no cover + args = obj.__args__ + result = obj.__result__ + if args != Ellipsis: + if isinstance(args, tuple): + args = list(args) + else: + args = [args] + else: + (*args, result) = obj.__args__ + if len(args) == 1 and args[0] is Ellipsis: + args = Ellipsis + else: + args = list(args) + initargs = (Callable, (args, result)) + else: # pragma: no cover + raise pickle.PicklingError( + "Cloudpickle Error: Unknown type {}".format(type(obj)) + ) + return initargs + + +# Tornado support + +def is_tornado_coroutine(func): + """ + Return whether *func* is a Tornado coroutine function. + Running coroutines are not supported. 
+ """ + if 'tornado.gen' not in sys.modules: + return False + gen = sys.modules['tornado.gen'] + if not hasattr(gen, "is_coroutine_function"): + # Tornado version is too old + return False + return gen.is_coroutine_function(func) + + +def _rebuild_tornado_coroutine(func): + from tornado import gen + return gen.coroutine(func) + + +# including pickles unloading functions in this namespace +load = pickle.load +loads = pickle.loads + + +# hack for __import__ not working as desired +def subimport(name): + __import__(name) + return sys.modules[name] + + +def dynamic_subimport(name, vars): + mod = types.ModuleType(name) + mod.__dict__.update(vars) + mod.__dict__['__builtins__'] = builtins.__dict__ + return mod + + +def _gen_ellipsis(): + return Ellipsis + + +def _gen_not_implemented(): + return NotImplemented + + +def _get_cell_contents(cell): + try: + return cell.cell_contents + except ValueError: + # sentinel used by ``_fill_function`` which will leave the cell empty + return _empty_cell_value + + +def instance(cls): + """Create a new instance of a class. + + Parameters + ---------- + cls : type + The class to create an instance of. + + Returns + ------- + instance : cls + A new instance of ``cls``. + """ + return cls() + + +@instance +class _empty_cell_value(object): + """sentinel for empty closures + """ + @classmethod + def __reduce__(cls): + return cls.__name__ + + +def _fill_function(*args): + """Fills in the rest of function data into the skeleton function object + + The skeleton itself is create by _make_skel_func(). + """ + if len(args) == 2: + func = args[0] + state = args[1] + elif len(args) == 5: + # Backwards compat for cloudpickle v0.4.0, after which the `module` + # argument was introduced + func = args[0] + keys = ['globals', 'defaults', 'dict', 'closure_values'] + state = dict(zip(keys, args[1:])) + elif len(args) == 6: + # Backwards compat for cloudpickle v0.4.1, after which the function + # state was passed as a dict to the _fill_function it-self. 
+ func = args[0] + keys = ['globals', 'defaults', 'dict', 'module', 'closure_values'] + state = dict(zip(keys, args[1:])) + else: + raise ValueError('Unexpected _fill_value arguments: %r' % (args,)) + + # - At pickling time, any dynamic global variable used by func is + # serialized by value (in state['globals']). + # - At unpickling time, func's __globals__ attribute is initialized by + # first retrieving an empty isolated namespace that will be shared + # with other functions pickled from the same original module + # by the same CloudPickler instance and then updated with the + # content of state['globals'] to populate the shared isolated + # namespace with all the global variables that are specifically + # referenced for this function. + func.__globals__.update(state['globals']) + + func.__defaults__ = state['defaults'] + func.__dict__ = state['dict'] + if 'annotations' in state: + func.__annotations__ = state['annotations'] + if 'doc' in state: + func.__doc__ = state['doc'] + if 'name' in state: + func.__name__ = state['name'] + if 'module' in state: + func.__module__ = state['module'] + if 'qualname' in state: + func.__qualname__ = state['qualname'] + if 'kwdefaults' in state: + func.__kwdefaults__ = state['kwdefaults'] + # _cloudpickle_subimports is a set of submodules that must be loaded for + # the pickled function to work correctly at unpickling time. 
Now that these + # submodules are depickled (hence imported), they can be removed from the + # object's state (the object state only served as a reference holder to + # these submodules) + if '_cloudpickle_submodules' in state: + state.pop('_cloudpickle_submodules') + + cells = func.__closure__ + if cells is not None: + for cell, value in zip(cells, state['closure_values']): + if value is not _empty_cell_value: + cell_set(cell, value) + + return func + + +def _make_empty_cell(): + if False: + # trick the compiler into creating an empty cell in our lambda + cell = None + raise AssertionError('this route should not be executed') + + return (lambda: cell).__closure__[0] + + +def _make_cell(value=_empty_cell_value): + cell = _make_empty_cell() + if value is not _empty_cell_value: + cell_set(cell, value) + return cell + + +def _make_skel_func(code, cell_count, base_globals=None): + """ Creates a skeleton function object that contains just the provided + code and the correct number of cells in func_closure. All other + func attributes (e.g. func_globals) are empty. + """ + # This function is deprecated and should be removed in cloudpickle 1.7 + warnings.warn( + "A pickle file created using an old (<=1.4.1) version of cloudpicke " + "is currently being loaded. This is not supported by cloudpickle and " + "will break in cloudpickle 1.7", category=UserWarning + ) + # This is backward-compatibility code: for cloudpickle versions between + # 0.5.4 and 0.7, base_globals could be a string or None. base_globals + # should now always be a dictionary. 
+ if base_globals is None or isinstance(base_globals, str): + base_globals = {} + + base_globals['__builtins__'] = __builtins__ + + closure = ( + tuple(_make_empty_cell() for _ in range(cell_count)) + if cell_count >= 0 else + None + ) + return types.FunctionType(code, base_globals, None, None, closure) + + +def _make_skeleton_class(type_constructor, name, bases, type_kwargs, + class_tracker_id, extra): + """Build dynamic class with an empty __dict__ to be filled once memoized + + If class_tracker_id is not None, try to lookup an existing class definition + matching that id. If none is found, track a newly reconstructed class + definition under that id so that other instances stemming from the same + class id will also reuse this class definition. + + The "extra" variable is meant to be a dict (or None) that can be used for + forward compatibility shall the need arise. + """ + skeleton_class = types.new_class( + name, bases, {'metaclass': type_constructor}, + lambda ns: ns.update(type_kwargs) + ) + return _lookup_class_or_track(class_tracker_id, skeleton_class) + + +def _rehydrate_skeleton_class(skeleton_class, class_dict): + """Put attributes from `class_dict` back on `skeleton_class`. + + See CloudPickler.save_dynamic_class for more info. + """ + registry = None + for attrname, attr in class_dict.items(): + if attrname == "_abc_impl": + registry = attr + else: + setattr(skeleton_class, attrname, attr) + if registry is not None: + for subclass in registry: + skeleton_class.register(subclass) + + return skeleton_class + + +def _make_skeleton_enum(bases, name, qualname, members, module, + class_tracker_id, extra): + """Build dynamic enum with an empty __dict__ to be filled once memoized + + The creation of the enum class is inspired by the code of + EnumMeta._create_. + + If class_tracker_id is not None, try to lookup an existing enum definition + matching that id. 
If none is found, track a newly reconstructed enum + definition under that id so that other instances stemming from the same + class id will also reuse this enum definition. + + The "extra" variable is meant to be a dict (or None) that can be used for + forward compatibility shall the need arise. + """ + # enums always inherit from their base Enum class at the last position in + # the list of base classes: + enum_base = bases[-1] + metacls = enum_base.__class__ + classdict = metacls.__prepare__(name, bases) + + for member_name, member_value in members.items(): + classdict[member_name] = member_value + enum_class = metacls.__new__(metacls, name, bases, classdict) + enum_class.__module__ = module + enum_class.__qualname__ = qualname + + return _lookup_class_or_track(class_tracker_id, enum_class) + + +def _make_typevar(name, bound, constraints, covariant, contravariant, + class_tracker_id): + tv = typing.TypeVar( + name, *constraints, bound=bound, + covariant=covariant, contravariant=contravariant + ) + if class_tracker_id is not None: + return _lookup_class_or_track(class_tracker_id, tv) + else: # pragma: nocover + # Only for Python 3.5.3 compat. + return tv + + +def _decompose_typevar(obj): + try: + class_tracker_id = _get_or_create_tracker_id(obj) + except TypeError: # pragma: nocover + # TypeVar instances are not weakref-able in Python 3.5.3 + class_tracker_id = None + return ( + obj.__name__, obj.__bound__, obj.__constraints__, + obj.__covariant__, obj.__contravariant__, + class_tracker_id, + ) + + +def _typevar_reduce(obj): + # TypeVar instances have no __qualname__ hence we pass the name explicitly. 
+ module_and_name = _lookup_module_and_qualname(obj, name=obj.__name__) + if module_and_name is None: + return (_make_typevar, _decompose_typevar(obj)) + return (getattr, module_and_name) + + +def _get_bases(typ): + if hasattr(typ, '__orig_bases__'): + # For generic types (see PEP 560) + bases_attr = '__orig_bases__' + else: + # For regular class objects + bases_attr = '__bases__' + return getattr(typ, bases_attr) + + +def _make_dict_keys(obj): + return dict.fromkeys(obj).keys() + + +def _make_dict_values(obj): + return {i: _ for i, _ in enumerate(obj)}.values() + + +def _make_dict_items(obj): + return obj.items() diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/cloudpickle/cloudpickle_fast.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/cloudpickle/cloudpickle_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..2164504956109d44617e10f2609d24b0ea370422 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/cloudpickle/cloudpickle_fast.py @@ -0,0 +1,775 @@ +""" +New, fast version of the CloudPickler. + +This new CloudPickler class can now extend the fast C Pickler instead of the +previous Python implementation of the Pickler class. Because this functionality +is only available for Python versions 3.8+, a lot of backward-compatibility +code is also removed. + +Note that the C Pickler subclassing API is CPython-specific.
Therefore, some +guards present in cloudpickle.py that were written to handle PyPy specificities +are not present in cloudpickle_fast.py +""" +import _collections_abc +import abc +import copyreg +import io +import itertools +import logging +import sys +import struct +import types +import weakref +import typing + +from enum import Enum +from collections import ChainMap + +from .compat import pickle, Pickler +from .cloudpickle import ( + _extract_code_globals, _BUILTIN_TYPE_NAMES, DEFAULT_PROTOCOL, + _find_imported_submodules, _get_cell_contents, _is_importable, + _builtin_type, _get_or_create_tracker_id, _make_skeleton_class, + _make_skeleton_enum, _extract_class_dict, dynamic_subimport, subimport, + _typevar_reduce, _get_bases, _make_cell, _make_empty_cell, CellType, + _is_parametrized_type_hint, PYPY, cell_set, + parametrized_type_hint_getinitargs, _create_parametrized_type_hint, + builtin_code_type, + _make_dict_keys, _make_dict_values, _make_dict_items, + _DYNAMIC_CLASS_TRACKER_REUSING, +) + + +if pickle.HIGHEST_PROTOCOL >= 5 and not PYPY: + # Shorthands similar to pickle.dump/pickle.dumps + + def dump(obj, file, protocol=None, buffer_callback=None): + """Serialize obj as bytes streamed into file + + protocol defaults to cloudpickle.DEFAULT_PROTOCOL which is an alias to + pickle.HIGHEST_PROTOCOL. This setting favors maximum communication + speed between processes running the same Python version. + + Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure + compatibility with older versions of Python. + """ + CloudPickler( + file, protocol=protocol, buffer_callback=buffer_callback + ).dump(obj) + + def dumps(obj, protocol=None, buffer_callback=None): + """Serialize obj as a string of bytes allocated in memory + + protocol defaults to cloudpickle.DEFAULT_PROTOCOL which is an alias to + pickle.HIGHEST_PROTOCOL. This setting favors maximum communication + speed between processes running the same Python version. 
+ + Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure + compatibility with older versions of Python. + """ + with io.BytesIO() as file: + cp = CloudPickler( + file, protocol=protocol, buffer_callback=buffer_callback + ) + cp.dump(obj) + return file.getvalue() + +else: + # Shorthands similar to pickle.dump/pickle.dumps + def dump(obj, file, protocol=None): + """Serialize obj as bytes streamed into file + + protocol defaults to cloudpickle.DEFAULT_PROTOCOL which is an alias to + pickle.HIGHEST_PROTOCOL. This setting favors maximum communication + speed between processes running the same Python version. + + Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure + compatibility with older versions of Python. + """ + CloudPickler(file, protocol=protocol).dump(obj) + + def dumps(obj, protocol=None): + """Serialize obj as a string of bytes allocated in memory + + protocol defaults to cloudpickle.DEFAULT_PROTOCOL which is an alias to + pickle.HIGHEST_PROTOCOL. This setting favors maximum communication + speed between processes running the same Python version. + + Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure + compatibility with older versions of Python. 
# load/loads are the plain pickle loaders, exposed under this module's names
load = pickle.load
loads = pickle.loads


# COLLECTION OF OBJECTS __getnewargs__-LIKE METHODS
# -------------------------------------------------

def _class_getnewargs(obj):
    """Return the positional arguments used to rebuild the class *obj*.

    The tuple holds the metaclass, the class name, the bases given by
    ``_get_bases``, a dict of extra ``type`` keyword entries, the id given
    by ``_get_or_create_tracker_id`` and a trailing ``None``.
    """
    extra = {}
    own_namespace = obj.__dict__
    if "__slots__" in own_namespace:
        extra["__slots__"] = obj.__slots__

    # Only a ``__dict__`` entry that is a property is forwarded
    dict_attr = own_namespace.get('__dict__', None)
    if isinstance(dict_attr, property):
        extra['__dict__'] = dict_attr

    tracker_id = _get_or_create_tracker_id(obj)
    return (type(obj), obj.__name__, _get_bases(obj), extra, tracker_id, None)


def _enum_getnewargs(obj):
    """Return the positional arguments used to rebuild the enum class *obj*.

    Members are captured as a ``{name: value}`` dict built by iterating the
    enum class.
    """
    members = {member.name: member.value for member in obj}
    tracker_id = _get_or_create_tracker_id(obj)
    return (obj.__bases__, obj.__name__, obj.__qualname__, members,
            obj.__module__, tracker_id, None)


# COLLECTION OF OBJECTS RECONSTRUCTORS
# ------------------------------------
def _file_reconstructor(retval):
    """Return *retval* unchanged."""
    return retval
def _class_getstate(obj):
    """Return ``(clsdict, {})`` describing the state of the class *obj*.

    ``clsdict`` starts from ``_extract_class_dict(obj)`` and is then
    stripped of entries that are not shipped: ``__weakref__``, the ABC
    caches, the descriptors generated from ``__slots__`` and ``__dict__``.
    For classes whose metaclass derives from ``abc.ABCMeta`` the registered
    subclasses are stored as a list under ``"_abc_impl"``.
    """
    clsdict = _extract_class_dict(obj)
    clsdict.pop('__weakref__', None)

    if issubclass(type(obj), abc.ABCMeta):
        # Do not pickle the caches filled in by isinstance/issubclass
        # checks; only the registered subclasses of obj are kept.
        for cache_name in ('_abc_cache', '_abc_negative_cache',
                           '_abc_negative_cache_version'):
            clsdict.pop(cache_name, None)
        registry = clsdict.pop('_abc_registry', None)
        if registry is not None:
            # Here the registry is a WeakSet of the subclasses themselves
            clsdict["_abc_impl"] = list(registry)
        else:
            # Python 3.7+: caches and registry live in a single _abc_impl
            # attribute; abc._get_dump exposes the registry as weakrefs.
            clsdict.pop('_abc_impl', None)
            registry, _, _, _ = abc._get_dump(obj)
            clsdict["_abc_impl"] = [ref() for ref in registry]

    if "__slots__" in clsdict:
        # Member descriptors are re-created from __slots__ when the class is
        # rebuilt, so they do not need to be part of the state.
        slots = obj.__slots__
        if isinstance(slots, str):
            clsdict.pop(slots)
        else:
            for slot_name in slots:
                clsdict.pop(slot_name, None)

    clsdict.pop('__dict__', None)  # unpicklable property object

    return (clsdict, {})


def _enum_getstate(obj):
    """Return the ``(clsdict, slotstate)`` state of the enum class *obj*.

    Starts from ``_class_getstate(obj)`` and removes the enum bookkeeping
    attributes and the members themselves from ``clsdict``.
    """
    clsdict, slotstate = _class_getstate(obj)

    member_names = [member.name for member in obj]
    # These attributes are already handled by the metaclass.
    handled_by_metaclass = (
        "_generate_next_value_", "_member_names_", "_member_map_",
        "_member_type_", "_value2member_map_",
    )
    for attrname in handled_by_metaclass:
        clsdict.pop(attrname, None)
    for name in member_names:
        clsdict.pop(name)
    return clsdict, slotstate
def _code_reduce(obj):
    """Reduce the code object *obj* to ``(types.CodeType, args)``.

    The positional layout of the ``types.CodeType`` constructor differs
    between interpreter versions, so ``args`` is chosen from the attributes
    the code object actually exposes. ``co_lnotab`` is only read on
    interpreters that have no ``co_linetable``.
    """
    if hasattr(obj, "co_exceptiontable"):
        # Python 3.11+: the constructor also takes the qualified name and
        # the exception table, and a line table instead of co_lnotab.
        args = (
            obj.co_argcount, obj.co_posonlyargcount,
            obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize,
            obj.co_flags, obj.co_code, obj.co_consts, obj.co_names,
            obj.co_varnames, obj.co_filename, obj.co_name,
            obj.co_qualname, obj.co_firstlineno, obj.co_linetable,
            obj.co_exceptiontable, obj.co_freevars, obj.co_cellvars
        )
    elif hasattr(obj, "co_linetable"):
        # Python 3.10: the constructor expects co_linetable where older
        # versions expected co_lnotab.
        args = (
            obj.co_argcount, obj.co_posonlyargcount,
            obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize,
            obj.co_flags, obj.co_code, obj.co_consts, obj.co_names,
            obj.co_varnames, obj.co_filename, obj.co_name,
            obj.co_firstlineno, obj.co_linetable, obj.co_freevars,
            obj.co_cellvars
        )
    elif hasattr(obj, "co_posonlyargcount"):  # pragma: no branch
        args = (
            obj.co_argcount, obj.co_posonlyargcount,
            obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize,
            obj.co_flags, obj.co_code, obj.co_consts, obj.co_names,
            obj.co_varnames, obj.co_filename, obj.co_name,
            obj.co_firstlineno, obj.co_lnotab, obj.co_freevars,
            obj.co_cellvars
        )
    else:
        args = (
            obj.co_argcount, obj.co_kwonlyargcount, obj.co_nlocals,
            obj.co_stacksize, obj.co_flags, obj.co_code, obj.co_consts,
            obj.co_names, obj.co_varnames, obj.co_filename,
            obj.co_name, obj.co_firstlineno, obj.co_lnotab,
            obj.co_freevars, obj.co_cellvars
        )
    return types.CodeType, args


def _cell_reduce(obj):
    """Cell (containing values of a function's free variables) reducer"""
    try:
        contents = obj.cell_contents
    except ValueError:  # cell is empty
        return _make_empty_cell, ()
    return _make_cell, (contents, )


def _classmethod_reduce(obj):
    """Reduce a classmethod/staticmethod wrapper to its type and function."""
    return type(obj), (obj.__func__,)
def _getset_descriptor_reduce(obj):
    """Reduce a getset descriptor to an attribute lookup on its class."""
    return getattr, (obj.__objclass__, obj.__name__)


def _mappingproxy_reduce(obj):
    """Reduce a mapping proxy to a proxy over a dict copy of its items."""
    return types.MappingProxyType, (dict(obj),)


def _memoryview_reduce(obj):
    """Reduce a memoryview to the ``bytes`` it exposes."""
    return bytes, (obj.tobytes(),)


def _module_reduce(obj):
    """Reduce the module *obj*.

    Importable modules are reduced to ``subimport`` with their name.
    Other modules are reduced to ``dynamic_subimport`` with their name and
    a copy of their namespace from which ``__builtins__`` is left out.
    """
    if _is_importable(obj):
        return subimport, (obj.__name__,)
    # Work on a copy: popping ``__builtins__`` from ``obj.__dict__`` itself
    # would alter the live module as a side effect of pickling it.
    state = dict(vars(obj))
    state.pop('__builtins__', None)
    return dynamic_subimport, (obj.__name__, state)


def _method_reduce(obj):
    """Reduce a bound method to its function and the object it is bound to."""
    return (types.MethodType, (obj.__func__, obj.__self__))


def _logger_reduce(obj):
    """Reduce a logger to a ``logging.getLogger`` call with its name."""
    return logging.getLogger, (obj.name,)


def _root_logger_reduce(obj):
    """Reduce the root logger to an argument-less ``logging.getLogger``."""
    return logging.getLogger, ()


def _property_reduce(obj):
    """Reduce a property to its getter, setter, deleter and docstring."""
    return property, (obj.fget, obj.fset, obj.fdel, obj.__doc__)


def _weakset_reduce(obj):
    """Reduce a ``WeakSet`` to a list of the items it currently holds."""
    return weakref.WeakSet, (list(obj),)
def _class_reduce(obj):
    """Pick the reduction used for the class *obj*.

    The types of ``None``, ``Ellipsis`` and ``NotImplemented`` are rebuilt
    by calling ``type`` on the singleton; classes listed in
    ``_BUILTIN_TYPE_NAMES`` go through ``_builtin_type``; classes that are
    not importable are reduced dynamically. ``NotImplemented`` is returned
    for everything else.
    """
    for singleton in (None, Ellipsis, NotImplemented):
        if obj is type(singleton):  # noqa
            return type, (singleton,)
    if obj in _BUILTIN_TYPE_NAMES:
        return _builtin_type, (_BUILTIN_TYPE_NAMES[obj],)
    if not _is_importable(obj):
        return _dynamic_class_reduce(obj)
    return NotImplemented


def _dict_keys_reduce(obj):
    """Reduce a ``dict_keys`` view to a list of its keys."""
    # Only the keys are shipped: sending the whole dict might be unintended
    # and could leak sensitive information.
    keys = list(obj)
    return _make_dict_keys, (keys, )


def _dict_values_reduce(obj):
    """Reduce a ``dict_values`` view to a list of its values."""
    # Only the values are shipped: sending the whole dict might be
    # unintended and could leak sensitive information.
    values = list(obj)
    return _make_dict_values, (values, )


def _dict_items_reduce(obj):
    """Reduce a ``dict_items`` view to a dict built from its pairs."""
    pairs = dict(obj)
    return _make_dict_items, (pairs, )
def _class_setstate(obj, state):
    """Apply the pickled *state* to the class *obj* and return *obj*.

    *state* is a ``(clsdict, slotstate)`` pair. Each ``clsdict`` entry is
    set as an attribute on *obj*, except ``"_abc_impl"``, whose value is a
    collection of subclasses passed one by one to ``obj.register``.
    Classes found in ``_DYNAMIC_CLASS_TRACKER_REUSING`` are returned
    untouched.
    """
    # A class that is being reused bypasses the setstate logic entirely.
    if obj in _DYNAMIC_CLASS_TRACKER_REUSING:
        return obj

    clsdict, _slotstate = state
    registered = None
    for name, value in clsdict.items():
        if name != "_abc_impl":
            setattr(obj, name, value)
        else:
            registered = value

    if registered is not None:
        for subclass in registered:
            obj.register(subclass)

    return obj
_dispatch_table[types.MethodType] = _method_reduce + _dispatch_table[types.MappingProxyType] = _mappingproxy_reduce + _dispatch_table[weakref.WeakSet] = _weakset_reduce + _dispatch_table[typing.TypeVar] = _typevar_reduce + _dispatch_table[_collections_abc.dict_keys] = _dict_keys_reduce + _dispatch_table[_collections_abc.dict_values] = _dict_values_reduce + _dispatch_table[_collections_abc.dict_items] = _dict_items_reduce + + + dispatch_table = ChainMap(_dispatch_table, copyreg.dispatch_table) + + # function reducers are defined as instance methods of CloudPickler + # objects, as they rely on a CloudPickler attribute (globals_ref) + def _dynamic_function_reduce(self, func): + """Reduce a function that is not pickleable via attribute lookup.""" + newargs = self._function_getnewargs(func) + state = _function_getstate(func) + return (types.FunctionType, newargs, state, None, None, + _function_setstate) + + def _function_reduce(self, obj): + """Reducer for function objects. + + If obj is a top-level attribute of a file-backed module, this + reducer returns NotImplemented, making the CloudPickler fallback to + traditional _pickle.Pickler routines to save obj. Otherwise, it reduces + obj using a custom cloudpickle reducer designed specifically to handle + dynamic functions. + + As opposed to cloudpickle.py, There no special handling for builtin + pypy functions because cloudpickle_fast is CPython-specific. + """ + if _is_importable(obj): + return NotImplemented + else: + return self._dynamic_function_reduce(obj) + + def _function_getnewargs(self, func): + code = func.__code__ + + # base_globals represents the future global namespace of func at + # unpickling time. 
def dump(self, obj):
    """Pickle *obj* with ``Pickler.dump`` and return its result.

    A ``RuntimeError`` whose first argument is a string mentioning
    "recursion" is re-raised as ``pickle.PicklingError`` (chained to the
    original error). Any other ``RuntimeError`` propagates unchanged.
    """
    try:
        return Pickler.dump(self, obj)
    except RuntimeError as e:
        # A RuntimeError may carry no args at all, or a first arg that is
        # not a string; check before indexing/searching so the original
        # error is not masked by an IndexError or TypeError.
        first_arg = e.args[0] if e.args else None
        if isinstance(first_arg, str) and "recursion" in first_arg:
            msg = (
                "Could not pickle object as excessively deep recursion "
                "required."
            )
            raise pickle.PicklingError(msg) from e
        raise
def __init__(self, file, protocol=None, buffer_callback=None):
    """Initialise the pickler writing to *file*.

    ``protocol`` falls back to ``DEFAULT_PROTOCOL`` when it is ``None``;
    ``buffer_callback`` is forwarded to ``Pickler.__init__`` as is.
    """
    chosen = DEFAULT_PROTOCOL if protocol is None else protocol
    Pickler.__init__(
        self, file, protocol=chosen, buffer_callback=buffer_callback
    )
    # Maps the id of a function's __globals__ to the namespace pickled for
    # it, so that functions sharing globals at pickling time still share
    # them once unpickled.
    self.globals_ref = {}
    self.proto = int(chosen)


def reducer_override(self, obj):
    """Type-agnostic reducing callback for functions and classes.

    Classes (objects whose type is a subclass of ``type``, whatever their
    metaclass) are handed to ``_class_reduce`` and plain Python functions
    to ``self._function_reduce``. For any other object ``NotImplemented``
    is returned, which makes the pickler fall back to its regular
    machinery, including the dispatch_table.

    On Python < 3.7 parametrized type hints are recognised first and
    reduced to a ``_create_parametrized_type_hint`` call.
    """
    if sys.version_info[:2] < (3, 7) and _is_parametrized_type_hint(obj):  # noqa  # pragma: no branch
        hint_args = parametrized_type_hint_getinitargs(obj)
        return (_create_parametrized_type_hint, hint_args)

    try:
        is_class = issubclass(type(obj), type)
    except TypeError:  # t is not a class (old Boost; see SF #502085)
        is_class = False

    if is_class:
        return _class_reduce(obj)
    if isinstance(obj, types.FunctionType):
        return self._function_reduce(obj)
    # fallback to save_global, including the Pickler's dispatch_table
    return NotImplemented
def __init__(self, file, protocol=None):
    """Initialise the pickler writing to *file*.

    ``protocol`` falls back to ``DEFAULT_PROTOCOL`` when it is ``None``.
    """
    chosen = DEFAULT_PROTOCOL if protocol is None else protocol
    Pickler.__init__(self, file, protocol=chosen)
    # Maps the id of a function's __globals__ to the namespace pickled for
    # it, so that functions sharing globals at pickling time still share
    # them once unpickled.
    self.globals_ref = {}
    assert hasattr(self, 'proto')


def _save_reduce_pickle5(self, func, args, state=None, listitems=None,
                         dictitems=None, state_setter=None, obj=None):
    """Save a reduce tuple, then emit a ``state_setter(obj, state)`` call.

    ``save_reduce`` is invoked with ``state=None``; the state is instead
    applied by the opcodes written afterwards (a backport of the
    Python 3.8 state_setter pickle operations).
    """
    save = self.save
    write = self.write
    self.save_reduce(
        func, args, state=None, listitems=listitems,
        dictitems=dictitems, obj=obj
    )
    # Push the setter and its two arguments; obj is already memoized at
    # this point, so saving it again is a simple BINGET opcode.
    for item in (state_setter, obj, state):
        save(item)
    # TUPLE2 packs (obj, state), REDUCE triggers state_setter(obj, state),
    # and POP drops the call's result: the setter works in place, so its
    # return value is of no interest.
    for opcode in (pickle.TUPLE2, pickle.REDUCE, pickle.POP):
        write(opcode)
For + # this reason, it is easier to detect them using a + # duck-typing-based check (``_is_parametrized_type_hint``) than + # to populate the Pickler's dispatch with type-specific savers. + self.save_reduce( + _create_parametrized_type_hint, + parametrized_type_hint_getinitargs(obj), + obj=obj + ) + elif name is not None: + Pickler.save_global(self, obj, name=name) + elif not _is_importable(obj, name=name): + self._save_reduce_pickle5(*_dynamic_class_reduce(obj), obj=obj) + else: + Pickler.save_global(self, obj, name=name) + dispatch[type] = save_global + + def save_function(self, obj, name=None): + """ Registered with the dispatch to handle all function types. + + Determines what kind of function obj is (e.g. lambda, defined at + interactive prompt, etc) and handles the pickling appropriately. + """ + if _is_importable(obj, name=name): + return Pickler.save_global(self, obj, name=name) + elif PYPY and isinstance(obj.__code__, builtin_code_type): + return self.save_pypy_builtin_func(obj) + else: + return self._save_reduce_pickle5( + *self._dynamic_function_reduce(obj), obj=obj + ) + + def save_pypy_builtin_func(self, obj): + """Save pypy equivalent of builtin functions. + PyPy does not have the concept of builtin-functions. Instead, + builtin-functions are simple function instances, but with a + builtin-code attribute. + Most of the time, builtin functions should be pickled by attribute. + But PyPy has flaky support for __qualname__, so some builtin + functions such as float.__new__ will be classified as dynamic. For + this reason only, we created this special routine. Because + builtin-functions are not expected to have closure or globals, + there is no additional hack (compared the one already implemented + in pickle) to protect ourselves from reference cycles. A simple + (reconstructor, newargs, obj.__dict__) tuple is save_reduced. 
Note + also that PyPy improved their support for __qualname__ in v3.6, so + this routing should be removed when cloudpickle supports only PyPy + 3.6 and later. + """ + rv = (types.FunctionType, (obj.__code__, {}, obj.__name__, + obj.__defaults__, obj.__closure__), + obj.__dict__) + self.save_reduce(*rv, obj=obj) + + dispatch[types.FunctionType] = save_function diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/cloudpickle/compat.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/cloudpickle/compat.py new file mode 100644 index 0000000000000000000000000000000000000000..afa285f62903daa758cd38f1bd8b22ea49df3473 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/cloudpickle/compat.py @@ -0,0 +1,13 @@ +import sys + + +if sys.version_info < (3, 8): + try: + import pickle5 as pickle # noqa: F401 + from pickle5 import Pickler # noqa: F401 + except ImportError: + import pickle # noqa: F401 + from pickle import _Pickler as Pickler # noqa: F401 +else: + import pickle # noqa: F401 + from _pickle import Pickler # noqa: F401 diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/__init__.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/analysis.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..5b65599c5e1c5ba28e70834a2fbc5c30c6363435 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/analysis.py @@ -0,0 +1,722 @@ +""" +Utils for IR analysis +""" +import operator +from functools import reduce +from collections import namedtuple, defaultdict + +from .controlflow import CFGraph +from numba.core import types, errors, ir, consts +from numba.misc import special + +# +# Analysis related to variable lifetime +# + +_use_defs_result = 
# other packages that define new nodes add calls for finding defs
# format: {type:function}
ir_extension_usedefs = {}


def compute_use_defs(blocks):
    """
    Find variable use/def per block.

    Returns a ``_use_defs_result`` whose ``usemap`` and ``defmap`` map each
    block offset to a set of variable names.
    """
    usemap = {}   # { block offset -> set of vars }
    defmap = {}   # { block offset -> set of vars }
    for offset, ir_block in blocks.items():
        usemap[offset] = use_set = set()
        defmap[offset] = def_set = set()
        for stmt in ir_block.body:
            stmt_type = type(stmt)
            if stmt_type in ir_extension_usedefs:
                # Extension nodes update the use/def sets themselves
                ir_extension_usedefs[stmt_type](stmt, use_set, def_set)
                continue
            if isinstance(stmt, ir.Assign):
                rhs = stmt.value
                if isinstance(rhs, ir.Inst):
                    rhs_names = {var.name for var in rhs.list_vars()}
                elif isinstance(rhs, ir.Var):
                    rhs_names = {rhs.name}
                elif isinstance(rhs, (ir.Arg, ir.Const, ir.Global,
                                      ir.FreeVar)):
                    rhs_names = ()
                else:
                    raise AssertionError('unreachable', type(rhs))
                # The target counts as a definition only when it does not
                # also appear on the right-hand side of the assignment
                target_name = stmt.target.name
                if target_name not in rhs_names:
                    def_set.add(target_name)

            # do not include locally defined vars to use-map
            use_set.update(var.name for var in stmt.list_vars()
                           if var.name not in def_set)

    return _use_defs_result(usemap=usemap, defmap=defmap)


def compute_live_map(cfg, blocks, var_use_map, var_def_map):
    """
    Find variables that must be alive at the ENTRY of each block.

    Two passes are each repeated until the sizes of all per-block sets stop
    changing: the first collects the variables used or defined that reach
    every block, the second pushes variable usage backward to predecessors.
    """
    def set_sizes(mapping):
        """Snapshot used to detect that a fix-point has been reached."""
        return tuple(map(len, mapping.values()))

    def run_to_fix_point(step, mapping):
        """Apply ``step`` to ``mapping`` until the snapshot is stable."""
        before, after = None, set_sizes(mapping)
        while before != after:
            step(mapping)
            before, after = after, set_sizes(mapping)

    def def_reach(reach):
        """Find all variable definitions reachable at the entry of a block"""
        for offset in var_def_map:
            reach[offset] |= var_def_map[offset] | var_use_map[offset]
            # Propagate to outgoing nodes
            for successor, _ in cfg.successors(offset):
                reach[successor] |= reach[offset]

    def liveness(live):
        """Find live variables by pushing var usage backward."""
        for offset in live:
            live_here = live[offset]
            for predecessor, _data in cfg.predecessors(offset):
                # Reachable at the predecessor...
                reachable = live_here & def_reach_map[predecessor]
                # ...but not defined in the predecessor
                live[predecessor] |= reachable - var_def_map[predecessor]

    live_map = {offset: set(var_use_map[offset]) for offset in blocks.keys()}

    def_reach_map = defaultdict(set)
    run_to_fix_point(def_reach, def_reach_map)
    run_to_fix_point(liveness, live_map)
    return live_map


_dead_maps_result = namedtuple('dead_maps_result',
                               'internal,escaping,combined')
+ """ + # The following three dictionaries will be + # { block offset -> set of variables to delete } + # all vars that should be deleted at the start of the successors + escaping_dead_map = defaultdict(set) + # all vars that should be deleted within this block + internal_dead_map = defaultdict(set) + # all vars that should be deleted after the function exit + exit_dead_map = defaultdict(set) + + for offset, ir_block in blocks.items(): + # live vars WITHIN the block will include all the locally + # defined variables + cur_live_set = live_map[offset] | var_def_map[offset] + # vars alive in the outgoing blocks + outgoing_live_map = dict((out_blk, live_map[out_blk]) + for out_blk, _data in cfg.successors(offset)) + # vars to keep alive for the terminator + terminator_liveset = set(v.name + for v in ir_block.terminator.list_vars()) + # vars to keep alive in the successors + combined_liveset = reduce(operator.or_, outgoing_live_map.values(), + set()) + # include variables used in terminator + combined_liveset |= terminator_liveset + # vars that are dead within the block because they are not + # propagated to any outgoing blocks + internal_set = cur_live_set - combined_liveset + internal_dead_map[offset] = internal_set + # vars that escape this block + escaping_live_set = cur_live_set - internal_set + for out_blk, new_live_set in outgoing_live_map.items(): + # successor should delete the unused escaped vars + new_live_set = new_live_set | var_def_map[out_blk] + escaping_dead_map[out_blk] |= escaping_live_set - new_live_set + + # if no outgoing blocks + if not outgoing_live_map: + # insert var used by terminator + exit_dead_map[offset] = terminator_liveset + + # Verify that the dead maps cover all live variables + all_vars = reduce(operator.or_, live_map.values(), set()) + internal_dead_vars = reduce(operator.or_, internal_dead_map.values(), + set()) + escaping_dead_vars = reduce(operator.or_, escaping_dead_map.values(), + set()) + exit_dead_vars = reduce(operator.or_, 
def compute_live_variables(cfg, blocks, var_def_map, var_dead_map):
    """
    Compute the variables that are live on entry to each block.

    ``var_def_map`` and ``var_dead_map`` give, per block, the variables
    defined and deleted in that block. ``cfg.successors(label)`` must yield
    ``(successor, data)`` pairs.

    Returns a ``defaultdict(set)`` mapping each block label to the set of
    variables available at its entry. A block that receives nothing from a
    predecessor maps to an empty set.

    Note: this is a forward (top-down) dataflow pass that collects what is
    actually available at a block entry.
    """
    # block label -> variables available when the block starts
    entry_live = defaultdict(set)

    def snapshot():
        # Sets only ever grow here, so their sizes are enough to detect
        # whether another sweep changed anything.
        return tuple(len(names) for names in entry_live.values())

    previous = None
    current = snapshot()

    # Sweep until nothing changes; loops in the graph make a single pass
    # insufficient.
    while previous != current:
        for label in blocks:
            # what enters the block plus what it defines, minus what it
            # deletes, is what leaves the block
            leaving = ((entry_live[label] | var_def_map[label])
                       - var_dead_map[label])
            for successor, _ in cfg.successors(label):
                entry_live[successor] |= leaving

        previous, current = current, snapshot()

    return entry_live
+ + func_ir is the IR + called_args are the actual arguments with which the function is called + """ + from numba.core.ir_utils import (get_definition, guard, find_const, + GuardException) + + DEBUG = 0 + + def find_branches(func_ir): + # find *all* branches + branches = [] + for blk in func_ir.blocks.values(): + branch_or_jump = blk.body[-1] + if isinstance(branch_or_jump, ir.Branch): + branch = branch_or_jump + pred = guard(get_definition, func_ir, branch.cond.name) + if pred is not None and pred.op == "call": + function = guard(get_definition, func_ir, pred.func) + if (function is not None and + isinstance(function, ir.Global) and + function.value is bool): + condition = guard(get_definition, func_ir, pred.args[0]) + if condition is not None: + branches.append((branch, condition, blk)) + return branches + + def do_prune(take_truebr, blk): + keep = branch.truebr if take_truebr else branch.falsebr + # replace the branch with a direct jump + jmp = ir.Jump(keep, loc=branch.loc) + blk.body[-1] = jmp + return 1 if keep == branch.truebr else 0 + + def prune_by_type(branch, condition, blk, *conds): + # this prunes a given branch and fixes up the IR + # at least one needs to be a NoneType + lhs_cond, rhs_cond = conds + lhs_none = isinstance(lhs_cond, types.NoneType) + rhs_none = isinstance(rhs_cond, types.NoneType) + if lhs_none or rhs_none: + try: + take_truebr = condition.fn(lhs_cond, rhs_cond) + except Exception: + return False, None + if DEBUG > 0: + kill = branch.falsebr if take_truebr else branch.truebr + print("Pruning %s" % kill, branch, lhs_cond, rhs_cond, + condition.fn) + taken = do_prune(take_truebr, blk) + return True, taken + return False, None + + def prune_by_value(branch, condition, blk, *conds): + lhs_cond, rhs_cond = conds + try: + take_truebr = condition.fn(lhs_cond, rhs_cond) + except Exception: + return False, None + if DEBUG > 0: + kill = branch.falsebr if take_truebr else branch.truebr + print("Pruning %s" % kill, branch, lhs_cond, rhs_cond, 
condition.fn) + taken = do_prune(take_truebr, blk) + return True, taken + + def prune_by_predicate(branch, pred, blk): + try: + # Just to prevent accidents, whilst already guarded, ensure this + # is an ir.Const + if not isinstance(pred, (ir.Const, ir.FreeVar, ir.Global)): + raise TypeError('Expected constant Numba IR node') + take_truebr = bool(pred.value) + except TypeError: + return False, None + if DEBUG > 0: + kill = branch.falsebr if take_truebr else branch.truebr + print("Pruning %s" % kill, branch, pred) + taken = do_prune(take_truebr, blk) + return True, taken + + class Unknown(object): + pass + + def resolve_input_arg_const(input_arg_idx): + """ + Resolves an input arg to a constant (if possible) + """ + input_arg_ty = called_args[input_arg_idx] + + # comparing to None? + if isinstance(input_arg_ty, types.NoneType): + return input_arg_ty + + # is it a kwarg default + if isinstance(input_arg_ty, types.Omitted): + val = input_arg_ty.value + if isinstance(val, types.NoneType): + return val + elif val is None: + return types.NoneType('none') + + # literal type, return the type itself so comparisons like `x == None` + # still work as e.g. 
x = types.int64 will never be None/NoneType so + # the branch can still be pruned + return getattr(input_arg_ty, 'literal_type', Unknown()) + + if DEBUG > 1: + print("before".center(80, '-')) + print(func_ir.dump()) + + phi2lbl = dict() + phi2asgn = dict() + for lbl, blk in func_ir.blocks.items(): + for stmt in blk.body: + if isinstance(stmt, ir.Assign): + if isinstance(stmt.value, ir.Expr) and stmt.value.op == 'phi': + phi2lbl[stmt.value] = lbl + phi2asgn[stmt.value] = stmt + + # This looks for branches where: + # at least one arg of the condition is in input args and const + # at least one an arg of the condition is a const + # if the condition is met it will replace the branch with a jump + branch_info = find_branches(func_ir) + # stores conditions that have no impact post prune + nullified_conditions = [] + + for branch, condition, blk in branch_info: + const_conds = [] + if isinstance(condition, ir.Expr) and condition.op == 'binop': + prune = prune_by_value + for arg in [condition.lhs, condition.rhs]: + resolved_const = Unknown() + arg_def = guard(get_definition, func_ir, arg) + if isinstance(arg_def, ir.Arg): + # it's an e.g. 
literal argument to the function + resolved_const = resolve_input_arg_const(arg_def.index) + prune = prune_by_type + else: + # it's some const argument to the function, cannot use guard + # here as the const itself may be None + try: + resolved_const = find_const(func_ir, arg) + if resolved_const is None: + resolved_const = types.NoneType('none') + except GuardException: + pass + + if not isinstance(resolved_const, Unknown): + const_conds.append(resolved_const) + + # lhs/rhs are consts + if len(const_conds) == 2: + # prune the branch, switch the branch for an unconditional jump + prune_stat, taken = prune(branch, condition, blk, *const_conds) + if(prune_stat): + # add the condition to the list of nullified conditions + nullified_conditions.append(nullified(condition, taken, + True)) + else: + # see if this is a branch on a constant value predicate + resolved_const = Unknown() + try: + pred_call = get_definition(func_ir, branch.cond) + resolved_const = find_const(func_ir, pred_call.args[0]) + if resolved_const is None: + resolved_const = types.NoneType('none') + except GuardException: + pass + + if not isinstance(resolved_const, Unknown): + prune_stat, taken = prune_by_predicate(branch, condition, blk) + if(prune_stat): + # add the condition to the list of nullified conditions + nullified_conditions.append(nullified(condition, taken, + False)) + + # 'ERE BE DRAGONS... + # It is the evaluation of the condition expression that often trips up type + # inference, so ideally it would be removed as it is effectively rendered + # dead by the unconditional jump if a branch was pruned. However, there may + # be references to the condition that exist in multiple places (e.g. dels) + # and we cannot run DCE here as typing has not taken place to give enough + # information to run DCE safely. 
Upshot of all this is the condition gets + # rewritten below into a benign const that typing will be happy with and DCE + # can remove it and its reference post typing when it is safe to do so + # (if desired). It is required that the const is assigned a value that + # indicates the branch taken as its mutated value would be read in the case + # of object mode fall back in place of the condition itself. For + # completeness the func_ir._definitions and ._consts are also updated to + # make the IR state self consistent. + + deadcond = [x.condition for x in nullified_conditions] + for _, cond, blk in branch_info: + if cond in deadcond: + for x in blk.body: + if isinstance(x, ir.Assign) and x.value is cond: + # rewrite the condition as a true/false bit + nullified_info = nullified_conditions[deadcond.index(cond)] + # only do a rewrite of conditions, predicates need to retain + # their value as they may be used later. + if nullified_info.rewrite_stmt: + branch_bit = nullified_info.taken_br + x.value = ir.Const(branch_bit, loc=x.loc) + # update the specific definition to the new const + defns = func_ir._definitions[x.target.name] + repl_idx = defns.index(cond) + defns[repl_idx] = x.value + + # Check post dominators of dead nodes from in the original CFG for use of + # vars that are being removed in the dead blocks which might be referred to + # by phi nodes. + # + # Multiple things to fix up: + # + # 1. Cases like: + # + # A A + # |\ | + # | B --> B + # |/ | + # C C + # + # i.e. the branch is dead but the block is still alive. In this case CFG + # simplification will fuse A-B-C and any phi in C can be updated as an + # direct assignment from the last assigned version in the dominators of the + # fused block. + # + # 2. Cases like: + # + # A A + # / \ | + # B C --> B + # \ / | + # D D + # + # i.e. the block C is dead. In this case the phis in D need updating to + # reflect the collapse of the phi condition. 
def rewrite_semantic_constants(func_ir, called_args):
    """
    Rewrite values that are constant by their semantics as ``ir.Const`` nodes.

    This gives branch pruning the best chance possible of killing branches.
    Two patterns are rewritten in place, on ``ir.Assign`` statements whose
    value is an ``ir.Expr``:

    * ``arg.ndim`` where ``arg`` is a function argument whose entry in
      ``called_args`` is a ``types.Array`` becomes ``Const(ndim)``;
    * ``len(x)`` where ``x`` is a function argument whose entry in
      ``called_args`` is a ``types.BaseTuple``, or a ``typed_getitem``
      expression whose ``dtype`` is a ``types.BaseTuple``, becomes
      ``Const(count)``.

    func_ir is the IR; its statements and ``_definitions`` are mutated.
    called_args are the actual arguments with which the function is called,
    indexed by ``ir.Arg.index``.
    """
    # Function-scope import, as in the original implementation.
    from numba.core.ir_utils import get_definition, guard

    DEBUG = 0

    if DEBUG > 1:
        print(("rewrite_semantic_constants: " +
               func_ir.func_id.func_name).center(80, '-'))
        print("before".center(80, '*'))
        func_ir.dump()

    def rewrite_statement(stmt, old_val, new_val):
        """
        Rewrite ``stmt`` as an ir.Const of ``new_val`` and replace
        ``old_val`` by that const in func_ir._definitions.
        """
        stmt.value = ir.Const(new_val, stmt.loc)
        defns = func_ir._definitions[stmt.target.name]
        repl_idx = defns.index(old_val)
        defns[repl_idx] = stmt.value

    def rewrite_array_ndim(stmt, val):
        # rewrite Array.ndim as const(ndim)
        if getattr(val, 'op', None) != 'getattr' or val.attr != 'ndim':
            return
        arg_def = guard(get_definition, func_ir, val.value)
        if isinstance(arg_def, ir.Arg):
            argty = called_args[arg_def.index]
            if isinstance(argty, types.Array):
                rewrite_statement(stmt, val, argty.ndim)

    def rewrite_tuple_len(stmt, val):
        # rewrite len(tuple) as const(len(tuple))
        if getattr(val, 'op', None) != 'call':
            return
        func = guard(get_definition, func_ir, val.func)
        if not (isinstance(func, ir.Global) and
                getattr(func, 'value', None) is len):
            return

        (arg,) = val.args
        arg_def = guard(get_definition, func_ir, arg)
        if isinstance(arg_def, ir.Arg):
            argty = called_args[arg_def.index]
        elif isinstance(arg_def, ir.Expr) and arg_def.op == 'typed_getitem':
            argty = arg_def.dtype
        else:
            return
        if isinstance(argty, types.BaseTuple):
            rewrite_statement(stmt, val, argty.count)

    for blk in func_ir.blocks.values():
        for stmt in blk.body:
            if isinstance(stmt, ir.Assign):
                val = stmt.value
                if isinstance(val, ir.Expr):
                    # Both rewriters get the expression as it was before any
                    # rewrite, so the definition lookup targets that object.
                    rewrite_array_ndim(stmt, val)
                    rewrite_tuple_len(stmt, val)

    if DEBUG > 1:
        print("after".center(80, '*'))
        func_ir.dump()
        print('-' * 80)
def hllines(code, style):
    """Given a code string, return a list of html-highlighted lines.

    ``style`` is passed to pygments' ``HtmlFormatter`` as its ``style``
    argument.

    Raises ImportError if the 'pygments' package cannot be imported.
    """
    try:
        from pygments import highlight
        from pygments.lexers import PythonLexer
        from pygments.formatters import HtmlFormatter
    except ImportError as exc:
        # keep the underlying failure attached for debugging
        raise ImportError("please install the 'pygments' package") from exc
    pylex = PythonLexer()
    hf = HtmlFormatter(noclasses=True, style=style, nowrap=True)
    res = highlight(code, pylex, hf)
    return res.splitlines()
def get_ansi_template():
    """Return the jinja2 ``Template`` used for the ANSI (terminal) rendering.

    The template is rendered with a ``func_data`` mapping.

    Raises ImportError if the 'jinja2' package cannot be imported.
    """
    try:
        from jinja2 import Template
    except ImportError as exc:
        # keep the underlying failure attached for debugging
        raise ImportError("please install the 'jinja2' package") from exc
    # NOTE(review): every gap between tags below is trimmed by a ``-``
    # whitespace-control marker, so the layout of this literal should not
    # affect the rendered text -- confirm against the shipped file.
    return Template("""
    {%- for func_key in func_data.keys() -%}
        Function name: \x1b[34m{{func_data[func_key]['funcname']}}\x1b[39;49;00m
        {%- if func_data[func_key]['filename'] -%}
        {{'\n'}}In file: \x1b[34m{{func_data[func_key]['filename'] -}}\x1b[39;49;00m
        {%- endif -%}
        {{'\n'}}With signature: \x1b[34m{{func_key[1]}}\x1b[39;49;00m
        {{- "\n" -}}
        {%- for num, line, hl, hc in func_data[func_key]['pygments_lines'] -%}
                {{-'\n'}}{{ num}}: {{hc-}}
                {%- if func_data[func_key]['ir_lines'][num] -%}
                    {%- for ir_line, ir_line_type in func_data[func_key]['ir_lines'][num] %}
                        {{-'\n'}}--{{- ' '*func_data[func_key]['python_indent'][num]}}
                        {{- ' '*(func_data[func_key]['ir_indent'][num][loop.index0]+4)
                        }}{{ir_line }}\x1b[41m{{ir_line_type-}}\x1b[39;49;00m
                    {%- endfor -%}
                {%- endif -%}
            {%- endfor -%}
    {%- endfor -%}
    """)
+ + {%- for num, line, hl, hc in func_data[func_key]['pygments_lines'] -%} + {%- if func_data[func_key]['ir_lines'][num] %} + + {% else -%} + + {%- endif -%} + {%- endfor -%} +
+
+ + + {{num}}: + {{' '*func_data[func_key]['python_indent'][num]}}{{hl}} + + + + + {%- for ir_line, ir_line_type in func_data[func_key]['ir_lines'][num] %} + + + + {%- endfor -%} + +
+   + {{- ' '*func_data[func_key]['python_indent'][num]}} + {{ ' '*func_data[func_key]['ir_indent'][num][loop.index0]}}{{ir_line|e -}} + {{ir_line_type}} + +
+
+
+ + {{num}}: + {{' '*func_data[func_key]['python_indent'][num]}}{{hl}} + +
+
def reform_code(annotation):
    """
    Extract the code from the Numba annotation datastructure.

    Pygments can only highlight full multi-line strings, the Numba
    annotation is list of single lines, with indentation removed.

    ``annotation['python_lines']`` is iterated as ``(line number, text)``
    pairs and ``annotation['python_indent']`` maps a line number to the
    number of spaces to put back in front of that line. Returns the
    re-indented lines as one string, each terminated by a newline.
    """
    ident_dict = annotation['python_indent']
    # join once instead of growing the string line by line
    return ''.join(
        ' ' * ident_dict[lineno] + text + '\n'
        for lineno, text in annotation['python_lines']
    )
+ >>> add(1, 2) + 3 + >>> add(1.3, 5.7) + 7.0 + >>> add.signatures + [(int64, int64), (float64, float64)] + >>> Annotate(add, signature=add.signatures[1]) # annotation for (float64, float64) + """ + def __init__(self, function, signature=None, **kwargs): + + style = kwargs.get('style', 'default') + if not function.signatures: + raise ValueError('function need to be jitted for at least one signature') + ann = function.get_annotation_info(signature=signature) + self.ann = ann + + for k,v in ann.items(): + res = hllines(reform_code(v), style) + rest = htlines(reform_code(v), style) + v['pygments_lines'] = [(a,b,c, d) for (a,b),c, d in zip(v['python_lines'], res, rest)] + + def _repr_html_(self): + return get_html_template().render(func_data=self.ann) + + def __repr__(self): + return get_ansi_template().render(func_data=self.ann) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/annotations/template.html b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/annotations/template.html new file mode 100644 index 0000000000000000000000000000000000000000..73e2f6f855d071bfd54770963dfb741eb700bcd9 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/annotations/template.html @@ -0,0 +1,144 @@ + + + + + + + + + + + {% for func_key in func_data.keys() %} + + {% set loop1 = loop %} + + + +
+ + + {%- for num, line in func_data[func_key]['python_lines'] -%} + {%- if func_data[func_key]['ir_lines'][num] %} + + {% else -%} + + {%- endif -%} + {%- endfor -%} +
+
+ + + {{num}}: + {{func_data[func_key]['python_indent'][num]}}{{line|e}} + + + + + {%- for ir_line, ir_line_type in func_data[func_key]['ir_lines'][num] %} + + + + {%- endfor -%} + +
  + {{- func_data[func_key]['python_indent'][num]}} + {{func_data[func_key]['ir_indent'][num][loop.index0]}}{{ir_line|e -}} + {{ir_line_type}} + +
+
+
+ + {{num}}: + {{func_data[func_key]['python_indent'][num]}}{{line|e}} + +
+
+ +


class SourceLines(Mapping):
    """Read-only mapping from absolute line number to a function's source line.

    Keys are the line numbers ``startno .. startno + len(lines) - 1``.
    Looking up any other line number returns ``''`` rather than raising.
    """

    def __init__(self, func):
        try:
            lines, startno = inspect.getsourcelines(func)
        except (OSError, TypeError):
            # OSError: the source cannot be retrieved.
            # TypeError: ``func`` is an object inspect has no source for
            # (e.g. a builtin). Either way, report it as unavailable.
            self.lines = ()
            self.startno = 0
        else:
            self.lines = textwrap.dedent(''.join(lines)).splitlines()
            self.startno = startno

    def __getitem__(self, lineno):
        index = lineno - self.startno
        # A negative index would wrap around and return a line from the
        # end of the function; treat it as out of range instead.
        if index < 0:
            return ''
        try:
            return self.lines[index].rstrip()
        except IndexError:
            return ''

    def __iter__(self):
        return iter((self.startno + i) for i in range(len(self.lines)))

    def __len__(self):
        return len(self.lines)

    @property
    def avail(self):
        """True when at least one source line was retrieved."""
        return bool(self.lines)
+ func_data = OrderedDict() + + def __init__(self, func_ir, typemap, calltypes, lifted, lifted_from, + args, return_type, html_output=None): + self.func_id = func_ir.func_id + self.blocks = func_ir.blocks + self.typemap = typemap + self.calltypes = calltypes + self.filename = func_ir.loc.filename + self.linenum = str(func_ir.loc.line) + self.signature = str(args) + ' -> ' + str(return_type) + + # lifted loop information + self.lifted = lifted + self.num_lifted_loops = len(lifted) + + # If this is a lifted loop function that is being compiled, lifted_from + # points to annotation data from function that this loop lifted function + # was lifted from. This is used to stick lifted loop annotations back + # into original function. + self.lifted_from = lifted_from + + def prepare_annotations(self): + # Prepare annotations + groupedinst = defaultdict(list) + found_lifted_loop = False + #for blkid, blk in self.blocks.items(): + for blkid in sorted(self.blocks.keys()): + blk = self.blocks[blkid] + groupedinst[blk.loc.line].append("label %s" % blkid) + for inst in blk.body: + lineno = inst.loc.line + + if isinstance(inst, ir.Assign): + if found_lifted_loop: + atype = 'XXX Lifted Loop XXX' + found_lifted_loop = False + elif (isinstance(inst.value, ir.Expr) and + inst.value.op == 'call'): + atype = self.calltypes[inst.value] + elif (isinstance(inst.value, ir.Const) and + isinstance(inst.value.value, numba.core.dispatcher.LiftedLoop)): + atype = 'XXX Lifted Loop XXX' + found_lifted_loop = True + else: + # TODO: fix parfor lowering so that typemap is valid. 
+ atype = self.typemap.get(inst.target.name, "") + + aline = "%s = %s :: %s" % (inst.target, inst.value, atype) + elif isinstance(inst, ir.SetItem): + atype = self.calltypes[inst] + aline = "%s :: %s" % (inst, atype) + else: + aline = "%s" % inst + groupedinst[lineno].append(" %s" % aline) + return groupedinst + + def annotate(self): + source = SourceLines(self.func_id.func) + # if not source.avail: + # return "Source code unavailable" + + groupedinst = self.prepare_annotations() + + # Format annotations + io = StringIO() + with closing(io): + if source.avail: + print("# File: %s" % self.filename, file=io) + for num in source: + srcline = source[num] + ind = _getindent(srcline) + print("%s# --- LINE %d --- " % (ind, num), file=io) + for inst in groupedinst[num]: + print('%s# %s' % (ind, inst), file=io) + print(file=io) + print(srcline, file=io) + print(file=io) + if self.lifted: + print("# The function contains lifted loops", file=io) + for loop in self.lifted: + print("# Loop at line %d" % loop.get_source_location(), + file=io) + print("# Has %d overloads" % len(loop.overloads), + file=io) + for cres in loop.overloads.values(): + print(cres.type_annotation, file=io) + else: + print("# Source code unavailable", file=io) + for num in groupedinst: + for inst in groupedinst[num]: + print('%s' % (inst,), file=io) + print(file=io) + + return io.getvalue() + + def html_annotate(self, outfile): + # ensure that annotation information is assembled + self.annotate_raw() + # make a deep copy ahead of the pending mutations + func_data = copy.deepcopy(self.func_data) + + key = 'python_indent' + for this_func in func_data.values(): + if key in this_func: + idents = {} + for line, amount in this_func[key].items(): + idents[line] = ' ' * amount + this_func[key] = idents + + key = 'ir_indent' + for this_func in func_data.values(): + if key in this_func: + idents = {} + for line, ir_id in this_func[key].items(): + idents[line] = [' ' * amount for amount in ir_id] + this_func[key] = 
idents + + + + try: + from jinja2 import Template + except ImportError: + raise ImportError("please install the 'jinja2' package") + + root = os.path.join(os.path.dirname(__file__)) + template_filename = os.path.join(root, 'template.html') + with open(template_filename, 'r') as template: + html = template.read() + + template = Template(html) + rendered = template.render(func_data=func_data) + outfile.write(rendered) + + def annotate_raw(self): + """ + This returns "raw" annotation information i.e. it has no output format + specific markup included. + """ + python_source = SourceLines(self.func_id.func) + ir_lines = self.prepare_annotations() + line_nums = [num for num in python_source] + lifted_lines = [l.get_source_location() for l in self.lifted] + + def add_ir_line(func_data, line): + line_str = line.strip() + line_type = '' + if line_str.endswith('pyobject'): + line_str = line_str.replace('pyobject', '') + line_type = 'pyobject' + func_data['ir_lines'][num].append((line_str, line_type)) + indent_len = len(_getindent(line)) + func_data['ir_indent'][num].append(indent_len) + + func_key = (self.func_id.filename + ':' + str(self.func_id.firstlineno + 1), + self.signature) + if self.lifted_from is not None and self.lifted_from[1]['num_lifted_loops'] > 0: + # This is a lifted loop function that is being compiled. Get the + # numba ir for lines in loop function to use for annotating + # original python function that the loop was lifted from. + func_data = self.lifted_from[1] + for num in line_nums: + if num not in ir_lines.keys(): + continue + func_data['ir_lines'][num] = [] + func_data['ir_indent'][num] = [] + for line in ir_lines[num]: + add_ir_line(func_data, line) + if line.strip().endswith('pyobject'): + func_data['python_tags'][num] = 'object_tag' + # If any pyobject line is found, make sure original python + # line that was marked as a lifted loop start line is tagged + # as an object line instead. 
Lifted loop start lines should + # only be marked as lifted loop lines if the lifted loop + # was successfully compiled in nopython mode. + func_data['python_tags'][self.lifted_from[0]] = 'object_tag' + + # We're done with this lifted loop, so decrement lifted loop counter. + # When lifted loop counter hits zero, that means we're ready to write + # out annotations to html file. + self.lifted_from[1]['num_lifted_loops'] -= 1 + + elif func_key not in TypeAnnotation.func_data.keys(): + TypeAnnotation.func_data[func_key] = {} + func_data = TypeAnnotation.func_data[func_key] + + for i, loop in enumerate(self.lifted): + # Make sure that when we process each lifted loop function later, + # we'll know where it originally came from. + loop.lifted_from = (lifted_lines[i], func_data) + func_data['num_lifted_loops'] = self.num_lifted_loops + + func_data['filename'] = self.filename + func_data['funcname'] = self.func_id.func_name + func_data['python_lines'] = [] + func_data['python_indent'] = {} + func_data['python_tags'] = {} + func_data['ir_lines'] = {} + func_data['ir_indent'] = {} + + for num in line_nums: + func_data['python_lines'].append((num, python_source[num].strip())) + indent_len = len(_getindent(python_source[num])) + func_data['python_indent'][num] = indent_len + func_data['python_tags'][num] = '' + func_data['ir_lines'][num] = [] + func_data['ir_indent'][num] = [] + + for line in ir_lines[num]: + add_ir_line(func_data, line) + if num in lifted_lines: + func_data['python_tags'][num] = 'lifted_tag' + elif line.strip().endswith('pyobject'): + func_data['python_tags'][num] = 'object_tag' + return self.func_data + + + def __str__(self): + return self.annotate() + + +re_longest_white_prefix = re.compile(r'^\s*') + + +def _getindent(text): + m = re_longest_white_prefix.match(text) + if not m: + return '' + else: + return ' ' * len(m.group(0)) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/base.py 
b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/base.py new file mode 100644 index 0000000000000000000000000000000000000000..9622a3f09dadab87b5ec6f772db1ff0bb20377c0 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/base.py @@ -0,0 +1,1255 @@ +from collections import defaultdict +import copy +import sys +from itertools import permutations, takewhile +from contextlib import contextmanager + +from llvmlite import ir as llvmir +from llvmlite.ir import Constant +import llvmlite.binding as ll + +from numba.core import types, utils, datamodel, debuginfo, funcdesc, config, cgutils, imputils +from numba.core import event, errors, targetconfig +from numba import _dynfunc, _helperlib +from numba.core.compiler_lock import global_compiler_lock +from numba.core.pythonapi import PythonAPI +from numba.core.imputils import (user_function, user_generator, + builtin_registry, impl_ret_borrowed, + RegistryLoader) +from numba.cpython import builtins + +GENERIC_POINTER = llvmir.PointerType(llvmir.IntType(8)) +PYOBJECT = GENERIC_POINTER +void_ptr = GENERIC_POINTER + + +class OverloadSelector(object): + """ + An object matching an actual signature against a registry of formal + signatures and choosing the best candidate, if any. + + In the current implementation: + - a "signature" is a tuple of type classes or type instances + - the "best candidate" is the most specific match + """ + + def __init__(self): + # A list of (formal args tuple, value) + self.versions = [] + self._cache = {} + + def find(self, sig): + out = self._cache.get(sig) + if out is None: + out = self._find(sig) + self._cache[sig] = out + return out + + def _find(self, sig): + candidates = self._select_compatible(sig) + if candidates: + return candidates[self._best_signature(candidates)] + else: + raise errors.NumbaNotImplementedError(f'{self}, {sig}') + + def _select_compatible(self, sig): + """ + Select all compatible signatures and their implementation. 
+ """ + out = {} + for ver_sig, impl in self.versions: + if self._match_arglist(ver_sig, sig): + out[ver_sig] = impl + return out + + def _best_signature(self, candidates): + """ + Returns the best signature out of the candidates + """ + ordered, genericity = self._sort_signatures(candidates) + # check for ambiguous signatures + if len(ordered) > 1: + firstscore = genericity[ordered[0]] + same = list(takewhile(lambda x: genericity[x] == firstscore, + ordered)) + if len(same) > 1: + msg = ["{n} ambiguous signatures".format(n=len(same))] + for sig in same: + msg += ["{0} => {1}".format(sig, candidates[sig])] + raise errors.NumbaTypeError('\n'.join(msg)) + return ordered[0] + + def _sort_signatures(self, candidates): + """ + Sort signatures in ascending level of genericity. + + Returns a 2-tuple: + + * ordered list of signatures + * dictionary containing genericity scores + """ + # score by genericity + genericity = defaultdict(int) + for this, other in permutations(candidates.keys(), r=2): + matched = self._match_arglist(formal_args=this, actual_args=other) + if matched: + # genericity score +1 for every another compatible signature + genericity[this] += 1 + # order candidates in ascending level of genericity + ordered = sorted(candidates.keys(), key=lambda x: genericity[x]) + return ordered, genericity + + def _match_arglist(self, formal_args, actual_args): + """ + Returns True if the signature is "matching". + A formal signature is "matching" if the actual signature matches exactly + or if the formal signature is a compatible generic signature. 
+ """ + # normalize VarArg + if formal_args and isinstance(formal_args[-1], types.VarArg): + ndiff = len(actual_args) - len(formal_args) + 1 + formal_args = formal_args[:-1] + (formal_args[-1].dtype,) * ndiff + + if len(formal_args) != len(actual_args): + return False + + for formal, actual in zip(formal_args, actual_args): + if not self._match(formal, actual): + return False + + return True + + def _match(self, formal, actual): + if formal == actual: + # formal argument matches actual arguments + return True + elif types.Any == formal: + # formal argument is any + return True + elif isinstance(formal, type) and issubclass(formal, types.Type): + if isinstance(actual, type) and issubclass(actual, formal): + # formal arg is a type class and actual arg is a subclass + return True + elif isinstance(actual, formal): + # formal arg is a type class of which actual arg is an instance + return True + + def append(self, value, sig): + """ + Add a formal signature and its associated value. + """ + assert isinstance(sig, tuple), (value, sig) + self.versions.append((sig, value)) + self._cache.clear() + + +@utils.runonce +def _load_global_helpers(): + """ + Execute once to install special symbols into the LLVM symbol table. 
+ """ + # This is Py_None's real C name + ll.add_symbol("_Py_NoneStruct", id(None)) + + # Add Numba C helper functions + for c_helpers in (_helperlib.c_helpers, _dynfunc.c_helpers): + for py_name, c_address in c_helpers.items(): + c_name = "numba_" + py_name + ll.add_symbol(c_name, c_address) + + # Add Numpy C helpers (npy_XXX) + for c_name, c_address in _helperlib.npymath_exports.items(): + ll.add_symbol(c_name, c_address) + + # Add all built-in exception classes + for obj in utils.builtins.__dict__.values(): + if isinstance(obj, type) and issubclass(obj, BaseException): + ll.add_symbol("PyExc_%s" % (obj.__name__), id(obj)) + + +class BaseContext(object): + """ + + Notes on Structure + ------------------ + + Most objects are lowered as plain-old-data structure in the generated + llvm. They are passed around by reference (a pointer to the structure). + Only POD structure can live across function boundaries by copying the + data. + """ + # True if the target requires strict alignment + # Causes exception to be raised if the record members are not aligned. 
+ strict_alignment = False + + # Force powi implementation as math.pow call + implement_powi_as_math_call = False + implement_pow_as_math_call = False + + # Emit Debug info + enable_debuginfo = False + DIBuilder = debuginfo.DIBuilder + + # Bound checking + @property + def enable_boundscheck(self): + if config.BOUNDSCHECK is not None: + return config.BOUNDSCHECK + return self._boundscheck + + @enable_boundscheck.setter + def enable_boundscheck(self, value): + self._boundscheck = value + + # NRT + enable_nrt = False + + # Auto parallelization + auto_parallel = False + + # PYCC + aot_mode = False + + # Error model for various operations (only FP exceptions currently) + error_model = None + + # Whether dynamic globals (CPU runtime addresses) is allowed + allow_dynamic_globals = False + + # Fast math flags + fastmath = False + + # python execution environment + environment = None + + # the function descriptor + fndesc = None + + def __init__(self, typing_context, target): + _load_global_helpers() + + self.address_size = utils.MACHINE_BITS + self.typing_context = typing_context + from numba.core.target_extension import target_registry + self.target_name = target + self.target = target_registry[target] + + # A mapping of installed registries to their loaders + self._registries = {} + # Declarations loaded from registries and other sources + self._defns = defaultdict(OverloadSelector) + self._getattrs = defaultdict(OverloadSelector) + self._setattrs = defaultdict(OverloadSelector) + self._casts = OverloadSelector() + self._get_constants = OverloadSelector() + # Other declarations + self._generators = {} + self.special_ops = {} + self.cached_internal_func = {} + self._pid = None + self._codelib_stack = [] + + self._boundscheck = False + + self.data_model_manager = datamodel.default_manager + + # Initialize + self.init() + + def init(self): + """ + For subclasses to add initializer + """ + + def refresh(self): + """ + Refresh context with new declarations from known 
registries. + Useful for third-party extensions. + """ + # load target specific registries + self.load_additional_registries() + + # Populate the builtin registry, this has to happen after loading + # additional registries as some of the "additional" registries write + # their implementations into the builtin_registry and would be missed if + # this ran first. + self.install_registry(builtin_registry) + + # Also refresh typing context, since @overload declarations can + # affect it. + self.typing_context.refresh() + + def load_additional_registries(self): + """ + Load target-specific registries. Can be overridden by subclasses. + """ + + def mangler(self, name, types, *, abi_tags=(), uid=None): + """ + Perform name mangling. + """ + return funcdesc.default_mangler(name, types, abi_tags=abi_tags, uid=uid) + + def get_env_name(self, fndesc): + """Get the environment name given a FunctionDescriptor. + + Use this instead of the ``fndesc.env_name`` so that the target-context + can provide necessary mangling of the symbol to meet ABI requirements. + """ + return fndesc.env_name + + def declare_env_global(self, module, envname): + """Declare the Environment pointer as a global of the module. + + The pointer is initialized to NULL. It must be filled by the runtime + with the actual address of the Env before the associated function + can be executed. + + Parameters + ---------- + module : + The LLVM Module + envname : str + The name of the global variable. 
+ """ + if envname not in module.globals: + gv = llvmir.GlobalVariable(module, cgutils.voidptr_t, name=envname) + gv.linkage = 'common' + gv.initializer = cgutils.get_null_value(gv.type.pointee) + + return module.globals[envname] + + def get_arg_packer(self, fe_args): + return datamodel.ArgPacker(self.data_model_manager, fe_args) + + def get_data_packer(self, fe_types): + return datamodel.DataPacker(self.data_model_manager, fe_types) + + @property + def target_data(self): + raise NotImplementedError + + @utils.cached_property + def nonconst_module_attrs(self): + """ + All module attrs are constant for targets using BaseContext. + """ + return tuple() + + @utils.cached_property + def nrt(self): + from numba.core.runtime.context import NRTContext + return NRTContext(self, self.enable_nrt) + + def subtarget(self, **kws): + obj = copy.copy(self) # shallow copy + for k, v in kws.items(): + if not hasattr(obj, k): + raise NameError("unknown option {0!r}".format(k)) + setattr(obj, k, v) + if obj.codegen() is not self.codegen(): + # We can't share functions across different codegens + obj.cached_internal_func = {} + return obj + + def install_registry(self, registry): + """ + Install a *registry* (a imputils.Registry instance) of function + and attribute implementations. 
+ """ + try: + loader = self._registries[registry] + except KeyError: + loader = RegistryLoader(registry) + self._registries[registry] = loader + self.insert_func_defn(loader.new_registrations('functions')) + self._insert_getattr_defn(loader.new_registrations('getattrs')) + self._insert_setattr_defn(loader.new_registrations('setattrs')) + self._insert_cast_defn(loader.new_registrations('casts')) + self._insert_get_constant_defn(loader.new_registrations('constants')) + + def insert_func_defn(self, defns): + for impl, func, sig in defns: + self._defns[func].append(impl, sig) + + def _insert_getattr_defn(self, defns): + for impl, attr, sig in defns: + self._getattrs[attr].append(impl, sig) + + def _insert_setattr_defn(self, defns): + for impl, attr, sig in defns: + self._setattrs[attr].append(impl, sig) + + def _insert_cast_defn(self, defns): + for impl, sig in defns: + self._casts.append(impl, sig) + + def _insert_get_constant_defn(self, defns): + for impl, sig in defns: + self._get_constants.append(impl, sig) + + def insert_user_function(self, func, fndesc, libs=()): + impl = user_function(fndesc, libs) + self._defns[func].append(impl, impl.signature) + + def insert_generator(self, genty, gendesc, libs=()): + assert isinstance(genty, types.Generator) + impl = user_generator(gendesc, libs) + self._generators[genty] = gendesc, impl + + def remove_user_function(self, func): + """ + Remove user function *func*. + KeyError is raised if the function isn't known to us. 
+ """ + del self._defns[func] + + def get_external_function_type(self, fndesc): + argtypes = [self.get_argument_type(aty) + for aty in fndesc.argtypes] + # don't wrap in pointer + restype = self.get_argument_type(fndesc.restype) + fnty = llvmir.FunctionType(restype, argtypes) + return fnty + + def declare_function(self, module, fndesc): + fnty = self.call_conv.get_function_type(fndesc.restype, fndesc.argtypes) + fn = cgutils.get_or_insert_function(module, fnty, fndesc.mangled_name) + self.call_conv.decorate_function(fn, fndesc.args, fndesc.argtypes, noalias=fndesc.noalias) + if fndesc.inline: + fn.attributes.add('alwaysinline') + # alwaysinline overrides optnone + fn.attributes.discard('noinline') + fn.attributes.discard('optnone') + return fn + + def declare_external_function(self, module, fndesc): + fnty = self.get_external_function_type(fndesc) + fn = cgutils.get_or_insert_function(module, fnty, fndesc.mangled_name) + assert fn.is_declaration + for ak, av in zip(fndesc.args, fn.args): + av.name = "arg.%s" % ak + return fn + + def insert_const_string(self, mod, string): + """ + Insert constant *string* (a str object) into module *mod*. + """ + stringtype = GENERIC_POINTER + name = ".const.%s" % string + text = cgutils.make_bytearray(string.encode("utf-8") + b"\x00") + gv = self.insert_unique_const(mod, name, text) + return Constant.bitcast(gv, stringtype) + + def insert_const_bytes(self, mod, bytes, name=None): + """ + Insert constant *byte* (a `bytes` object) into module *mod*. + """ + stringtype = GENERIC_POINTER + name = ".bytes.%s" % (name or hash(bytes)) + text = cgutils.make_bytearray(bytes) + gv = self.insert_unique_const(mod, name, text) + return Constant.bitcast(gv, stringtype) + + def insert_unique_const(self, mod, name, val): + """ + Insert a unique internal constant named *name*, with LLVM value + *val*, into module *mod*. 
+ """ + try: + gv = mod.get_global(name) + except KeyError: + return cgutils.global_constant(mod, name, val) + else: + return gv + + def get_argument_type(self, ty): + return self.data_model_manager[ty].get_argument_type() + + def get_return_type(self, ty): + return self.data_model_manager[ty].get_return_type() + + def get_data_type(self, ty): + """ + Get a LLVM data representation of the Numba type *ty* that is safe + for storage. Record data are stored as byte array. + + The return value is a llvmlite.ir.Type object, or None if the type + is an opaque pointer (???). + """ + return self.data_model_manager[ty].get_data_type() + + def get_value_type(self, ty): + return self.data_model_manager[ty].get_value_type() + + def pack_value(self, builder, ty, value, ptr, align=None): + """ + Pack value into the array storage at *ptr*. + If *align* is given, it is the guaranteed alignment for *ptr* + (by default, the standard ABI alignment). + """ + dataval = self.data_model_manager[ty].as_data(builder, value) + builder.store(dataval, ptr, align=align) + + def unpack_value(self, builder, ty, ptr, align=None): + """ + Unpack value from the array storage at *ptr*. + If *align* is given, it is the guaranteed alignment for *ptr* + (by default, the standard ABI alignment). + """ + dm = self.data_model_manager[ty] + return dm.load_from_data_pointer(builder, ptr, align) + + def get_constant_generic(self, builder, ty, val): + """ + Return a LLVM constant representing value *val* of Numba type *ty*. + """ + try: + impl = self._get_constants.find((ty,)) + return impl(self, builder, ty, val) + except NotImplementedError: + raise NotImplementedError("Cannot lower constant of type '%s'" % (ty,)) + + def get_constant(self, ty, val): + """ + Same as get_constant_generic(), but without specifying *builder*. + Works only for simple types. 
+ """ + # HACK: pass builder=None to preserve get_constant() API + return self.get_constant_generic(None, ty, val) + + def get_constant_undef(self, ty): + lty = self.get_value_type(ty) + return Constant(lty, llvmir.Undefined) + + def get_constant_null(self, ty): + lty = self.get_value_type(ty) + return Constant(lty, None) + + def get_function(self, fn, sig, _firstcall=True): + """ + Return the implementation of function *fn* for signature *sig*. + The return value is a callable with the signature (builder, args). + """ + assert sig is not None + sig = sig.as_function() + if isinstance(fn, types.Callable): + key = fn.get_impl_key(sig) + overloads = self._defns[key] + else: + key = fn + overloads = self._defns[key] + + try: + return _wrap_impl(overloads.find(sig.args), self, sig) + except errors.NumbaNotImplementedError: + pass + if isinstance(fn, types.Type): + # It's a type instance => try to find a definition for the type class + try: + return self.get_function(type(fn), sig) + except NotImplementedError: + # Raise exception for the type instance, for a better error message + pass + + # Automatically refresh the context to load new registries if we are + # calling the first time. + if _firstcall: + self.refresh() + return self.get_function(fn, sig, _firstcall=False) + + raise NotImplementedError("No definition for lowering %s%s" % (key, sig)) + + def get_generator_desc(self, genty): + """ + """ + return self._generators[genty][0] + + def get_generator_impl(self, genty): + """ + """ + res = self._generators[genty][1] + self.add_linking_libs(getattr(res, 'libs', ())) + return res + + def get_bound_function(self, builder, obj, ty): + assert self.get_value_type(ty) == obj.type + return obj + + def get_getattr(self, typ, attr): + """ + Get the getattr() implementation for the given type and attribute name. + The return value is a callable with the signature + (context, builder, typ, val, attr). 
+ """ + const_attr = (typ, attr) not in self.nonconst_module_attrs + is_module = isinstance(typ, types.Module) + if is_module and const_attr: + # Implement getattr for module-level globals that we treat as + # constants. + # XXX We shouldn't have to retype this + attrty = self.typing_context.resolve_module_constants(typ, attr) + if attrty is None or isinstance(attrty, types.Dummy): + # No implementation required for dummies (functions, modules...), + # which are dealt with later + return None + else: + pyval = getattr(typ.pymod, attr) + def imp(context, builder, typ, val, attr): + llval = self.get_constant_generic(builder, attrty, pyval) + return impl_ret_borrowed(context, builder, attrty, llval) + return imp + + # Lookup specific getattr implementation for this type and attribute + overloads = self._getattrs[attr] + try: + return overloads.find((typ,)) + except errors.NumbaNotImplementedError: + pass + # Lookup generic getattr implementation for this type + overloads = self._getattrs[None] + try: + return overloads.find((typ,)) + except errors.NumbaNotImplementedError: + pass + + raise NotImplementedError("No definition for lowering %s.%s" % (typ, attr)) + + def get_setattr(self, attr, sig): + """ + Get the setattr() implementation for the given attribute name + and signature. + The return value is a callable with the signature (builder, args). 
+ """ + assert len(sig.args) == 2 + typ = sig.args[0] + valty = sig.args[1] + + def wrap_setattr(impl): + def wrapped(builder, args): + return impl(self, builder, sig, args, attr) + return wrapped + + # Lookup specific setattr implementation for this type and attribute + overloads = self._setattrs[attr] + try: + return wrap_setattr(overloads.find((typ, valty))) + except errors.NumbaNotImplementedError: + pass + # Lookup generic setattr implementation for this type + overloads = self._setattrs[None] + try: + return wrap_setattr(overloads.find((typ, valty))) + except errors.NumbaNotImplementedError: + pass + + raise NotImplementedError("No definition for lowering %s.%s = %s" + % (typ, attr, valty)) + + def get_argument_value(self, builder, ty, val): + """ + Argument representation to local value representation + """ + return self.data_model_manager[ty].from_argument(builder, val) + + def get_returned_value(self, builder, ty, val): + """ + Return value representation to local value representation + """ + return self.data_model_manager[ty].from_return(builder, val) + + def get_return_value(self, builder, ty, val): + """ + Local value representation to return type representation + """ + return self.data_model_manager[ty].as_return(builder, val) + + def get_value_as_argument(self, builder, ty, val): + """Prepare local value representation as argument type representation + """ + return self.data_model_manager[ty].as_argument(builder, val) + + def get_value_as_data(self, builder, ty, val): + return self.data_model_manager[ty].as_data(builder, val) + + def get_data_as_value(self, builder, ty, val): + return self.data_model_manager[ty].from_data(builder, val) + + def pair_first(self, builder, val, ty): + """ + Extract the first element of a heterogeneous pair. + """ + pair = self.make_helper(builder, ty, val) + return pair.first + + def pair_second(self, builder, val, ty): + """ + Extract the second element of a heterogeneous pair. 
+ """ + pair = self.make_helper(builder, ty, val) + return pair.second + + def cast(self, builder, val, fromty, toty): + """ + Cast a value of type *fromty* to type *toty*. + This implements implicit conversions as can happen due to the + granularity of the Numba type system, or lax Python semantics. + """ + if fromty == toty or toty == types.Any: + return val + try: + impl = self._casts.find((fromty, toty)) + return impl(self, builder, fromty, toty, val) + except errors.NumbaNotImplementedError: + raise errors.NumbaNotImplementedError( + "Cannot cast %s to %s: %s" % (fromty, toty, val)) + + def generic_compare(self, builder, key, argtypes, args): + """ + Compare the given LLVM values of the given Numba types using + the comparison *key* (e.g. '=='). The values are first cast to + a common safe conversion type. + """ + at, bt = argtypes + av, bv = args + ty = self.typing_context.unify_types(at, bt) + assert ty is not None + cav = self.cast(builder, av, at, ty) + cbv = self.cast(builder, bv, bt, ty) + fnty = self.typing_context.resolve_value_type(key) + # the sig is homogeneous in the unified casted type + cmpsig = fnty.get_call_type(self.typing_context, (ty, ty), {}) + cmpfunc = self.get_function(fnty, cmpsig) + self.add_linking_libs(getattr(cmpfunc, 'libs', ())) + return cmpfunc(builder, (cav, cbv)) + + def make_optional_none(self, builder, valtype): + optval = self.make_helper(builder, types.Optional(valtype)) + optval.valid = cgutils.false_bit + return optval._getvalue() + + def make_optional_value(self, builder, valtype, value): + optval = self.make_helper(builder, types.Optional(valtype)) + optval.valid = cgutils.true_bit + optval.data = value + return optval._getvalue() + + def is_true(self, builder, typ, val): + """ + Return the truth value of a value of the given Numba type. 
+ """ + fnty = self.typing_context.resolve_value_type(bool) + sig = fnty.get_call_type(self.typing_context, (typ,), {}) + impl = self.get_function(fnty, sig) + return impl(builder, (val,)) + + def get_c_value(self, builder, typ, name, dllimport=False): + """ + Get a global value through its C-accessible *name*, with the given + LLVM type. + If *dllimport* is true, the symbol will be marked as imported + from a DLL (necessary for AOT compilation under Windows). + """ + module = builder.function.module + try: + gv = module.globals[name] + except KeyError: + gv = cgutils.add_global_variable(module, typ, name) + if dllimport and self.aot_mode and sys.platform == 'win32': + gv.storage_class = "dllimport" + return gv + + def call_external_function(self, builder, callee, argtys, args): + args = [self.get_value_as_argument(builder, ty, arg) + for ty, arg in zip(argtys, args)] + retval = builder.call(callee, args) + return retval + + def get_function_pointer_type(self, typ): + return self.data_model_manager[typ].get_data_type() + + def call_function_pointer(self, builder, funcptr, args, cconv=None): + return builder.call(funcptr, args, cconv=cconv) + + def print_string(self, builder, text): + mod = builder.module + cstring = GENERIC_POINTER + fnty = llvmir.FunctionType(llvmir.IntType(32), [cstring]) + puts = cgutils.get_or_insert_function(mod, fnty, "puts") + return builder.call(puts, [text]) + + def debug_print(self, builder, text): + mod = builder.module + cstr = self.insert_const_string(mod, str(text)) + self.print_string(builder, cstr) + + def printf(self, builder, format_string, *args): + mod = builder.module + if isinstance(format_string, str): + cstr = self.insert_const_string(mod, format_string) + else: + cstr = format_string + fnty = llvmir.FunctionType(llvmir.IntType(32), (GENERIC_POINTER,), var_arg=True) + fn = cgutils.get_or_insert_function(mod, fnty, "printf") + return builder.call(fn, (cstr,) + tuple(args)) + + def get_struct_type(self, struct): + """ + Get 
the LLVM struct type for the given Structure class *struct*. + """ + fields = [self.get_value_type(v) for _, v in struct._fields] + return llvmir.LiteralStructType(fields) + + def get_dummy_value(self): + return Constant(self.get_dummy_type(), None) + + def get_dummy_type(self): + return GENERIC_POINTER + + def _compile_subroutine_no_cache(self, builder, impl, sig, locals={}, + flags=None): + """ + Invoke the compiler to compile a function to be used inside a + nopython function, but without generating code to call that + function. + + Note this context's flags are not inherited. + """ + # Compile + from numba.core import compiler + + with global_compiler_lock: + codegen = self.codegen() + library = codegen.create_library(impl.__name__) + if flags is None: + + cstk = targetconfig.ConfigStack() + flags = compiler.Flags() + if cstk: + tls_flags = cstk.top() + if tls_flags.is_set("nrt") and tls_flags.nrt: + flags.nrt = True + + flags.no_compile = True + flags.no_cpython_wrapper = True + flags.no_cfunc_wrapper = True + + cres = compiler.compile_internal(self.typing_context, self, + library, + impl, sig.args, + sig.return_type, flags, + locals=locals) + + # Allow inlining the function inside callers. + self.active_code_library.add_linking_library(cres.library) + return cres + + def compile_subroutine(self, builder, impl, sig, locals={}, flags=None, + caching=True): + """ + Compile the function *impl* for the given *sig* (in nopython mode). + Return an instance of CompileResult. + + If *caching* evaluates True, the function keeps the compiled function + for reuse in *.cached_internal_func*. + """ + cache_key = (impl.__code__, sig, type(self.error_model)) + if not caching: + cached = None + else: + if impl.__closure__: + # XXX This obviously won't work if a cell's value is + # unhashable. 
+ cache_key += tuple(c.cell_contents for c in impl.__closure__) + cached = self.cached_internal_func.get(cache_key) + if cached is None: + cres = self._compile_subroutine_no_cache(builder, impl, sig, + locals=locals, + flags=flags) + self.cached_internal_func[cache_key] = cres + + cres = self.cached_internal_func[cache_key] + # Allow inlining the function inside callers. + self.active_code_library.add_linking_library(cres.library) + return cres + + def compile_internal(self, builder, impl, sig, args, locals={}): + """ + Like compile_subroutine(), but also call the function with the given + *args*. + """ + cres = self.compile_subroutine(builder, impl, sig, locals) + return self.call_internal(builder, cres.fndesc, sig, args) + + def call_internal(self, builder, fndesc, sig, args): + """ + Given the function descriptor of an internally compiled function, + emit a call to that function with the given arguments. + """ + status, res = self.call_internal_no_propagate(builder, fndesc, sig, args) + with cgutils.if_unlikely(builder, status.is_error): + self.call_conv.return_status_propagate(builder, status) + + res = imputils.fix_returning_optional(self, builder, sig, status, res) + return res + + def call_internal_no_propagate(self, builder, fndesc, sig, args): + """Similar to `.call_internal()` but does not handle or propagate + the return status automatically. + """ + # Add call to the generated function + llvm_mod = builder.module + fn = self.declare_function(llvm_mod, fndesc) + status, res = self.call_conv.call_function(builder, fn, sig.return_type, + sig.args, args) + return status, res + + def call_unresolved(self, builder, name, sig, args): + """ + Insert a function call to an unresolved symbol with the given *name*. + + Note: this is used for recursive call. + + In the mutual recursion case:: + + @njit + def foo(): + ... # calls bar() + + @njit + def bar(): + ... 
# calls foo() + + foo() + + When foo() is called, the compilation of bar() is fully completed + (codegen'ed and loaded) before foo() is. Since MCJIT's eager compilation + doesn't allow loading modules with declare-only functions (which is + needed for foo() in bar()), the call_unresolved injects a global + variable that the "linker" can update even after the module is loaded by + MCJIT. The linker would allocate space for the global variable before + the bar() module is loaded. When later foo() module is defined, it will + update bar()'s reference to foo(). + + The legacy lazy JIT and the new ORC JIT would allow a declare-only + function be used in a module as long as it is defined by the time of its + first use. + """ + # Insert an unresolved reference to the function being called. + codegen = self.codegen() + fnty = self.call_conv.get_function_type(sig.return_type, sig.args) + fn = codegen.insert_unresolved_ref(builder, fnty, name) + # Normal call sequence + status, res = self.call_conv.call_function(builder, fn, sig.return_type, + sig.args, args) + with cgutils.if_unlikely(builder, status.is_error): + self.call_conv.return_status_propagate(builder, status) + + res = imputils.fix_returning_optional(self, builder, sig, status, res) + return res + + def get_executable(self, func, fndesc, env): + raise NotImplementedError + + def get_python_api(self, builder): + return PythonAPI(self, builder) + + def sentry_record_alignment(self, rectyp, attr): + """ + Assumes offset starts from a properly aligned location + """ + if self.strict_alignment: + offset = rectyp.offset(attr) + elemty = rectyp.typeof(attr) + if isinstance(elemty, types.NestedArray): + # For a NestedArray we need to consider the data type of + # elements of the array for alignment, not the array structure + # itself + elemty = elemty.dtype + align = self.get_abi_alignment(self.get_data_type(elemty)) + if offset % align: + msg = "{rec}.{attr} of type {type} is not aligned".format( + rec=rectyp, attr=attr, 
type=elemty) + raise TypeError(msg) + + def get_helper_class(self, typ, kind='value'): + """ + Get a helper class for the given *typ*. + """ + # XXX handle all types: complex, array, etc. + # XXX should it be a method on the model instead? this would allow a default kind... + return cgutils.create_struct_proxy(typ, kind) + + def _make_helper(self, builder, typ, value=None, ref=None, kind='value'): + cls = self.get_helper_class(typ, kind) + return cls(self, builder, value=value, ref=ref) + + def make_helper(self, builder, typ, value=None, ref=None): + """ + Get a helper object to access the *typ*'s members, + for the given value or reference. + """ + return self._make_helper(builder, typ, value, ref, kind='value') + + def make_data_helper(self, builder, typ, ref=None): + """ + As make_helper(), but considers the value as stored in memory, + rather than a live value. + """ + return self._make_helper(builder, typ, ref=ref, kind='data') + + def make_array(self, typ): + from numba.np import arrayobj + return arrayobj.make_array(typ) + + def populate_array(self, arr, **kwargs): + """ + Populate array structure. + """ + from numba.np import arrayobj + return arrayobj.populate_array(arr, **kwargs) + + def make_complex(self, builder, typ, value=None): + """ + Get a helper object to access the given complex numbers' members. + """ + assert isinstance(typ, types.Complex), typ + return self.make_helper(builder, typ, value) + + def make_tuple(self, builder, typ, values): + """ + Create a tuple of the given *typ* containing the *values*. + """ + tup = self.get_constant_undef(typ) + for i, val in enumerate(values): + tup = builder.insert_value(tup, val, i) + return tup + + def make_constant_array(self, builder, typ, ary): + """ + Create an array structure reifying the given constant array. + A low-level contiguous array constant is created in the LLVM IR. 
+ """ + datatype = self.get_data_type(typ.dtype) + # don't freeze ary of non-contig or bigger than 1MB + size_limit = 10**6 + + if (self.allow_dynamic_globals and + (typ.layout not in 'FC' or ary.nbytes > size_limit)): + # get pointer from the ary + dataptr = ary.ctypes.data + data = self.add_dynamic_addr(builder, dataptr, info=str(type(dataptr))) + rt_addr = self.add_dynamic_addr(builder, id(ary), info=str(type(ary))) + else: + # Handle data: reify the flattened array in "C" or "F" order as a + # global array of bytes. + flat = ary.flatten(order=typ.layout) + # Note: we use `bytearray(flat.data)` instead of `bytearray(flat)` to + # workaround issue #1850 which is due to numpy issue #3147 + consts = cgutils.create_constant_array(llvmir.IntType(8), bytearray(flat.data)) + data = cgutils.global_constant(builder, ".const.array.data", consts) + # Ensure correct data alignment (issue #1933) + data.align = self.get_abi_alignment(datatype) + # No reference to parent ndarray + rt_addr = None + + # Handle shape + llintp = self.get_value_type(types.intp) + shapevals = [self.get_constant(types.intp, s) for s in ary.shape] + cshape = cgutils.create_constant_array(llintp, shapevals) + + # Handle strides + stridevals = [self.get_constant(types.intp, s) for s in ary.strides] + cstrides = cgutils.create_constant_array(llintp, stridevals) + + # Create array structure + cary = self.make_array(typ)(self, builder) + + intp_itemsize = self.get_constant(types.intp, ary.dtype.itemsize) + self.populate_array(cary, + data=builder.bitcast(data, cary.data.type), + shape=cshape, + strides=cstrides, + itemsize=intp_itemsize, + parent=rt_addr, + meminfo=None) + + return cary._getvalue() + + def add_dynamic_addr(self, builder, intaddr, info): + """ + Returns dynamic address as a void pointer `i8*`. + + Internally, a global variable is added to inform the lowerer about + the usage of dynamic addresses. Caching will be disabled. 
+ """ + assert self.allow_dynamic_globals, "dyn globals disabled in this target" + assert isinstance(intaddr, int), 'dyn addr not of int type' + mod = builder.module + llvoidptr = self.get_value_type(types.voidptr) + addr = self.get_constant(types.uintp, intaddr).inttoptr(llvoidptr) + # Use a unique name by embedding the address value + symname = 'numba.dynamic.globals.{:x}'.format(intaddr) + gv = cgutils.add_global_variable(mod, llvoidptr, symname) + # Use linkonce linkage to allow merging with other GV of the same name. + # And, avoid optimization from assuming its value. + gv.linkage = 'linkonce' + gv.initializer = addr + return builder.load(gv) + + def get_abi_sizeof(self, ty): + """ + Get the ABI size of LLVM type *ty*. + """ + assert isinstance(ty, llvmir.Type), "Expected LLVM type" + return ty.get_abi_size(self.target_data) + + def get_abi_alignment(self, ty): + """ + Get the ABI alignment of LLVM type *ty*. + """ + assert isinstance(ty, llvmir.Type), "Expected LLVM type" + return ty.get_abi_alignment(self.target_data) + + def get_preferred_array_alignment(context, ty): + """ + Get preferred array alignment for Numba type *ty*. + """ + # AVX prefers 32-byte alignment + return 32 + + def post_lowering(self, mod, library): + """Run target specific post-lowering transformation here. + """ + + def create_module(self, name): + """Create a LLVM module + + The default implementation in BaseContext always raises a + ``NotImplementedError`` exception. Subclasses should implement + this method. + """ + raise NotImplementedError + + @property + def active_code_library(self): + """Get the active code library + """ + return self._codelib_stack[-1] + + @contextmanager + def push_code_library(self, lib): + """Push the active code library for the context + """ + self._codelib_stack.append(lib) + try: + yield + finally: + self._codelib_stack.pop() + + def add_linking_libs(self, libs): + """Add iterable of linking libraries to the *active_code_library*. 
+ """ + colib = self.active_code_library + for lib in libs: + colib.add_linking_library(lib) + + def get_ufunc_info(self, ufunc_key): + """Get the ufunc implementation for a given ufunc object. + + The default implementation in BaseContext always raises a + ``NotImplementedError`` exception. Subclasses may raise ``KeyError`` + to signal that the given ``ufunc_key`` is not available. + + Parameters + ---------- + ufunc_key : NumPy ufunc + + Returns + ------- + res : dict[str, callable] + A mapping of a NumPy ufunc type signature to a lower-level + implementation. + """ + raise NotImplementedError(f"{self} does not support ufunc") + +class _wrap_impl(object): + """ + A wrapper object to call an implementation function with some predefined + (context, signature) arguments. + The wrapper also forwards attribute queries, which is important. + """ + + def __init__(self, imp, context, sig): + self._callable = _wrap_missing_loc(imp) + self._imp = self._callable() + self._context = context + self._sig = sig + + def __call__(self, builder, args, loc=None): + res = self._imp(self._context, builder, self._sig, args, loc=loc) + self._context.add_linking_libs(getattr(self, 'libs', ())) + return res + + def __getattr__(self, item): + return getattr(self._imp, item) + + def __repr__(self): + return "" % repr(self._callable) + +def _has_loc(fn): + """Does function *fn* take ``loc`` argument? + """ + sig = utils.pysignature(fn) + return 'loc' in sig.parameters + + +class _wrap_missing_loc(object): + + def __init__(self, fn): + self.func = fn # store this to help with debug + + def __call__(self): + """Wrap function for missing ``loc`` keyword argument. + Otherwise, return the original *fn*. + """ + fn = self.func + if not _has_loc(fn): + def wrapper(*args, **kwargs): + kwargs.pop('loc') # drop unused loc + return fn(*args, **kwargs) + + # Copy the following attributes from the wrapped. 
+ # Following similar implementation as functools.wraps but + # ignore attributes if not available (i.e fix py2.7) + attrs = '__name__', 'libs' + for attr in attrs: + try: + val = getattr(fn, attr) + except AttributeError: + pass + else: + setattr(wrapper, attr, val) + + return wrapper + else: + return fn + + def __repr__(self): + return "" % self.func + + +@utils.runonce +def _initialize_llvm_lock_event(): + """Initial event triggers for LLVM lock + """ + def enter_fn(): + event.start_event("numba:llvm_lock") + + def exit_fn(): + event.end_event("numba:llvm_lock") + + ll.ffi.register_lock_callback(enter_fn, exit_fn) + + +_initialize_llvm_lock_event() diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/boxing.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/boxing.py new file mode 100644 index 0000000000000000000000000000000000000000..011d3a87b360705c629c719d0397fdd2d45603eb --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/boxing.py @@ -0,0 +1,1317 @@ +""" +Boxing and unboxing of native Numba values to / from CPython objects. 
+""" + +from llvmlite import ir + +from numba.core import types, cgutils +from numba.core.pythonapi import box, unbox, reflect, NativeValue +from numba.core.errors import NumbaNotImplementedError +from numba.core.typing.typeof import typeof, Purpose + +from numba.cpython import setobj, listobj +from numba.np import numpy_support +from contextlib import contextmanager, ExitStack + + +# +# Scalar types +# + +@box(types.Boolean) +def box_bool(typ, val, c): + return c.pyapi.bool_from_bool(val) + +@unbox(types.Boolean) +def unbox_boolean(typ, obj, c): + istrue = c.pyapi.object_istrue(obj) + zero = ir.Constant(istrue.type, 0) + val = c.builder.icmp_signed('!=', istrue, zero) + return NativeValue(val, is_error=c.pyapi.c_api_error()) + + +@box(types.IntegerLiteral) +@box(types.BooleanLiteral) +def box_literal_integer(typ, val, c): + val = c.context.cast(c.builder, val, typ, typ.literal_type) + return c.box(typ.literal_type, val) + + +@box(types.Integer) +def box_integer(typ, val, c): + if typ.signed: + ival = c.builder.sext(val, c.pyapi.longlong) + return c.pyapi.long_from_longlong(ival) + else: + ullval = c.builder.zext(val, c.pyapi.ulonglong) + return c.pyapi.long_from_ulonglong(ullval) + +@unbox(types.Integer) +def unbox_integer(typ, obj, c): + ll_type = c.context.get_argument_type(typ) + val = cgutils.alloca_once(c.builder, ll_type) + longobj = c.pyapi.number_long(obj) + with c.pyapi.if_object_ok(longobj): + if typ.signed: + llval = c.pyapi.long_as_longlong(longobj) + else: + llval = c.pyapi.long_as_ulonglong(longobj) + c.pyapi.decref(longobj) + c.builder.store(c.builder.trunc(llval, ll_type), val) + return NativeValue(c.builder.load(val), + is_error=c.pyapi.c_api_error()) + + +@box(types.Float) +def box_float(typ, val, c): + if typ == types.float32: + dbval = c.builder.fpext(val, c.pyapi.double) + else: + assert typ == types.float64 + dbval = val + return c.pyapi.float_from_double(dbval) + +@unbox(types.Float) +def unbox_float(typ, obj, c): + fobj = 
c.pyapi.number_float(obj) + dbval = c.pyapi.float_as_double(fobj) + c.pyapi.decref(fobj) + if typ == types.float32: + val = c.builder.fptrunc(dbval, + c.context.get_argument_type(typ)) + else: + assert typ == types.float64 + val = dbval + return NativeValue(val, is_error=c.pyapi.c_api_error()) + + +@box(types.Complex) +def box_complex(typ, val, c): + cval = c.context.make_complex(c.builder, typ, value=val) + + if typ == types.complex64: + freal = c.builder.fpext(cval.real, c.pyapi.double) + fimag = c.builder.fpext(cval.imag, c.pyapi.double) + else: + assert typ == types.complex128 + freal, fimag = cval.real, cval.imag + return c.pyapi.complex_from_doubles(freal, fimag) + +@unbox(types.Complex) +def unbox_complex(typ, obj, c): + # First unbox to complex128, since that's what CPython gives us + c128 = c.context.make_complex(c.builder, types.complex128) + ok = c.pyapi.complex_adaptor(obj, c128._getpointer()) + failed = cgutils.is_false(c.builder, ok) + + with cgutils.if_unlikely(c.builder, failed): + c.pyapi.err_set_string("PyExc_TypeError", + "conversion to %s failed" % (typ,)) + + if typ == types.complex64: + # Downcast to complex64 if necessary + cplx = c.context.make_complex(c.builder, typ) + cplx.real = c.context.cast(c.builder, c128.real, + types.float64, types.float32) + cplx.imag = c.context.cast(c.builder, c128.imag, + types.float64, types.float32) + else: + assert typ == types.complex128 + cplx = c128 + return NativeValue(cplx._getvalue(), is_error=failed) + + +@box(types.NoneType) +def box_none(typ, val, c): + return c.pyapi.make_none() + +@unbox(types.NoneType) +@unbox(types.EllipsisType) +def unbox_none(typ, val, c): + return NativeValue(c.context.get_dummy_value()) + + +@box(types.NPDatetime) +def box_npdatetime(typ, val, c): + return c.pyapi.create_np_datetime(val, typ.unit_code) + +@unbox(types.NPDatetime) +def unbox_npdatetime(typ, obj, c): + val = c.pyapi.extract_np_datetime(obj) + return NativeValue(val, is_error=c.pyapi.c_api_error()) + + 
+@box(types.NPTimedelta) +def box_nptimedelta(typ, val, c): + return c.pyapi.create_np_timedelta(val, typ.unit_code) + +@unbox(types.NPTimedelta) +def unbox_nptimedelta(typ, obj, c): + val = c.pyapi.extract_np_timedelta(obj) + return NativeValue(val, is_error=c.pyapi.c_api_error()) + + +@box(types.RawPointer) +def box_raw_pointer(typ, val, c): + """ + Convert a raw pointer to a Python int. + """ + ll_intp = c.context.get_value_type(types.uintp) + addr = c.builder.ptrtoint(val, ll_intp) + return c.box(types.uintp, addr) + + +@box(types.EnumMember) +def box_enum(typ, val, c): + """ + Fetch an enum member given its native value. + """ + valobj = c.box(typ.dtype, val) + # Call the enum class with the value object + cls_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.instance_class)) + return c.pyapi.call_function_objargs(cls_obj, (valobj,)) + + +@unbox(types.EnumMember) +def unbox_enum(typ, obj, c): + """ + Convert an enum member's value to its native value. + """ + valobj = c.pyapi.object_getattr_string(obj, "value") + return c.unbox(typ.dtype, valobj) + +# +# Composite types +# + +@box(types.Record) +def box_record(typ, val, c): + # Note we will create a copy of the record + # This is the only safe way. + size = ir.Constant(ir.IntType(32), val.type.pointee.count) + ptr = c.builder.bitcast(val, ir.PointerType(ir.IntType(8))) + return c.pyapi.recreate_record(ptr, size, typ.dtype, c.env_manager) + + +@unbox(types.Record) +def unbox_record(typ, obj, c): + buf = c.pyapi.alloca_buffer() + ptr = c.pyapi.extract_record_data(obj, buf) + is_error = cgutils.is_null(c.builder, ptr) + + ltyp = c.context.get_value_type(typ) + val = c.builder.bitcast(ptr, ltyp) + + def cleanup(): + c.pyapi.release_buffer(buf) + return NativeValue(val, cleanup=cleanup, is_error=is_error) + + +@box(types.UnicodeCharSeq) +def box_unicodecharseq(typ, val, c): + # XXX could kind be determined from strptr? 
+ unicode_kind = { + 1: c.pyapi.py_unicode_1byte_kind, + 2: c.pyapi.py_unicode_2byte_kind, + 4: c.pyapi.py_unicode_4byte_kind}[numpy_support.sizeof_unicode_char] + kind = c.context.get_constant(types.int32, unicode_kind) + rawptr = cgutils.alloca_once_value(c.builder, value=val) + strptr = c.builder.bitcast(rawptr, c.pyapi.cstring) + + fullsize = c.context.get_constant(types.intp, typ.count) + zero = fullsize.type(0) + one = fullsize.type(1) + step = fullsize.type(numpy_support.sizeof_unicode_char) + count = cgutils.alloca_once_value(c.builder, zero) + with cgutils.loop_nest(c.builder, [fullsize], fullsize.type) as [idx]: + # Get char at idx + ch = c.builder.load(c.builder.gep(strptr, [c.builder.mul(idx, step)])) + # If the char is a non-null-byte, store the next index as count + with c.builder.if_then(cgutils.is_not_null(c.builder, ch)): + c.builder.store(c.builder.add(idx, one), count) + strlen = c.builder.load(count) + return c.pyapi.string_from_kind_and_data(kind, strptr, strlen) + + +@unbox(types.UnicodeCharSeq) +def unbox_unicodecharseq(typ, obj, c): + lty = c.context.get_value_type(typ) + + ok, buffer, size, kind, is_ascii, hashv = \ + c.pyapi.string_as_string_size_and_kind(obj) + + # If conversion is ok, copy the buffer to the output storage. + with cgutils.if_likely(c.builder, ok): + # Check if the returned string size fits in the charseq + storage_size = ir.Constant(size.type, typ.count) + size_fits = c.builder.icmp_unsigned("<=", size, storage_size) + + # Allow truncation of string + size = c.builder.select(size_fits, size, storage_size) + + # Initialize output to zero bytes + null_string = ir.Constant(lty, None) + outspace = cgutils.alloca_once_value(c.builder, null_string) + + # We don't need to set the NULL-terminator because the storage + # is already zero-filled. 
+ cgutils.memcpy(c.builder, + c.builder.bitcast(outspace, buffer.type), + buffer, size) + + ret = c.builder.load(outspace) + return NativeValue(ret, is_error=c.builder.not_(ok)) + + +@box(types.Bytes) +def box_bytes(typ, val, c): + obj = c.context.make_helper(c.builder, typ, val) + ret = c.pyapi.bytes_from_string_and_size(obj.data, obj.nitems) + c.context.nrt.decref(c.builder, typ, val) + return ret + + +@box(types.CharSeq) +def box_charseq(typ, val, c): + rawptr = cgutils.alloca_once_value(c.builder, value=val) + strptr = c.builder.bitcast(rawptr, c.pyapi.cstring) + fullsize = c.context.get_constant(types.intp, typ.count) + zero = fullsize.type(0) + one = fullsize.type(1) + count = cgutils.alloca_once_value(c.builder, zero) + + # Find the length of the string, mimicking Numpy's behaviour: + # search for the last non-null byte in the underlying storage + # (e.g. b'A\0\0B\0\0\0' will return the logical string b'A\0\0B') + with cgutils.loop_nest(c.builder, [fullsize], fullsize.type) as [idx]: + # Get char at idx + ch = c.builder.load(c.builder.gep(strptr, [idx])) + # If the char is a non-null-byte, store the next index as count + with c.builder.if_then(cgutils.is_not_null(c.builder, ch)): + c.builder.store(c.builder.add(idx, one), count) + + strlen = c.builder.load(count) + return c.pyapi.bytes_from_string_and_size(strptr, strlen) + + +@unbox(types.CharSeq) +def unbox_charseq(typ, obj, c): + lty = c.context.get_value_type(typ) + ok, buffer, size = c.pyapi.string_as_string_and_size(obj) + + # If conversion is ok, copy the buffer to the output storage. 
+ with cgutils.if_likely(c.builder, ok): + # Check if the returned string size fits in the charseq + storage_size = ir.Constant(size.type, typ.count) + size_fits = c.builder.icmp_unsigned("<=", size, storage_size) + + # Allow truncation of string + size = c.builder.select(size_fits, size, storage_size) + + # Initialize output to zero bytes + null_string = ir.Constant(lty, None) + outspace = cgutils.alloca_once_value(c.builder, null_string) + + # We don't need to set the NULL-terminator because the storage + # is already zero-filled. + cgutils.memcpy(c.builder, + c.builder.bitcast(outspace, buffer.type), + buffer, size) + + ret = c.builder.load(outspace) + return NativeValue(ret, is_error=c.builder.not_(ok)) + + +@box(types.Optional) +def box_optional(typ, val, c): + optval = c.context.make_helper(c.builder, typ, val) + ret = cgutils.alloca_once_value(c.builder, c.pyapi.borrow_none()) + with c.builder.if_else(optval.valid) as (then, otherwise): + with then: + validres = c.box(typ.type, optval.data) + c.builder.store(validres, ret) + with otherwise: + c.builder.store(c.pyapi.make_none(), ret) + return c.builder.load(ret) + + +@unbox(types.Optional) +def unbox_optional(typ, obj, c): + """ + Convert object *obj* to a native optional structure. 
+ """ + noneval = c.context.make_optional_none(c.builder, typ.type) + is_not_none = c.builder.icmp_signed('!=', obj, c.pyapi.borrow_none()) + + retptr = cgutils.alloca_once(c.builder, noneval.type) + errptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit) + + with c.builder.if_else(is_not_none) as (then, orelse): + with then: + native = c.unbox(typ.type, obj) + just = c.context.make_optional_value(c.builder, + typ.type, native.value) + c.builder.store(just, retptr) + c.builder.store(native.is_error, errptr) + + with orelse: + c.builder.store(noneval, retptr) + + if native.cleanup is not None: + def cleanup(): + with c.builder.if_then(is_not_none): + native.cleanup() + else: + cleanup = None + + ret = c.builder.load(retptr) + return NativeValue(ret, is_error=c.builder.load(errptr), + cleanup=cleanup) + + +@unbox(types.SliceType) +def unbox_slice(typ, obj, c): + """ + Convert object *obj* to a native slice structure. + """ + from numba.cpython import slicing + ok, start, stop, step = c.pyapi.slice_as_ints(obj) + sli = c.context.make_helper(c.builder, typ) + sli.start = start + sli.stop = stop + sli.step = step + return NativeValue(sli._getvalue(), is_error=c.builder.not_(ok)) + +@box(types.SliceLiteral) +def box_slice_literal(typ, val, c): + # Check for integer overflows at compile time. + slice_lit = typ.literal_value + for field_name in ("start", "stop", "step"): + field_obj = getattr(slice_lit, field_name) + if isinstance(field_obj, int): + try: + typeof(field_obj, Purpose) + except ValueError as e: + raise ValueError(( + f"Unable to create literal slice. " + f"Error encountered with {field_name} " + f"attribute. 
{str(e)}") + ) + + py_ctor, py_args = typ.literal_value.__reduce__() + serialized_ctor = c.pyapi.serialize_object(py_ctor) + serialized_args = c.pyapi.serialize_object(py_args) + ctor = c.pyapi.unserialize(serialized_ctor) + args = c.pyapi.unserialize(serialized_args) + obj = c.pyapi.call(ctor, args) + c.pyapi.decref(ctor) + c.pyapi.decref(args) + return obj + +@unbox(types.StringLiteral) +def unbox_string_literal(typ, obj, c): + # A string literal is a dummy value + return NativeValue(c.context.get_dummy_value()) + +# +# Collections +# + +# NOTE: boxing functions are supposed to steal any NRT references in +# the given native value. + +@box(types.Array) +def box_array(typ, val, c): + nativearycls = c.context.make_array(typ) + nativeary = nativearycls(c.context, c.builder, value=val) + if c.context.enable_nrt: + np_dtype = numpy_support.as_dtype(typ.dtype) + dtypeptr = c.env_manager.read_const(c.env_manager.add_const(np_dtype)) + newary = c.pyapi.nrt_adapt_ndarray_to_python(typ, val, dtypeptr) + # Steals NRT ref + c.context.nrt.decref(c.builder, typ, val) + return newary + else: + parent = nativeary.parent + c.pyapi.incref(parent) + return parent + + +@unbox(types.Buffer) +def unbox_buffer(typ, obj, c): + """ + Convert a Py_buffer-providing object to a native array structure. 
+ """ + buf = c.pyapi.alloca_buffer() + res = c.pyapi.get_buffer(obj, buf) + is_error = cgutils.is_not_null(c.builder, res) + + nativearycls = c.context.make_array(typ) + nativeary = nativearycls(c.context, c.builder) + aryptr = nativeary._getpointer() + + with cgutils.if_likely(c.builder, c.builder.not_(is_error)): + ptr = c.builder.bitcast(aryptr, c.pyapi.voidptr) + if c.context.enable_nrt: + c.pyapi.nrt_adapt_buffer_from_python(buf, ptr) + else: + c.pyapi.numba_buffer_adaptor(buf, ptr) + + def cleanup(): + c.pyapi.release_buffer(buf) + + return NativeValue(c.builder.load(aryptr), is_error=is_error, + cleanup=cleanup) + +@unbox(types.Array) +def unbox_array(typ, obj, c): + """ + Convert a Numpy array object to a native array structure. + """ + # This is necessary because unbox_buffer() does not work on some + # dtypes, e.g. datetime64 and timedelta64. + # TODO check matching dtype. + # currently, mismatching dtype will still work and causes + # potential memory corruption + nativearycls = c.context.make_array(typ) + nativeary = nativearycls(c.context, c.builder) + aryptr = nativeary._getpointer() + + ptr = c.builder.bitcast(aryptr, c.pyapi.voidptr) + if c.context.enable_nrt: + errcode = c.pyapi.nrt_adapt_ndarray_from_python(obj, ptr) + else: + errcode = c.pyapi.numba_array_adaptor(obj, ptr) + + # TODO: here we have minimal typechecking by the itemsize. 
+ # need to do better + try: + expected_itemsize = numpy_support.as_dtype(typ.dtype).itemsize + except NumbaNotImplementedError: + # Don't check types that can't be `as_dtype()`-ed + itemsize_mismatch = cgutils.false_bit + else: + expected_itemsize = nativeary.itemsize.type(expected_itemsize) + itemsize_mismatch = c.builder.icmp_unsigned( + '!=', + nativeary.itemsize, + expected_itemsize, + ) + + failed = c.builder.or_( + cgutils.is_not_null(c.builder, errcode), + itemsize_mismatch, + ) + # Handle error + with c.builder.if_then(failed, likely=False): + c.pyapi.err_set_string("PyExc_TypeError", + "can't unbox array from PyObject into " + "native value. The object maybe of a " + "different type") + return NativeValue(c.builder.load(aryptr), is_error=failed) + + +@box(types.Tuple) +@box(types.UniTuple) +def box_tuple(typ, val, c): + """ + Convert native array or structure *val* to a tuple object. + """ + tuple_val = c.pyapi.tuple_new(typ.count) + + for i, dtype in enumerate(typ): + item = c.builder.extract_value(val, i) + obj = c.box(dtype, item) + c.pyapi.tuple_setitem(tuple_val, i, obj) + + return tuple_val + +@box(types.NamedTuple) +@box(types.NamedUniTuple) +def box_namedtuple(typ, val, c): + """ + Convert native array or structure *val* to a namedtuple object. + """ + cls_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.instance_class)) + tuple_obj = box_tuple(typ, val, c) + obj = c.pyapi.call(cls_obj, tuple_obj) + c.pyapi.decref(cls_obj) + c.pyapi.decref(tuple_obj) + return obj + + +@unbox(types.BaseTuple) +def unbox_tuple(typ, obj, c): + """ + Convert tuple *obj* to a native array (if homogeneous) or structure. 
+ """ + n = len(typ) + values = [] + cleanups = [] + lty = c.context.get_value_type(typ) + + is_error_ptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit) + value_ptr = cgutils.alloca_once(c.builder, lty) + + # Issue #1638: need to check the tuple size + actual_size = c.pyapi.tuple_size(obj) + size_matches = c.builder.icmp_unsigned('==', actual_size, + ir.Constant(actual_size.type, n)) + with c.builder.if_then(c.builder.not_(size_matches), likely=False): + c.pyapi.err_format( + "PyExc_ValueError", + "size mismatch for tuple, expected %d element(s) but got %%zd" % (n,), + actual_size) + c.builder.store(cgutils.true_bit, is_error_ptr) + + # We unbox the items even if not `size_matches`, to avoid issues with + # the generated IR (instruction doesn't dominate all uses) + for i, eltype in enumerate(typ): + elem = c.pyapi.tuple_getitem(obj, i) + native = c.unbox(eltype, elem) + values.append(native.value) + with c.builder.if_then(native.is_error, likely=False): + c.builder.store(cgutils.true_bit, is_error_ptr) + if native.cleanup is not None: + cleanups.append(native.cleanup) + + value = c.context.make_tuple(c.builder, typ, values) + c.builder.store(value, value_ptr) + + if cleanups: + with c.builder.if_then(size_matches, likely=True): + def cleanup(): + for func in reversed(cleanups): + func() + else: + cleanup = None + + return NativeValue(c.builder.load(value_ptr), cleanup=cleanup, + is_error=c.builder.load(is_error_ptr)) + + +@box(types.List) +def box_list(typ, val, c): + """ + Convert native list *val* to a list object. 
+ """ + list = listobj.ListInstance(c.context, c.builder, typ, val) + obj = list.parent + res = cgutils.alloca_once_value(c.builder, obj) + with c.builder.if_else(cgutils.is_not_null(c.builder, obj)) as (has_parent, otherwise): + with has_parent: + # List is actually reflected => return the original object + # (note not all list instances whose *type* is reflected are + # actually reflected; see numba.tests.test_lists for an example) + c.pyapi.incref(obj) + + with otherwise: + # Build a new Python list + nitems = list.size + obj = c.pyapi.list_new(nitems) + with c.builder.if_then(cgutils.is_not_null(c.builder, obj), + likely=True): + with cgutils.for_range(c.builder, nitems) as loop: + item = list.getitem(loop.index) + list.incref_value(item) + itemobj = c.box(typ.dtype, item) + c.pyapi.list_setitem(obj, loop.index, itemobj) + + c.builder.store(obj, res) + + # Steal NRT ref + c.context.nrt.decref(c.builder, typ, val) + return c.builder.load(res) + + +class _NumbaTypeHelper(object): + """A helper for acquiring `numba.typeof` for type checking. + + Usage + ----- + + # `c` is the boxing context. + with _NumbaTypeHelper(c) as nth: + # This contextmanager maintains the lifetime of the `numba.typeof` + # function. + the_numba_type = nth.typeof(some_object) + # Do work on the type object + do_checks(the_numba_type) + # Cleanup + c.pyapi.decref(the_numba_type) + # At this point *nth* should not be used. 
+ """ + def __init__(self, c): + self.c = c + + def __enter__(self): + c = self.c + numba_name = c.context.insert_const_string(c.builder.module, 'numba') + numba_mod = c.pyapi.import_module_noblock(numba_name) + typeof_fn = c.pyapi.object_getattr_string(numba_mod, 'typeof') + self.typeof_fn = typeof_fn + c.pyapi.decref(numba_mod) + return self + + def __exit__(self, *args, **kwargs): + c = self.c + c.pyapi.decref(self.typeof_fn) + + def typeof(self, obj): + res = self.c.pyapi.call_function_objargs(self.typeof_fn, [obj]) + return res + + +def _python_list_to_native(typ, obj, c, size, listptr, errorptr): + """ + Construct a new native list from a Python list. + """ + def check_element_type(nth, itemobj, expected_typobj): + typobj = nth.typeof(itemobj) + # Check if *typobj* is NULL + with c.builder.if_then( + cgutils.is_null(c.builder, typobj), + likely=False, + ): + c.builder.store(cgutils.true_bit, errorptr) + loop.do_break() + # Mandate that objects all have the same exact type + type_mismatch = c.builder.icmp_signed('!=', typobj, expected_typobj) + + with c.builder.if_then(type_mismatch, likely=False): + c.builder.store(cgutils.true_bit, errorptr) + c.pyapi.err_format( + "PyExc_TypeError", + "can't unbox heterogeneous list: %S != %S", + expected_typobj, typobj, + ) + c.pyapi.decref(typobj) + loop.do_break() + c.pyapi.decref(typobj) + + # Allocate a new native list + ok, list = listobj.ListInstance.allocate_ex(c.context, c.builder, typ, size) + with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok): + with if_ok: + list.size = size + zero = ir.Constant(size.type, 0) + with c.builder.if_then(c.builder.icmp_signed('>', size, zero), + likely=True): + # Traverse Python list and unbox objects into native list + with _NumbaTypeHelper(c) as nth: + # Note: *expected_typobj* can't be NULL + expected_typobj = nth.typeof(c.pyapi.list_getitem(obj, zero)) + with cgutils.for_range(c.builder, size) as loop: + itemobj = c.pyapi.list_getitem(obj, loop.index) + 
check_element_type(nth, itemobj, expected_typobj) + # XXX we don't call native cleanup for each + # list element, since that would require keeping + # of which unboxings have been successful. + native = c.unbox(typ.dtype, itemobj) + with c.builder.if_then(native.is_error, likely=False): + c.builder.store(cgutils.true_bit, errorptr) + loop.do_break() + # The reference is borrowed so incref=False + list.setitem(loop.index, native.value, incref=False) + c.pyapi.decref(expected_typobj) + if typ.reflected: + list.parent = obj + # Stuff meminfo pointer into the Python object for + # later reuse. + with c.builder.if_then(c.builder.not_(c.builder.load(errorptr)), + likely=False): + c.pyapi.object_set_private_data(obj, list.meminfo) + list.set_dirty(False) + c.builder.store(list.value, listptr) + + with if_not_ok: + c.builder.store(cgutils.true_bit, errorptr) + + # If an error occurred, drop the whole native list + with c.builder.if_then(c.builder.load(errorptr)): + c.context.nrt.decref(c.builder, typ, list.value) + + +@unbox(types.List) +def unbox_list(typ, obj, c): + """ + Convert list *obj* to a native list. + + If list was previously unboxed, we reuse the existing native list + to ensure consistency. + """ + size = c.pyapi.list_size(obj) + + errorptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit) + listptr = cgutils.alloca_once(c.builder, c.context.get_value_type(typ)) + + # See if the list was previously unboxed, if so, re-use the meminfo. 
+ ptr = c.pyapi.object_get_private_data(obj) + + with c.builder.if_else(cgutils.is_not_null(c.builder, ptr)) \ + as (has_meminfo, otherwise): + + with has_meminfo: + # List was previously unboxed => reuse meminfo + list = listobj.ListInstance.from_meminfo(c.context, c.builder, typ, ptr) + list.size = size + if typ.reflected: + list.parent = obj + c.builder.store(list.value, listptr) + + with otherwise: + _python_list_to_native(typ, obj, c, size, listptr, errorptr) + + def cleanup(): + # Clean up the associated pointer, as the meminfo is now invalid. + c.pyapi.object_reset_private_data(obj) + + return NativeValue(c.builder.load(listptr), + is_error=c.builder.load(errorptr), + cleanup=cleanup) + + +@reflect(types.List) +def reflect_list(typ, val, c): + """ + Reflect the native list's contents into the Python object. + """ + if not typ.reflected: + return + if typ.dtype.reflected: + msg = "cannot reflect element of reflected container: {}\n".format(typ) + raise TypeError(msg) + + list = listobj.ListInstance(c.context, c.builder, typ, val) + with c.builder.if_then(list.dirty, likely=False): + obj = list.parent + size = c.pyapi.list_size(obj) + new_size = list.size + diff = c.builder.sub(new_size, size) + diff_gt_0 = c.builder.icmp_signed('>=', diff, + ir.Constant(diff.type, 0)) + with c.builder.if_else(diff_gt_0) as (if_grow, if_shrink): + # XXX no error checking below + with if_grow: + # First overwrite existing items + with cgutils.for_range(c.builder, size) as loop: + item = list.getitem(loop.index) + list.incref_value(item) + itemobj = c.box(typ.dtype, item) + c.pyapi.list_setitem(obj, loop.index, itemobj) + # Then add missing items + with cgutils.for_range(c.builder, diff) as loop: + idx = c.builder.add(size, loop.index) + item = list.getitem(idx) + list.incref_value(item) + itemobj = c.box(typ.dtype, item) + c.pyapi.list_append(obj, itemobj) + c.pyapi.decref(itemobj) + + with if_shrink: + # First delete list tail + c.pyapi.list_setslice(obj, new_size, size, None) 
+ # Then overwrite remaining items + with cgutils.for_range(c.builder, new_size) as loop: + item = list.getitem(loop.index) + list.incref_value(item) + itemobj = c.box(typ.dtype, item) + c.pyapi.list_setitem(obj, loop.index, itemobj) + + # Mark the list clean, in case it is reflected twice + list.set_dirty(False) + + +def _python_set_to_native(typ, obj, c, size, setptr, errorptr): + """ + Construct a new native set from a Python set. + """ + # Allocate a new native set + ok, inst = setobj.SetInstance.allocate_ex(c.context, c.builder, typ, size) + with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok): + with if_ok: + # Traverse Python set and unbox objects into native set + typobjptr = cgutils.alloca_once_value(c.builder, + ir.Constant(c.pyapi.pyobj, None)) + + with c.pyapi.set_iterate(obj) as loop: + itemobj = loop.value + # Mandate that objects all have the same exact type + typobj = c.pyapi.get_type(itemobj) + expected_typobj = c.builder.load(typobjptr) + + with c.builder.if_else( + cgutils.is_null(c.builder, expected_typobj), + likely=False) as (if_first, if_not_first): + with if_first: + # First iteration => store item type + c.builder.store(typobj, typobjptr) + with if_not_first: + # Otherwise, check item type + type_mismatch = c.builder.icmp_signed('!=', typobj, + expected_typobj) + with c.builder.if_then(type_mismatch, likely=False): + c.builder.store(cgutils.true_bit, errorptr) + c.pyapi.err_set_string("PyExc_TypeError", + "can't unbox heterogeneous set") + loop.do_break() + + # XXX we don't call native cleanup for each set element, + # since that would require keeping track + # of which unboxings have been successful. + native = c.unbox(typ.dtype, itemobj) + with c.builder.if_then(native.is_error, likely=False): + c.builder.store(cgutils.true_bit, errorptr) + inst.add_pyapi(c.pyapi, native.value, do_resize=False) + + if typ.reflected: + inst.parent = obj + # Associate meminfo pointer with the Python object for later reuse. 
+ with c.builder.if_then(c.builder.not_(c.builder.load(errorptr)), + likely=False): + c.pyapi.object_set_private_data(obj, inst.meminfo) + inst.set_dirty(False) + c.builder.store(inst.value, setptr) + + with if_not_ok: + c.builder.store(cgutils.true_bit, errorptr) + + # If an error occurred, drop the whole native set + with c.builder.if_then(c.builder.load(errorptr)): + c.context.nrt.decref(c.builder, typ, inst.value) + + +@unbox(types.Set) +def unbox_set(typ, obj, c): + """ + Convert set *obj* to a native set. + + If set was previously unboxed, we reuse the existing native set + to ensure consistency. + """ + size = c.pyapi.set_size(obj) + + errorptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit) + setptr = cgutils.alloca_once(c.builder, c.context.get_value_type(typ)) + + # See if the set was previously unboxed, if so, re-use the meminfo. + ptr = c.pyapi.object_get_private_data(obj) + + with c.builder.if_else(cgutils.is_not_null(c.builder, ptr)) \ + as (has_meminfo, otherwise): + + with has_meminfo: + # Set was previously unboxed => reuse meminfo + inst = setobj.SetInstance.from_meminfo(c.context, c.builder, typ, ptr) + if typ.reflected: + inst.parent = obj + c.builder.store(inst.value, setptr) + + with otherwise: + _python_set_to_native(typ, obj, c, size, setptr, errorptr) + + def cleanup(): + # Clean up the associated pointer, as the meminfo is now invalid. + c.pyapi.object_reset_private_data(obj) + + return NativeValue(c.builder.load(setptr), + is_error=c.builder.load(errorptr), + cleanup=cleanup) + + +def _native_set_to_python_list(typ, payload, c): + """ + Create a Python list from a native set's items. 
+ """ + nitems = payload.used + listobj = c.pyapi.list_new(nitems) + ok = cgutils.is_not_null(c.builder, listobj) + with c.builder.if_then(ok, likely=True): + index = cgutils.alloca_once_value(c.builder, + ir.Constant(nitems.type, 0)) + with payload._iterate() as loop: + i = c.builder.load(index) + item = loop.entry.key + c.context.nrt.incref(c.builder, typ.dtype, item) + itemobj = c.box(typ.dtype, item) + c.pyapi.list_setitem(listobj, i, itemobj) + i = c.builder.add(i, ir.Constant(i.type, 1)) + c.builder.store(i, index) + + return ok, listobj + + +@box(types.Set) +def box_set(typ, val, c): + """ + Convert native set *val* to a set object. + """ + inst = setobj.SetInstance(c.context, c.builder, typ, val) + obj = inst.parent + res = cgutils.alloca_once_value(c.builder, obj) + + with c.builder.if_else(cgutils.is_not_null(c.builder, obj)) as (has_parent, otherwise): + with has_parent: + # Set is actually reflected => return the original object + # (note not all set instances whose *type* is reflected are + # actually reflected; see numba.tests.test_sets for an example) + c.pyapi.incref(obj) + + with otherwise: + # Build a new Python list and then create a set from that + payload = inst.payload + ok, listobj = _native_set_to_python_list(typ, payload, c) + with c.builder.if_then(ok, likely=True): + obj = c.pyapi.set_new(listobj) + c.pyapi.decref(listobj) + c.builder.store(obj, res) + + # Steal NRT ref + c.context.nrt.decref(c.builder, typ, val) + return c.builder.load(res) + +@reflect(types.Set) +def reflect_set(typ, val, c): + """ + Reflect the native set's contents into the Python object. 
+ """ + if not typ.reflected: + return + inst = setobj.SetInstance(c.context, c.builder, typ, val) + payload = inst.payload + + with c.builder.if_then(payload.dirty, likely=False): + obj = inst.parent + # XXX errors are not dealt with below + c.pyapi.set_clear(obj) + + # Build a new Python list and then update the set with that + ok, listobj = _native_set_to_python_list(typ, payload, c) + with c.builder.if_then(ok, likely=True): + c.pyapi.set_update(obj, listobj) + c.pyapi.decref(listobj) + + # Mark the set clean, in case it is reflected twice + inst.set_dirty(False) + + +# +# Other types +# + +@box(types.Generator) +def box_generator(typ, val, c): + return c.pyapi.from_native_generator(val, typ, c.env_manager.env_ptr) + +@unbox(types.Generator) +def unbox_generator(typ, obj, c): + return c.pyapi.to_native_generator(obj, typ) + + +@box(types.DType) +def box_dtype(typ, val, c): + np_dtype = numpy_support.as_dtype(typ.dtype) + return c.pyapi.unserialize(c.pyapi.serialize_object(np_dtype)) + +@unbox(types.DType) +def unbox_dtype(typ, val, c): + return NativeValue(c.context.get_dummy_value()) + + +@box(types.NumberClass) +def box_number_class(typ, val, c): + np_dtype = numpy_support.as_dtype(typ.dtype) + return c.pyapi.unserialize(c.pyapi.serialize_object(np_dtype)) + +@unbox(types.NumberClass) +def unbox_number_class(typ, val, c): + return NativeValue(c.context.get_dummy_value()) + + +@box(types.PyObject) +@box(types.Object) +def box_pyobject(typ, val, c): + return val + +@unbox(types.PyObject) +@unbox(types.Object) +def unbox_pyobject(typ, obj, c): + return NativeValue(obj) + + +@unbox(types.ExternalFunctionPointer) +def unbox_funcptr(typ, obj, c): + if typ.get_pointer is None: + raise NotImplementedError(typ) + + # Call get_pointer() on the object to get the raw pointer value + ptrty = c.context.get_function_pointer_type(typ) + ret = cgutils.alloca_once_value(c.builder, + ir.Constant(ptrty, None), + name='fnptr') + ser = c.pyapi.serialize_object(typ.get_pointer) + 
get_pointer = c.pyapi.unserialize(ser) + with cgutils.if_likely(c.builder, + cgutils.is_not_null(c.builder, get_pointer)): + intobj = c.pyapi.call_function_objargs(get_pointer, (obj,)) + c.pyapi.decref(get_pointer) + with cgutils.if_likely(c.builder, + cgutils.is_not_null(c.builder, intobj)): + ptr = c.pyapi.long_as_voidptr(intobj) + c.pyapi.decref(intobj) + c.builder.store(c.builder.bitcast(ptr, ptrty), ret) + return NativeValue(c.builder.load(ret), is_error=c.pyapi.c_api_error()) + +@box(types.DeferredType) +def box_deferred(typ, val, c): + out = c.pyapi.from_native_value(typ.get(), + c.builder.extract_value(val, [0]), + env_manager=c.env_manager) + return out + + +@unbox(types.DeferredType) +def unbox_deferred(typ, obj, c): + native_value = c.pyapi.to_native_value(typ.get(), obj) + model = c.context.data_model_manager[typ] + res = model.set(c.builder, model.make_uninitialized(), native_value.value) + return NativeValue(res, is_error=native_value.is_error, + cleanup=native_value.cleanup) + + +@unbox(types.Dispatcher) +def unbox_dispatcher(typ, obj, c): + # In native code, Dispatcher types can be casted to FunctionType. + return NativeValue(obj) + + +@box(types.Dispatcher) +def box_pyobject(typ, val, c): + c.pyapi.incref(val) + return val + + +def unbox_unsupported(typ, obj, c): + c.pyapi.err_set_string("PyExc_TypeError", + "can't unbox {!r} type".format(typ)) + res = c.context.get_constant_null(typ) + return NativeValue(res, is_error=cgutils.true_bit) + + +def box_unsupported(typ, val, c): + msg = "cannot convert native %s to Python object" % (typ,) + c.pyapi.err_set_string("PyExc_TypeError", msg) + res = c.pyapi.get_null_object() + return res + + +@box(types.Literal) +def box_literal(typ, val, c): + # Const type contains the python object of the constant value, + # which we can directly return. 
+ retval = typ.literal_value + # Serialize the value into the IR + return c.pyapi.unserialize(c.pyapi.serialize_object(retval)) + + +@box(types.MemInfoPointer) +def box_meminfo_pointer(typ, val, c): + return c.pyapi.nrt_meminfo_as_pyobject(val) + + +@unbox(types.MemInfoPointer) +def unbox_meminfo_pointer(typ, obj, c): + res = c.pyapi.nrt_meminfo_from_pyobject(obj) + errored = cgutils.is_null(c.builder, res) + return NativeValue(res, is_error=errored) + +@unbox(types.TypeRef) +def unbox_typeref(typ, val, c): + return NativeValue(c.context.get_dummy_value(), is_error=cgutils.false_bit) + + +@box(types.LiteralStrKeyDict) +def box_LiteralStrKeyDict(typ, val, c): + return box_unsupported(typ, val, c) + + +@contextmanager +def early_exit_if(builder, stack: ExitStack, cond): + then, otherwise = stack.enter_context(builder.if_else(cond, likely=False)) + with then: + yield + stack.enter_context(otherwise) + + +def early_exit_if_null(builder, stack, obj): + return early_exit_if(builder, stack, cgutils.is_null(builder, obj)) + + +# Original implementation at: https://github.com/numba/numba/issues/4499#issuecomment-1063138477 +@unbox(types.NumPyRandomBitGeneratorType) +def unbox_numpy_random_bitgenerator(typ, obj, c): + """ + The bit_generator instance has a `.ctypes` attr which is a namedtuple + with the following members (types): + * state_address (Python int) + * state (ctypes.c_void_p) + * next_uint64 (ctypes.CFunctionType instance) + * next_uint32 (ctypes.CFunctionType instance) + * next_double (ctypes.CFunctionType instance) + * bit_generator (ctypes.c_void_p) + """ + + is_error_ptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit) + extra_refs = [] + + def clear_extra_refs(): + for _ref in extra_refs: + c.pyapi.decref(_ref) + + def handle_failure(): + c.builder.store(cgutils.true_bit, is_error_ptr) + clear_extra_refs() + + with ExitStack() as stack: + + def object_getattr_safely(obj, attr): + attr_obj = c.pyapi.object_getattr_string(obj, attr) + 
extra_refs.append(attr_obj) + return attr_obj + + struct_ptr = cgutils.create_struct_proxy(typ)(c.context, c.builder) + struct_ptr.parent = obj + + # Get the .ctypes attr + ctypes_binding = object_getattr_safely(obj, 'ctypes') + with early_exit_if_null(c.builder, stack, ctypes_binding): + handle_failure() + + # Look up the "state_address" member and wire it into the struct + interface_state_address = object_getattr_safely( + ctypes_binding, 'state_address') + with early_exit_if_null(c.builder, stack, interface_state_address): + handle_failure() + + setattr(struct_ptr, 'state_address', + c.unbox(types.uintp, interface_state_address).value) + + # Look up the "state" member and wire it into the struct + interface_state = object_getattr_safely(ctypes_binding, 'state') + with early_exit_if_null(c.builder, stack, interface_state): + handle_failure() + + interface_state_value = object_getattr_safely( + interface_state, 'value') + with early_exit_if_null(c.builder, stack, interface_state_value): + handle_failure() + setattr( + struct_ptr, + 'state', + c.unbox( + types.uintp, + interface_state_value).value) + + # Want to store callable function pointers to these CFunctionTypes, so + # import ctypes and use it to cast the CFunctionTypes to c_void_p and + # store the results. 
+ # First find ctypes.cast, and ctypes.c_void_p + ctypes_name = c.context.insert_const_string(c.builder.module, 'ctypes') + ctypes_module = c.pyapi.import_module_noblock(ctypes_name) + extra_refs.append(ctypes_module) + with early_exit_if_null(c.builder, stack, ctypes_module): + handle_failure() + + ct_cast = object_getattr_safely(ctypes_module, 'cast') + with early_exit_if_null(c.builder, stack, ct_cast): + handle_failure() + + ct_voidptr_ty = object_getattr_safely(ctypes_module, 'c_void_p') + with early_exit_if_null(c.builder, stack, ct_voidptr_ty): + handle_failure() + + # This wires in the fnptrs referred to by name + def wire_in_fnptrs(name): + # Find the CFunctionType function + interface_next_fn = c.pyapi.object_getattr_string( + ctypes_binding, name) + + extra_refs.append(interface_next_fn) + with early_exit_if_null(c.builder, stack, interface_next_fn): + handle_failure() + + # Want to do ctypes.cast(CFunctionType, ctypes.c_void_p), create an + # args tuple for that. + extra_refs.append(ct_voidptr_ty) + args = c.pyapi.tuple_pack([interface_next_fn, ct_voidptr_ty]) + with early_exit_if_null(c.builder, stack, args): + handle_failure() + extra_refs.append(ct_voidptr_ty) + + # Call ctypes.cast() + interface_next_fn_casted = c.pyapi.call(ct_cast, args) + + # Fetch the .value attr on the resulting ctypes.c_void_p for storage + # in the function pointer slot. 
+ interface_next_fn_casted_value = object_getattr_safely( + interface_next_fn_casted, 'value') + with early_exit_if_null(c.builder, stack, interface_next_fn_casted_value): + handle_failure() + + # Wire up + setattr(struct_ptr, f'fnptr_{name}', + c.unbox(types.uintp, interface_next_fn_casted_value).value) + + + wire_in_fnptrs('next_double') + wire_in_fnptrs('next_uint64') + wire_in_fnptrs('next_uint32') + + clear_extra_refs() + + return NativeValue(struct_ptr._getvalue(), is_error=c.builder.load(is_error_ptr)) + +_bit_gen_type = types.NumPyRandomBitGeneratorType('bit_generator') + +@unbox(types.NumPyRandomGeneratorType) +def unbox_numpy_random_generator(typ, obj, c): + """ + Here we're creating a NumPyRandomGeneratorType StructModel with following fields: + * ('bit_generator', _bit_gen_type): The unboxed BitGenerator associated with + this Generator object instance. + * ('parent', types.pyobject): Pointer to the original Generator PyObject. + * ('meminfo', types.MemInfoPointer(types.voidptr)): The information about the memory + stored at the pointer (to the original Generator PyObject). This is useful for + keeping track of reference counts within the Python runtime. Helps prevent cases + where deletion happens in Python runtime without NRT being awareness of it. 
+ """ + is_error_ptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit) + + with ExitStack() as stack: + struct_ptr = cgutils.create_struct_proxy(typ)(c.context, c.builder) + bit_gen_inst = c.pyapi.object_getattr_string(obj, 'bit_generator') + with early_exit_if_null(c.builder, stack, bit_gen_inst): + c.builder.store(cgutils.true_bit, is_error_ptr) + unboxed = c.unbox(_bit_gen_type, bit_gen_inst).value + struct_ptr.bit_generator = unboxed + struct_ptr.parent = obj + NULL = cgutils.voidptr_t(None) + struct_ptr.meminfo = c.pyapi.nrt_meminfo_new_from_pyobject( + NULL, # there's no data + obj, # the python object, the call to nrt_meminfo_new_from_pyobject + # will py_incref + ) + c.pyapi.decref(bit_gen_inst) + + return NativeValue(struct_ptr._getvalue(), is_error=c.builder.load(is_error_ptr)) + + +@box(types.NumPyRandomGeneratorType) +def box_numpy_random_generator(typ, val, c): + inst = c.context.make_helper(c.builder, typ, val) + obj = inst.parent + res = cgutils.alloca_once_value(c.builder, obj) + c.pyapi.incref(obj) + # Steal NRT ref + c.context.nrt.decref(c.builder, typ, val) + return c.builder.load(res) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/bytecode.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/bytecode.py new file mode 100644 index 0000000000000000000000000000000000000000..13516cd15290e9ebd753925ee15fc305c1b003c6 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/bytecode.py @@ -0,0 +1,369 @@ +from collections import namedtuple, OrderedDict +import dis +import inspect +import itertools +from types import CodeType, ModuleType + +from numba.core import errors, utils, serialize +from numba.core.utils import PYVERSION + +opcode_info = namedtuple('opcode_info', ['argsize']) + +# The following offset is used as a hack to inject a NOP at the start of the +# bytecode. So that function starting with `while True` will not have block-0 +# as a jump target. 
The Lowerer puts argument initialization at block-0. +_FIXED_OFFSET = 2 + + +def get_function_object(obj): + """ + Objects that wraps function should provide a "__numba__" magic attribute + that contains a name of an attribute that contains the actual python + function object. + """ + attr = getattr(obj, "__numba__", None) + if attr: + return getattr(obj, attr) + return obj + + +def get_code_object(obj): + "Shamelessly borrowed from llpython" + return getattr(obj, '__code__', getattr(obj, 'func_code', None)) + + +def _as_opcodes(seq): + lst = [] + for s in seq: + c = dis.opmap.get(s) + if c is not None: + lst.append(c) + return lst + + +JREL_OPS = frozenset(dis.hasjrel) +JABS_OPS = frozenset(dis.hasjabs) +JUMP_OPS = JREL_OPS | JABS_OPS +TERM_OPS = frozenset(_as_opcodes(['RETURN_VALUE', 'RAISE_VARARGS'])) +EXTENDED_ARG = dis.EXTENDED_ARG +HAVE_ARGUMENT = dis.HAVE_ARGUMENT + + +class ByteCodeInst(object): + ''' + Attributes + ---------- + - offset: + byte offset of opcode + - opcode: + opcode integer value + - arg: + instruction arg + - lineno: + -1 means unknown + ''' + __slots__ = 'offset', 'next', 'opcode', 'opname', 'arg', 'lineno' + + def __init__(self, offset, opcode, arg, nextoffset): + self.offset = offset + self.next = nextoffset + self.opcode = opcode + self.opname = dis.opname[opcode] + self.arg = arg + self.lineno = -1 # unknown line number + + @property + def is_jump(self): + return self.opcode in JUMP_OPS + + @property + def is_terminator(self): + return self.opcode in TERM_OPS + + def get_jump_target(self): + # With Python 3.10 the addressing of "bytecode" instructions has + # changed from using bytes to using 16-bit words instead. As a + # consequence the code to determine where a jump will lead had to be + # adapted. 
+ # See also: + # https://bugs.python.org/issue26647 + # https://bugs.python.org/issue27129 + # https://github.com/python/cpython/pull/25069 + assert self.is_jump + if PYVERSION >= (3, 10): + if self.opcode in JREL_OPS: + return self.next + self.arg * 2 + else: + assert self.opcode in JABS_OPS + return self.arg * 2 - 2 + else: + if self.opcode in JREL_OPS: + return self.next + self.arg + else: + assert self.opcode in JABS_OPS + return self.arg + + def __repr__(self): + return '%s(arg=%s, lineno=%d)' % (self.opname, self.arg, self.lineno) + + @property + def block_effect(self): + """Effect of the block stack + Returns +1 (push), 0 (none) or -1 (pop) + """ + if self.opname.startswith('SETUP_'): + return 1 + elif self.opname == 'POP_BLOCK': + return -1 + else: + return 0 + + +CODE_LEN = 1 +ARG_LEN = 1 +NO_ARG_LEN = 1 + +OPCODE_NOP = dis.opname.index('NOP') + + +# Adapted from Lib/dis.py +def _unpack_opargs(code): + """ + Returns a 4-int-tuple of + (bytecode offset, opcode, argument, offset of next bytecode). + """ + extended_arg = 0 + n = len(code) + offset = i = 0 + while i < n: + op = code[i] + i += CODE_LEN + if op >= HAVE_ARGUMENT: + arg = code[i] | extended_arg + for j in range(ARG_LEN): + arg |= code[i + j] << (8 * j) + i += ARG_LEN + if op == EXTENDED_ARG: + extended_arg = arg << 8 * ARG_LEN + continue + else: + arg = None + i += NO_ARG_LEN + + extended_arg = 0 + yield (offset, op, arg, i) + offset = i # Mark inst offset at first extended + + +def _patched_opargs(bc_stream): + """Patch the bytecode stream. + + - Adds a NOP bytecode at the start to avoid jump target being at the entry. + """ + # Injected NOP + yield (0, OPCODE_NOP, None, _FIXED_OFFSET) + # Adjust bytecode offset for the rest of the stream + for offset, opcode, arg, nextoffset in bc_stream: + # If the opcode has an absolute jump target, adjust it. 
+ if opcode in JABS_OPS: + arg += _FIXED_OFFSET + yield offset + _FIXED_OFFSET, opcode, arg, nextoffset + _FIXED_OFFSET + + +class ByteCodeIter(object): + def __init__(self, code): + self.code = code + self.iter = iter(_patched_opargs(_unpack_opargs(self.code.co_code))) + + def __iter__(self): + return self + + def _fetch_opcode(self): + return next(self.iter) + + def next(self): + offset, opcode, arg, nextoffset = self._fetch_opcode() + return offset, ByteCodeInst(offset=offset, opcode=opcode, arg=arg, + nextoffset=nextoffset) + + __next__ = next + + def read_arg(self, size): + buf = 0 + for i in range(size): + _offset, byte = next(self.iter) + buf |= byte << (8 * i) + return buf + + +class ByteCode(object): + """ + The decoded bytecode of a function, and related information. + """ + __slots__ = ('func_id', 'co_names', 'co_varnames', 'co_consts', + 'co_cellvars', 'co_freevars', 'table', 'labels') + + def __init__(self, func_id): + code = func_id.code + + labels = set(x + _FIXED_OFFSET for x in dis.findlabels(code.co_code)) + labels.add(0) + + # A map of {offset: ByteCodeInst} + table = OrderedDict(ByteCodeIter(code)) + self._compute_lineno(table, code) + + self.func_id = func_id + self.co_names = code.co_names + self.co_varnames = code.co_varnames + self.co_consts = code.co_consts + self.co_cellvars = code.co_cellvars + self.co_freevars = code.co_freevars + self.table = table + self.labels = sorted(labels) + + @classmethod + def _compute_lineno(cls, table, code): + """ + Compute the line numbers for all bytecode instructions. 
+ """ + for offset, lineno in dis.findlinestarts(code): + adj_offset = offset + _FIXED_OFFSET + if adj_offset in table: + table[adj_offset].lineno = lineno + # Assign unfilled lineno + # Start with first bytecode's lineno + known = table[_FIXED_OFFSET].lineno + for inst in table.values(): + if inst.lineno >= 0: + known = inst.lineno + else: + inst.lineno = known + return table + + def __iter__(self): + return iter(self.table.values()) + + def __getitem__(self, offset): + return self.table[offset] + + def __contains__(self, offset): + return offset in self.table + + def dump(self): + def label_marker(i): + if i[1].offset in self.labels: + return '>' + else: + return ' ' + + return '\n'.join('%s %10s\t%s' % ((label_marker(i),) + i) + for i in self.table.items()) + + @classmethod + def _compute_used_globals(cls, func, table, co_consts, co_names): + """ + Compute the globals used by the function with the given + bytecode table. + """ + d = {} + globs = func.__globals__ + builtins = globs.get('__builtins__', utils.builtins) + if isinstance(builtins, ModuleType): + builtins = builtins.__dict__ + # Look for LOAD_GLOBALs in the bytecode + for inst in table.values(): + if inst.opname == 'LOAD_GLOBAL': + name = co_names[inst.arg] + if name not in d: + try: + value = globs[name] + except KeyError: + value = builtins[name] + d[name] = value + # Add globals used by any nested code object + for co in co_consts: + if isinstance(co, CodeType): + subtable = OrderedDict(ByteCodeIter(co)) + d.update(cls._compute_used_globals(func, subtable, + co.co_consts, co.co_names)) + return d + + def get_used_globals(self): + """ + Get a {name: value} map of the globals used by this code + object and any nested code objects. + """ + return self._compute_used_globals(self.func_id.func, self.table, + self.co_consts, self.co_names) + + +class FunctionIdentity(serialize.ReduceMixin): + """ + A function's identity and metadata. 
+ + Note this typically represents a function whose bytecode is + being compiled, not necessarily the top-level user function + (the two might be distinct, e.g. in the `@generated_jit` case). + """ + _unique_ids = itertools.count(1) + + @classmethod + def from_function(cls, pyfunc): + """ + Create the FunctionIdentity of the given function. + """ + func = get_function_object(pyfunc) + code = get_code_object(func) + pysig = utils.pysignature(func) + if not code: + raise errors.ByteCodeSupportError( + "%s does not provide its bytecode" % func) + + try: + func_qualname = func.__qualname__ + except AttributeError: + func_qualname = func.__name__ + + self = cls() + self.func = func + self.func_qualname = func_qualname + self.func_name = func_qualname.split('.')[-1] + self.code = code + self.module = inspect.getmodule(func) + self.modname = (utils._dynamic_modname + if self.module is None + else self.module.__name__) + self.is_generator = inspect.isgeneratorfunction(func) + self.pysig = pysig + self.filename = code.co_filename + self.firstlineno = code.co_firstlineno + self.arg_count = len(pysig.parameters) + self.arg_names = list(pysig.parameters) + + # Even the same function definition can be compiled into + # several different function objects with distinct closure + # variables, so we make sure to disambiguate using an unique id. + uid = next(cls._unique_ids) + self.unique_name = '{}${}'.format(self.func_qualname, uid) + self.unique_id = uid + + return self + + def derive(self): + """Copy the object and increment the unique counter. 
+ """ + return self.from_function(self.func) + + def _reduce_states(self): + """ + NOTE: part of ReduceMixin protocol + """ + return dict(pyfunc=self.func) + + @classmethod + def _rebuild(cls, pyfunc): + """ + NOTE: part of ReduceMixin protocol + """ + return cls.from_function(pyfunc) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/byteflow.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/byteflow.py new file mode 100644 index 0000000000000000000000000000000000000000..19b59596767f0488b2b0c59b8a49df146cee528a --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/byteflow.py @@ -0,0 +1,1555 @@ +""" +Implement python 3.8+ bytecode analysis +""" + +from pprint import pformat +import logging +from collections import namedtuple, defaultdict, deque +from functools import total_ordering + +from numba.core.utils import UniqueDict, PYVERSION +from numba.core.controlflow import NEW_BLOCKERS, CFGraph +from numba.core.ir import Loc +from numba.core.errors import UnsupportedError + + +_logger = logging.getLogger(__name__) + + +_EXCEPT_STACK_OFFSET = 6 +_FINALLY_POP = _EXCEPT_STACK_OFFSET if PYVERSION >= (3, 8) else 1 +_NO_RAISE_OPS = frozenset({ + 'LOAD_CONST', +}) + + +@total_ordering +class BlockKind(object): + """Kinds of block to make related code safer than just `str`. 
+ """ + _members = frozenset({ + 'LOOP', + 'TRY', 'EXCEPT', 'FINALLY', + 'WITH', 'WITH_FINALLY', + }) + + def __init__(self, value): + assert value in self._members + self._value = value + + def __hash__(self): + return hash((type(self), self._value)) + + def __lt__(self, other): + if isinstance(other, BlockKind): + return self._value < other._value + else: + raise TypeError('cannot compare to {!r}'.format(type(other))) + + def __eq__(self, other): + if isinstance(other, BlockKind): + return self._value == other._value + else: + raise TypeError('cannot compare to {!r}'.format(type(other))) + + def __repr__(self): + return "BlockKind({})".format(self._value) + + +class _lazy_pformat(object): + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + def __str__(self): + return pformat(*self.args, **self.kwargs) + + +class Flow(object): + """Data+Control Flow analysis. + + Simulate execution to recover dataflow and controlflow information. + """ + def __init__(self, bytecode): + _logger.debug("bytecode dump:\n%s", bytecode.dump()) + self._bytecode = bytecode + self.block_infos = UniqueDict() + + def run(self): + """Run a trace over the bytecode over all reachable path. + + The trace starts at bytecode offset 0 and gathers stack and control- + flow information by partially interpreting each bytecode. + Each ``State`` instance in the trace corresponds to a basic-block. + The State instances forks when a jump instruction is encountered. + A newly forked state is then added to the list of pending states. + The trace ends when there are no more pending states. + """ + firststate = State(bytecode=self._bytecode, pc=0, nstack=0, + blockstack=()) + runner = TraceRunner(debug_filename=self._bytecode.func_id.filename) + runner.pending.append(firststate) + + # Enforce unique-ness on initial PC to avoid re-entering the PC with + # a different stack-depth. 
We don't know if such a case is ever + # possible, but no such case has been encountered in our tests. + first_encounter = UniqueDict() + # Loop over each pending state at a initial PC. + # Each state is tracing a basic block + while runner.pending: + _logger.debug("pending: %s", runner.pending) + state = runner.pending.popleft() + if state not in runner.finished: + _logger.debug("stack: %s", state._stack) + first_encounter[state.pc_initial] = state + # Loop over the state until it is terminated. + while True: + runner.dispatch(state) + # Terminated? + if state.has_terminated(): + break + elif (state.has_active_try() and + state.get_inst().opname not in _NO_RAISE_OPS): + # Is in a *try* block + state.fork(pc=state.get_inst().next) + tryblk = state.get_top_block('TRY') + state.pop_block_and_above(tryblk) + nstack = state.stack_depth + kwargs = {} + if nstack > tryblk['entry_stack']: + kwargs['npop'] = nstack - tryblk['entry_stack'] + handler = tryblk['handler'] + kwargs['npush'] = { + BlockKind('EXCEPT'): _EXCEPT_STACK_OFFSET, + BlockKind('FINALLY'): _FINALLY_POP + }[handler['kind']] + kwargs['extra_block'] = handler + state.fork(pc=tryblk['end'], **kwargs) + break + else: + state.advance_pc() + # Must the new PC be a new block? + if self._is_implicit_new_block(state): + # check if this is a with...as, abort if so + self._guard_with_as(state) + # else split + state.split_new_block() + break + _logger.debug("end state. 
edges=%s", state.outgoing_edges) + runner.finished.add(state) + out_states = state.get_outgoing_states() + runner.pending.extend(out_states) + + # Complete controlflow + self._build_cfg(runner.finished) + # Prune redundant PHI-nodes + self._prune_phis(runner) + # Post process + for state in sorted(runner.finished, key=lambda x: x.pc_initial): + self.block_infos[state.pc_initial] = si = adapt_state_infos(state) + _logger.debug("block_infos %s:\n%s", state, si) + + def _build_cfg(self, all_states): + graph = CFGraph() + for state in all_states: + b = state.pc_initial + graph.add_node(b) + for state in all_states: + for edge in state.outgoing_edges: + graph.add_edge(state.pc_initial, edge.pc, 0) + graph.set_entry_point(0) + graph.process() + self.cfgraph = graph + + def _prune_phis(self, runner): + # Find phis that are unused in the local block + _logger.debug("Prune PHIs".center(60, '-')) + + # Compute dataflow for used phis and propagate + + # 1. Get used-phis for each block + # Map block to used_phis + def get_used_phis_per_state(): + used_phis = defaultdict(set) + phi_set = set() + for state in runner.finished: + used = set(state._used_regs) + phis = set(state._phis) + used_phis[state] |= phis & used + phi_set |= phis + return used_phis, phi_set + + # Find use-defs + def find_use_defs(): + defmap = {} + phismap = defaultdict(set) + for state in runner.finished: + for phi, rhs in state._outgoing_phis.items(): + if rhs not in phi_set: + # Is a definition + defmap[phi] = state + phismap[phi].add((rhs, state)) + _logger.debug("defmap: %s", _lazy_pformat(defmap)) + _logger.debug("phismap: %s", _lazy_pformat(phismap)) + return defmap, phismap + + def propagate_phi_map(phismap): + """An iterative dataflow algorithm to find the definition + (the source) of each PHI node. 
+ """ + blacklist = defaultdict(set) + + while True: + changing = False + for phi, defsites in sorted(list(phismap.items())): + for rhs, state in sorted(list(defsites)): + if rhs in phi_set: + defsites |= phismap[rhs] + blacklist[phi].add((rhs, state)) + to_remove = blacklist[phi] + if to_remove & defsites: + defsites -= to_remove + changing = True + + _logger.debug("changing phismap: %s", _lazy_pformat(phismap)) + if not changing: + break + + def apply_changes(used_phis, phismap): + keep = {} + for state, used_set in used_phis.items(): + for phi in used_set: + keep[phi] = phismap[phi] + _logger.debug("keep phismap: %s", _lazy_pformat(keep)) + new_out = defaultdict(dict) + for phi in keep: + for rhs, state in keep[phi]: + new_out[state][phi] = rhs + + _logger.debug("new_out: %s", _lazy_pformat(new_out)) + for state in runner.finished: + state._outgoing_phis.clear() + state._outgoing_phis.update(new_out[state]) + + used_phis, phi_set = get_used_phis_per_state() + _logger.debug("Used_phis: %s", _lazy_pformat(used_phis)) + defmap, phismap = find_use_defs() + propagate_phi_map(phismap) + apply_changes(used_phis, phismap) + _logger.debug("DONE Prune PHIs".center(60, '-')) + + def _is_implicit_new_block(self, state): + inst = state.get_inst() + + if inst.offset in self._bytecode.labels: + return True + elif inst.opname in NEW_BLOCKERS: + return True + else: + return False + + def _guard_with_as(self, state): + """Checks if the next instruction after a SETUP_WITH is something other + than a POP_TOP, if it is something else it'll be some sort of store + which is not supported (this corresponds to `with CTXMGR as VAR(S)`).""" + current_inst = state.get_inst() + if current_inst.opname == "SETUP_WITH": + next_op = self._bytecode[current_inst.next].opname + if next_op != "POP_TOP": + msg = ("The 'with (context manager) as " + "(variable):' construct is not " + "supported.") + raise UnsupportedError(msg) + + +class TraceRunner(object): + """Trace runner contains the states for 
def op_FORMAT_VALUE(self, state, inst):
    """
    FORMAT_VALUE(flags): used for f-strings.

    Only ``flags == 0`` (no format spec) is handled: one value is popped,
    two temporaries are created (``strvar`` first, then ``res``), the
    instruction is recorded and ``res`` is pushed back on the stack.
    Any other flag value raises ``UnsupportedError``.
    https://docs.python.org/3/library/dis.html#opcode-FORMAT_VALUE
    """
    if inst.arg == 0:
        operand = state.pop()
        # Allocation order matters: temp names embed a running counter.
        str_tmp = state.make_temp()
        out = state.make_temp()
        state.append(inst, value=operand, res=out, strvar=str_tmp)
        state.push(out)
        return
    raise UnsupportedError(
        "format spec in f-strings not supported yet",
        loc=self.get_debug_loc(inst.lineno),
    )
def op_SLICE_1(self, state, inst):
    """
    TOS = TOS1[TOS:]

    Pops the start index (TOS) and then the sliced object (TOS1), creates
    four temporaries, records the instruction and pushes the result
    temporary.
    """
    # Tuple elements are evaluated left to right: TOS first, then TOS1.
    start, base = state.pop(), state.pop()
    # Temporaries are allocated in the order res, slicevar, indexvar,
    # nonevar because their names embed a running counter.
    res, slicevar, indexvar, nonevar = [state.make_temp() for _ in range(4)]
    operands = dict(
        base=base,
        start=start,
        res=res,
        slicevar=slicevar,
        indexvar=indexvar,
        nonevar=nonevar,
    )
    state.append(inst, **operands)
    state.push(res)
def op_SLICE_3(self, state, inst):
    """
    TOS = TOS2[TOS1:TOS]

    Pops the stop index (TOS), the start index (TOS1) and the sliced
    object (TOS2), creates three temporaries, records the instruction and
    pushes the result temporary.
    """
    # Evaluated left to right, i.e. TOS, TOS1, TOS2.
    stop, start, base = state.pop(), state.pop(), state.pop()
    # Allocation order: res, slicevar, indexvar.
    res, slicevar, indexvar = [state.make_temp() for _ in range(3)]
    operands = dict(
        base=base,
        start=start,
        stop=stop,
        res=res,
        slicevar=slicevar,
        indexvar=indexvar,
    )
    state.append(inst, **operands)
    state.push(res)
def op_BUILD_SLICE(self, state, inst):
    """
    slice(TOS1, TOS) or slice(TOS2, TOS1, TOS)

    ``inst.arg`` selects the two- or three-operand form; any other value
    raises ``Exception("unreachable")`` before the stack is touched.
    The operands are popped, two temporaries are created (``slicevar``
    first, then ``res``), the instruction is recorded and ``res`` is
    pushed.
    """
    argc = inst.arg
    if argc not in (2, 3):
        raise Exception("unreachable")
    # Operands come off the stack top-down: step (only when argc == 3),
    # then stop, then start.
    step = state.pop() if argc == 3 else None
    stop = state.pop()
    start = state.pop()
    slicevar = state.make_temp()
    res = state.make_temp()
    state.append(
        inst, start=start, stop=stop, step=step, res=res, slicevar=slicevar
    )
    state.push(res)
def op_RAISE_VARARGS(self, state, inst):
    """Handle ``raise`` with zero or one argument.

    * ``inst.arg == 0``: bare ``raise``.  If an EXCEPT or FINALLY block is
      on the block stack this is a re-raise and ``UnsupportedError`` is
      raised; otherwise the instruction is recorded with ``exc=None``.
    * ``inst.arg == 1``: the exception is popped from the stack.
    * anything else raises ``ValueError``.

    On success the instruction is recorded and the state is terminated.
    """
    # Both lookups are done up front, whatever the argument count is.
    except_blk = state.get_top_block("EXCEPT")
    finally_blk = state.get_top_block("FINALLY")
    inside_handler = except_blk is not None or finally_blk is not None

    nargs = inst.arg
    if nargs not in (0, 1):
        raise ValueError("Multiple argument raise is not supported.")
    if nargs == 0 and inside_handler:
        raise UnsupportedError(
            "The re-raising of an exception is not yet supported.",
            loc=self.get_debug_loc(inst.lineno),
        )
    exc = state.pop() if nargs == 1 else None
    state.append(inst, exc=exc)
    state.terminate()
op_END_FINALLY(self, state, inst): + blk = state.pop_block() + state.reset_stack(blk['entry_stack']) + + def op_POP_FINALLY(self, state, inst): + # we don't emulate the exact stack behavior + if inst.arg != 0: + msg = ('Unsupported use of a bytecode related to try..finally' + ' or a with-context') + raise UnsupportedError(msg, loc=self.get_debug_loc(inst.lineno)) + + def op_CALL_FINALLY(self, state, inst): + pass + + def op_WITH_CLEANUP_START(self, state, inst): + # we don't emulate the exact stack behavior + state.append(inst) + + def op_WITH_CLEANUP_FINISH(self, state, inst): + # we don't emulate the exact stack behavior + state.append(inst) + + def op_SETUP_LOOP(self, state, inst): + # NOTE: bytecode removed since py3.8 + state.push_block( + state.make_block( + kind='LOOP', + end=inst.get_jump_target(), + ) + ) + + def op_SETUP_WITH(self, state, inst): + cm = state.pop() # the context-manager + + yielded = state.make_temp() + exitfn = state.make_temp(prefix='setup_with_exitfn') + state.append(inst, contextmanager=cm, exitfn=exitfn) + + # py39 doesn't have with-finally + if PYVERSION < (3, 9): + state.push_block( + state.make_block( + kind='WITH_FINALLY', + end=inst.get_jump_target(), + ) + ) + + state.push(exitfn) + state.push(yielded) + + state.push_block( + state.make_block( + kind='WITH', + end=inst.get_jump_target(), + ) + ) + # Forces a new block + state.fork(pc=inst.next) + + def _setup_try(self, kind, state, next, end): + handler_block = state.make_block( + kind=kind, + end=None, + reset_stack=False, + ) + # Forces a new block + # Fork to the body of the finally + state.fork( + pc=next, + extra_block=state.make_block( + kind='TRY', + end=end, + reset_stack=False, + handler=handler_block, + ) + ) + + def op_SETUP_EXCEPT(self, state, inst): + # Opcode removed since py3.8 + state.append(inst) + self._setup_try( + 'EXCEPT', state, next=inst.next, end=inst.get_jump_target(), + ) + + def op_SETUP_FINALLY(self, state, inst): + state.append(inst) + 
def op_CALL_FUNCTION_KW(self, state, inst):
    """Record a call whose trailing arguments are passed by keyword.

    Stack layout, from the top: the tuple of keyword names, ``inst.arg``
    argument values, then the callable.  The result temporary is pushed
    after the instruction is recorded.
    """
    kw_names = state.pop()  # tuple of names
    # Arguments are popped last-to-first; restore call order afterwards.
    call_args = [state.pop() for _ in range(inst.arg)]
    call_args.reverse()
    callee = state.pop()

    result = state.make_temp()
    state.append(inst, func=callee, args=call_args, names=kw_names,
                 res=result)
    state.push(result)
def op_UNPACK_SEQUENCE(self, state, inst):
    """Unpack the iterable on top of the stack into ``inst.arg`` values.

    One temporary is created per unpacked element plus one ``tupleobj``
    temporary.  The element temporaries are pushed in reverse order, so
    the first element ends up on top of the stack.
    """
    source = state.pop()
    targets = []
    for _ in range(inst.arg):
        targets.append(state.make_temp())
    tuple_tmp = state.make_temp()
    state.append(inst, iterable=source, stores=targets, tupleobj=tuple_tmp)
    for name in targets[::-1]:
        state.push(name)
def op_BUILD_CONST_KEY_MAP(self, state, inst):
    """Record the construction of a dict from a key tuple and values.

    The keys object is on top of the stack, followed by ``inst.arg``
    values.  One temporary per key plus a result temporary are created;
    the result is pushed.
    """
    size = inst.arg
    key_tuple = state.pop()
    # Values are popped last-to-first; put them back in source order.
    map_values = [state.pop() for _ in range(size)]
    map_values.reverse()
    key_temps = [state.make_temp() for _ in range(size)]
    result = state.make_temp()
    state.append(inst, keys=key_tuple, keytmps=key_temps, values=map_values,
                 res=result)
    state.push(result)
def op_SET_UPDATE(self, state, inst):
    """Record an update of a set that stays on the stack.

    The value on top of the stack is popped; the target is then read,
    without being popped, ``inst.arg`` entries down from the new top.
    Two temporaries (``updatevar`` then ``res``) are created and the
    instruction is recorded.  Nothing is pushed.
    """
    operand = state.pop()
    # peek() must come after the pop: the depth is relative to the new top.
    container = state.peek(inst.arg)
    update_tmp = state.make_temp()
    result = state.make_temp()
    state.append(
        inst,
        target=container,
        value=operand,
        updatevar=update_tmp,
        res=result,
    )
inst.get_jump_target() + state.fork(pc=end, npop=2) + state.fork(pc=inst.next) + + def op_GEN_START(self, state, inst): + """Pops TOS. If TOS was not None, raises an exception. The kind + operand corresponds to the type of generator or coroutine and + determines the error message. The legal kinds are 0 for generator, + 1 for coroutine, and 2 for async generator. + + New in version 3.10. + """ + # no-op in Numba + pass + + def _unaryop(self, state, inst): + val = state.pop() + res = state.make_temp() + state.append(inst, value=val, res=res) + state.push(res) + + op_UNARY_NEGATIVE = _unaryop + op_UNARY_POSITIVE = _unaryop + op_UNARY_NOT = _unaryop + op_UNARY_INVERT = _unaryop + + def _binaryop(self, state, inst): + rhs = state.pop() + lhs = state.pop() + res = state.make_temp() + state.append(inst, lhs=lhs, rhs=rhs, res=res) + state.push(res) + + op_COMPARE_OP = _binaryop + op_IS_OP = _binaryop + op_CONTAINS_OP = _binaryop + + op_INPLACE_ADD = _binaryop + op_INPLACE_SUBTRACT = _binaryop + op_INPLACE_MULTIPLY = _binaryop + op_INPLACE_DIVIDE = _binaryop + op_INPLACE_TRUE_DIVIDE = _binaryop + op_INPLACE_FLOOR_DIVIDE = _binaryop + op_INPLACE_MODULO = _binaryop + op_INPLACE_POWER = _binaryop + op_INPLACE_MATRIX_MULTIPLY = _binaryop + + op_INPLACE_LSHIFT = _binaryop + op_INPLACE_RSHIFT = _binaryop + op_INPLACE_AND = _binaryop + op_INPLACE_OR = _binaryop + op_INPLACE_XOR = _binaryop + + op_BINARY_ADD = _binaryop + op_BINARY_SUBTRACT = _binaryop + op_BINARY_MULTIPLY = _binaryop + op_BINARY_DIVIDE = _binaryop + op_BINARY_TRUE_DIVIDE = _binaryop + op_BINARY_FLOOR_DIVIDE = _binaryop + op_BINARY_MODULO = _binaryop + op_BINARY_POWER = _binaryop + op_BINARY_MATRIX_MULTIPLY = _binaryop + + op_BINARY_LSHIFT = _binaryop + op_BINARY_RSHIFT = _binaryop + op_BINARY_AND = _binaryop + op_BINARY_OR = _binaryop + op_BINARY_XOR = _binaryop + + def op_MAKE_FUNCTION(self, state, inst, MAKE_CLOSURE=False): + name = state.pop() + code = state.pop() + closure = annotations = kwdefaults = defaults 
def op_JUMP_IF_NOT_EXC_MATCH(self, state, inst):
    """Record an exception-match test and fork to both successors.

    A ``predicate`` temporary is created before the two operands are
    popped (TOS, then TOS1).  After recording the instruction the state
    forks first to the fall-through instruction and then to the jump
    target.
    """
    predicate = state.make_temp("predicate")
    top, below = state.pop(), state.pop()
    state.append(inst, pred=predicate, tos=top, tos1=below)
    fallthrough_pc = inst.next
    state.fork(pc=fallthrough_pc)
    jump_pc = inst.get_jump_target()
    state.fork(pc=jump_pc)
@total_ordering
class State(object):
    """State of the trace.

    A state holds what is needed to trace one run of bytecode: the
    current pc, a simulated value stack, the syntactic block stack, the
    temporaries created so far, the recorded instructions and the outgoing
    edges.  Two states are equal when they share the same initial pc and
    the same initial stack depth (see ``get_identity``).
    """
    def __init__(self, bytecode, pc, nstack, blockstack):
        """
        Parameters
        ----------
        bytecode : numba.bytecode.ByteCode
            function bytecode
        pc : int
            program counter
        nstack : int
            stackdepth at entry
        blockstack : Sequence[Dict]
            A sequence of dictionary denoting entries on the blockstack.
        """
        self._bytecode = bytecode
        self._pc_initial = pc
        self._pc = pc
        self._nstack_initial = nstack
        self._stack = []
        self._blockstack_initial = tuple(blockstack)
        self._blockstack = list(blockstack)
        self._temp_registers = []
        self._insts = []
        self._outedges = []
        self._terminated = False
        self._phis = {}
        self._outgoing_phis = UniqueDict()
        self._used_regs = set()
        # Every value already on the stack at entry is represented by a
        # fresh "phi" temporary; ``_phis`` maps its name to its stack slot.
        for i in range(nstack):
            phi = self.make_temp("phi")
            self._phis[phi] = i
            self.push(phi)

    def __repr__(self):
        return "State(pc_initial={} nstack_initial={})".format(
            self._pc_initial, self._nstack_initial
        )

    def get_identity(self):
        """Return the ``(pc_initial, nstack_initial)`` pair used for
        hashing, equality and ordering.
        """
        return (self._pc_initial, self._nstack_initial)

    def __hash__(self):
        return hash(self.get_identity())

    def __lt__(self, other):
        # Let Python try the reflected operation (and eventually raise
        # TypeError) instead of failing with AttributeError.
        if not isinstance(other, State):
            return NotImplemented
        return self.get_identity() < other.get_identity()

    def __eq__(self, other):
        # Comparing with a foreign object is simply "not equal".
        if not isinstance(other, State):
            return NotImplemented
        return self.get_identity() == other.get_identity()

    @property
    def pc_initial(self):
        """The starting bytecode offset of this State.
        The PC given to the constructor.
        """
        return self._pc_initial

    @property
    def instructions(self):
        """The list of instructions information as a 2-tuple of
        ``(pc : int, register_map : Dict)``
        """
        return self._insts

    @property
    def outgoing_edges(self):
        """The list of outgoing edges.

        Returns
        -------
        edges : List[Edge]
            The edges recorded by ``fork``.
        """
        return self._outedges

    @property
    def outgoing_phis(self):
        """The dictionary of outgoing phi nodes.

        The keys are the name of the PHI nodes.
        The values are the entries of this state's outgoing stack that
        flow into them (filled in by ``get_outgoing_states``).
        """
        return self._outgoing_phis

    @property
    def blockstack_initial(self):
        """The blockstack as it was at construction, stored as a tuple.
        """
        return self._blockstack_initial

    @property
    def stack_depth(self):
        """The current size of the stack

        Returns
        -------
        res : int
        """
        return len(self._stack)

    def find_initial_try_block(self):
        """Find the initial *try* block.

        The initial blockstack is searched from the top.  Returns ``None``
        when it holds no TRY block.
        """
        for blk in reversed(self._blockstack_initial):
            if blk['kind'] == BlockKind('TRY'):
                return blk

    def has_terminated(self):
        """Return True once ``terminate`` (or ``fork``) has been called."""
        return self._terminated

    def get_inst(self):
        """Return the bytecode instruction at the current pc."""
        return self._bytecode[self._pc]

    def advance_pc(self):
        """Move the pc to the instruction following the current one."""
        inst = self.get_inst()
        self._pc = inst.next

    def make_temp(self, prefix=""):
        """Create and register a new temporary name.

        Without a prefix the name is ``$<pc><opname>.<n>``; with a prefix
        it is ``$<prefix><pc>.<n>``, where ``<n>`` is the number of
        temporaries created by this state so far.
        """
        if not prefix:
            name = "${prefix}{offset}{opname}.{tempct}".format(
                prefix=prefix,
                offset=self._pc,
                opname=self.get_inst().opname.lower(),
                tempct=len(self._temp_registers),
            )
        else:
            name = "${prefix}{offset}.{tempct}".format(
                prefix=prefix,
                offset=self._pc,
                tempct=len(self._temp_registers),
            )

        self._temp_registers.append(name)
        return name

    def append(self, inst, **kwargs):
        """Append new inst

        The instruction is stored as ``(inst.offset, kwargs)`` and every
        register name found in *kwargs* is added to the used-register set.
        """
        self._insts.append((inst.offset, kwargs))
        self._used_regs |= set(_flatten_inst_regs(kwargs.values()))

    def get_tos(self):
        """Return the top of the stack without popping it."""
        return self.peek(1)

    def peek(self, k):
        """Return the k'th element from the top of the stack
        (``k == 1`` is the top).
        """
        return self._stack[-k]

    def push(self, item):
        """Push to stack"""
        self._stack.append(item)

    def pop(self):
        """Pop the stack"""
        return self._stack.pop()

    def push_block(self, synblk):
        """Push a block to blockstack
        """
        assert 'stack_depth' in synblk
        self._blockstack.append(synblk)

    def reset_stack(self, depth):
        """Reset the stack to the given stack depth.
        Returning the popped items.

        NOTE: when *depth* is ``None`` the stack is left unchanged and the
        returned list is a copy of the whole stack, because both slices
        then cover everything.
        """
        self._stack, popped = self._stack[:depth], self._stack[depth:]
        return popped

    def make_block(self, kind, end, reset_stack=True, handler=None):
        """Make a new block

        Returns a dict with the keys ``kind``, ``end``, ``entry_stack``,
        ``stack_depth`` and ``handler``.  ``stack_depth`` is the current
        stack size when *reset_stack* is true and ``None`` otherwise.
        """
        d = {
            'kind': BlockKind(kind),
            'end': end,
            'entry_stack': len(self._stack),
        }
        if reset_stack:
            d['stack_depth'] = len(self._stack)
        else:
            d['stack_depth'] = None
        d['handler'] = handler
        return d

    def pop_block(self):
        """Pop a block and unwind the stack
        """
        b = self._blockstack.pop()
        self.reset_stack(b['stack_depth'])
        return b

    def pop_block_and_above(self, blk):
        """Find *blk* in the blockstack and remove it and all blocks above it
        from the stack.
        """
        idx = self._blockstack.index(blk)
        assert 0 <= idx < len(self._blockstack)
        self._blockstack = self._blockstack[:idx]

    def get_top_block(self, kind):
        """Find the first block that matches *kind*, searching from the
        top of the blockstack.  Returns ``None`` when there is none.
        """
        kind = BlockKind(kind)
        for bs in reversed(self._blockstack):
            if bs['kind'] == kind:
                return bs

    def has_active_try(self):
        """Returns a boolean indicating if a *try* block is anywhere on
        the blockstack.
        """
        return self.get_top_block('TRY') is not None

    def get_varname(self, inst):
        """Get referenced variable name from the oparg
        """
        return self._bytecode.co_varnames[inst.arg]

    def terminate(self):
        """Mark block as terminated
        """
        self._terminated = True

    def fork(self, pc, npop=0, npush=0, extra_block=None):
        """Fork the state

        Records an outgoing edge to *pc* and terminates this state.

        Parameters
        ----------
        pc : int
            Target of the edge.
        npop : int
            Number of entries dropped from the top of the edge's stack.
        npush : int
            Number of fresh temporaries pushed onto the edge's stack.
        extra_block : dict or None
            Block appended to the edge's blockstack when given.
        """
        # Handle changes on the stack
        stack = list(self._stack)
        if npop:
            assert 0 <= npop <= len(self._stack)
            nstack = len(self._stack) - npop
            stack = stack[:nstack]
        if npush:
            assert 0 <= npush
            for i in range(npush):
                stack.append(self.make_temp())
        # Handle changes on the blockstack
        blockstack = list(self._blockstack)
        if extra_block:
            blockstack.append(extra_block)
        self._outedges.append(Edge(
            pc=pc, stack=tuple(stack), npush=npush,
            blockstack=tuple(blockstack),
        ))
        self.terminate()

    def split_new_block(self):
        """Split the state

        Forks to the current pc, which ends this state there.
        """
        self.fork(pc=self._pc)

    def get_outgoing_states(self):
        """Get states for each outgoing edges

        Also fills ``outgoing_phis``: each phi of a new state is mapped to
        the entry of the edge's stack at the phi's slot.
        """
        # Should only call once
        assert not self._outgoing_phis
        ret = []
        for edge in self._outedges:
            state = State(bytecode=self._bytecode, pc=edge.pc,
                          nstack=len(edge.stack), blockstack=edge.blockstack)
            ret.append(state)
            # Map outgoing_phis
            for phi, i in state._phis.items():
                self._outgoing_phis[phi] = edge.stack[i]
        return ret

    def get_outgoing_edgepushed(self):
        """
        Returns
        -------
        Dict[int, tuple]
            where keys are the PC
            values are the edge-pushed stack values, i.e. the last
            ``npush`` entries of the edge's stack (empty when ``npush``
            is 0)
        """
        # ``stack[-0:]`` would be the whole stack, so an edge that pushed
        # nothing is handled explicitly.
        return {edge.pc: (tuple(edge.stack[-edge.npush:])
                          if edge.npush else ())
                for edge in self._outedges}
class AdaptCFBlock(object):
    """A block holding its start offset and the offsets of its
    instructions.

    Attributes
    ----------
    offset : int
        The offset passed to the constructor.
    body : tuple
        First element of every ``(offset, registers)`` pair found in
        ``blockinfo.insts``, in order.
    """
    def __init__(self, blockinfo, offset):
        self.offset = offset
        inst_offsets = [pc for pc, _regs in blockinfo.insts]
        self.body = tuple(inst_offsets)
class _Cache(metaclass=ABCMeta):
    """
    Abstract interface shared by every cache implementation in this module.

    Concrete subclasses must provide all of the members below; the class
    cannot be instantiated otherwise.
    """

    # ``abc.abstractproperty`` is deprecated since Python 3.3; stacking
    # ``property`` on top of ``abstractmethod`` is the supported spelling.
    @property
    @abstractmethod
    def cache_path(self):
        """
        The base filesystem path of this cache (for example its root folder).
        """

    @abstractmethod
    def load_overload(self, sig, target_context):
        """
        Load an overload for the given signature using the target context.
        The saved object must be returned if successful, None if not found
        in the cache.
        """

    @abstractmethod
    def save_overload(self, sig, data):
        """
        Save the overload for the given signature.
        """

    @abstractmethod
    def enable(self):
        """
        Enable the cache.
        """

    @abstractmethod
    def disable(self):
        """
        Disable the cache.
        """

    @abstractmethod
    def flush(self):
        """
        Flush the cache.
        """
class _SourceFileBackedLocatorMixin(object):
    """
    Mixin for cache locators whose function lives in a known Python source
    file.

    The host class is expected to set ``_py_file`` and ``_lineno`` and to
    provide ``ensure_cache_path()``.
    """

    def get_source_stamp(self):
        """Return ``(mtime, size)`` of the file backing the function.

        When ``sys.frozen`` is set, the running executable is inspected
        instead of ``self._py_file``.
        """
        is_frozen = getattr(sys, 'frozen', False)
        stamped_file = sys.executable if is_frozen else self._py_file
        info = os.stat(stamped_file)
        # Size is included next to the timestamp because some filesystems
        # only offer one-second timestamp granularity.
        return info.st_mtime, info.st_size

    def get_disambiguator(self):
        """Return the stored first line number as a string."""
        return str(self._lineno)

    @classmethod
    def from_function(cls, py_func, py_file):
        """Build a locator for *py_func*, or return None when unusable.

        None is returned when *py_file* does not exist on disk or when the
        cache directory cannot be created or written to.
        """
        if os.path.exists(py_file):
            locator = cls(py_func, py_file)
            try:
                locator.ensure_cache_path()
            except OSError:
                # Cannot ensure the cache directory exists or is writable
                return None
            else:
                return locator
        # The path is probably a placeholder rather than a real file
        return None
+ """ + + def __init__(self, py_func, py_file): + self._py_file = py_file + self._lineno = py_func.__code__.co_firstlineno + appdirs = AppDirs(appname="numba", appauthor=False) + cache_dir = appdirs.user_cache_dir + cache_subpath = self.get_suitable_cache_subpath(py_file) + self._cache_path = os.path.join(cache_dir, cache_subpath) + + def get_cache_path(self): + return self._cache_path + + @classmethod + def from_function(cls, py_func, py_file): + if not (os.path.exists(py_file) or getattr(sys, 'frozen', False)): + # Perhaps a placeholder (e.g. "") + # stop function exit if frozen, since it uses a temp placeholder + return + self = cls(py_func, py_file) + try: + self.ensure_cache_path() + except OSError: + # Cannot ensure the cache directory exists or is writable + return + return self + + +class _IPythonCacheLocator(_CacheLocator): + """ + A locator for functions entered at the IPython prompt (notebook or other). + """ + + def __init__(self, py_func, py_file): + self._py_file = py_file + # Note IPython enhances the linecache module to be able to + # inspect source code of functions defined on the interactive prompt. + source = inspect.getsource(py_func) + if isinstance(source, bytes): + self._bytes_source = source + else: + self._bytes_source = source.encode('utf-8') + + def get_cache_path(self): + # We could also use jupyter_core.paths.jupyter_runtime_dir() + # In both cases this is a user-wide directory, so we need to + # be careful when disambiguating if we don't want too many + # conflicts (see below). + try: + from IPython.paths import get_ipython_cache_dir + except ImportError: + # older IPython version + from IPython.utils.path import get_ipython_cache_dir + return os.path.join(get_ipython_cache_dir(), 'numba_cache') + + def get_source_stamp(self): + return hashlib.sha256(self._bytes_source).hexdigest() + + def get_disambiguator(self): + # Heuristic: we don't want too many variants being saved, but + # we don't want similar named functions (e.g. 
"f") to compete + # for the cache, so we hash the first two lines of the function + # source (usually this will be the @jit decorator + the function + # signature). + firstlines = b''.join(self._bytes_source.splitlines(True)[:2]) + return hashlib.sha256(firstlines).hexdigest()[:10] + + @classmethod + def from_function(cls, py_func, py_file): + if not ( + py_file.startswith("' can appear in the qualname (e.g. '') but + # are forbidden in Windows filenames + fixed_fullname = fullname.replace('<', '').replace('>', '') + fmt = '%s-%s.py%d%d%s' + return fmt % (fixed_fullname, self.locator.get_disambiguator(), + sys.version_info[0], sys.version_info[1], abiflags) + + @property + def filename_base(self): + return self._filename_base + + @property + def locator(self): + return self._locator + + @abstractmethod + def reduce(self, data): + "Returns the serialized form the data" + pass + + @abstractmethod + def rebuild(self, target_context, reduced_data): + "Returns the de-serialized form of the *reduced_data*" + pass + + @abstractmethod + def check_cachable(self, data): + "Returns True if the given data is cachable; otherwise, returns False." + pass + + +class CompileResultCacheImpl(CacheImpl): + """ + Implements the logic to cache CompileResult objects. + """ + + def reduce(self, cres): + """ + Returns a serialized CompileResult + """ + return cres._reduce() + + def rebuild(self, target_context, payload): + """ + Returns the unserialized CompileResult + """ + return compiler.CompileResult._rebuild(target_context, *payload) + + def check_cachable(self, cres): + """ + Check cachability of the given compile result. 
class IndexDataCacheFile(object):
    """
    Implements the logic for the index file and data file used by a cache.

    The index file ("<filename_base>.nbi") starts with a pickled version
    string, followed by the serialized ``(source_stamp, overloads)`` pair,
    where *overloads* maps keys to data file names.  Every data file
    ("<filename_base>.<number>.nbc") stores one serialized entry.

    NOTE: both kinds of files are read back with ``pickle``; the cache
    directory is therefore assumed to be trusted.
    """
    def __init__(self, cache_path, filename_base, source_stamp):
        self._cache_path = cache_path
        self._index_name = '%s.nbi' % (filename_base,)
        self._index_path = os.path.join(self._cache_path, self._index_name)
        self._data_name_pattern = '%s.{number:d}.nbc' % (filename_base,)
        self._source_stamp = source_stamp
        self._version = numba.__version__

    def flush(self):
        """Overwrite the index with an empty mapping."""
        self._save_index({})

    def save(self, key, data):
        """
        Save a new cache entry with *key* and *data*.
        """
        overloads = self._load_index()
        try:
            # If key already exists, we will overwrite the file
            data_name = overloads[key]
        except KeyError:
            # Find an available name for the data file
            existing = set(overloads.values())
            for i in itertools.count(1):
                data_name = self._data_name(i)
                if data_name not in existing:
                    break
            overloads[key] = data_name
            self._save_index(overloads)
        self._save_data(data_name, data)

    def load(self, key):
        """
        Load a cache entry with *key*.

        Returns None when the key is not indexed or when the data file
        cannot be opened.
        """
        overloads = self._load_index()
        data_name = overloads.get(key)
        if data_name is None:
            return
        try:
            return self._load_data(data_name)
        except OSError:
            # File could have been removed while the index still refers it.
            return

    def _load_index(self):
        """
        Load the cache index and return it as a dictionary (possibly
        empty if cache is empty or obsolete).
        """
        try:
            with open(self._index_path, "rb") as f:
                version = pickle.load(f)
                data = f.read()
        except FileNotFoundError:
            # Index doesn't exist yet?
            return {}
        if version != self._version:
            # This is another version.  Avoid trying to unpickling the
            # rest of the stream, as that may fail.
            return {}
        stamp, overloads = pickle.loads(data)
        _cache_log("[cache] index loaded from %r", self._index_path)
        if stamp != self._source_stamp:
            # Cache is not fresh.  Stale data files will be eventually
            # overwritten, since they are numbered in incrementing order.
            return {}
        else:
            return overloads

    def _save_index(self, overloads):
        """Write the version header followed by the stamped *overloads*."""
        data = self._source_stamp, overloads
        data = self._dump(data)
        with self._open_for_write(self._index_path) as f:
            pickle.dump(self._version, f, protocol=-1)
            f.write(data)
        _cache_log("[cache] index saved to %r", self._index_path)

    def _load_data(self, name):
        """Read and unpickle the data file called *name*."""
        path = self._data_path(name)
        with open(path, "rb") as f:
            data = f.read()
        tup = pickle.loads(data)
        _cache_log("[cache] data loaded from %r", path)
        return tup

    def _save_data(self, name, data):
        """Serialize *data* into the data file called *name*."""
        data = self._dump(data)
        path = self._data_path(name)
        with self._open_for_write(path) as f:
            f.write(data)
        _cache_log("[cache] data saved to %r", path)

    def _data_name(self, number):
        """Return the data file name for entry *number*."""
        return self._data_name_pattern.format(number=number)

    def _data_path(self, name):
        """Return the path of data file *name* inside the cache directory."""
        return os.path.join(self._cache_path, name)

    def _dump(self, obj):
        """Serialize *obj* with the module-level ``dumps`` helper."""
        return dumps(obj)

    @contextlib.contextmanager
    def _open_for_write(self, filepath):
        """
        Open *filepath* for writing in a race condition-free way (hopefully).
        uuid4 is used to try and avoid name collisions on a shared filesystem.

        The content is written to a temporary sibling file which replaces
        *filepath* only once the ``with`` body has finished without error.
        """
        uid = uuid.uuid4().hex[:16]  # avoid long paths
        tmpname = '%s.tmp.%s' % (filepath, uid)
        try:
            with open(tmpname, "wb") as f:
                yield f
            os.replace(tmpname, filepath)
        except BaseException:
            # In case of any failure (including KeyboardInterrupt, which is
            # not an ``Exception``), remove the dangling tmp file
            try:
                os.unlink(tmpname)
            except OSError:
                pass
            raise
+ + There is one data file ("function_name-.pyXY..nbc") + per function, function signature, target architecture and Python version. + + Separate index and data files per Python version avoid pickle + compatibility problems. + + Note: + This contains the driver logic only. The core logic is provided + by a subclass of ``CacheImpl`` specified as *_impl_class* in the subclass. + """ + + # The following class variables must be overridden by subclass. + _impl_class = None + + def __init__(self, py_func): + self._name = repr(py_func) + self._py_func = py_func + self._impl = self._impl_class(py_func) + self._cache_path = self._impl.locator.get_cache_path() + # This may be a bit strict but avoids us maintaining a magic number + source_stamp = self._impl.locator.get_source_stamp() + filename_base = self._impl.filename_base + self._cache_file = IndexDataCacheFile(cache_path=self._cache_path, + filename_base=filename_base, + source_stamp=source_stamp) + self.enable() + + def __repr__(self): + return "<%s py_func=%r>" % (self.__class__.__name__, self._name) + + @property + def cache_path(self): + return self._cache_path + + def enable(self): + self._enabled = True + + def disable(self): + self._enabled = False + + def flush(self): + self._cache_file.flush() + + def load_overload(self, sig, target_context): + """ + Load and recreate the cached object for the given signature, + using the *target_context*. 
+ """ + # Refresh the context to ensure it is initialized + target_context.refresh() + with self._guard_against_spurious_io_errors(): + return self._load_overload(sig, target_context) + # None returned if the `with` block swallows an exception + + def _load_overload(self, sig, target_context): + if not self._enabled: + return + key = self._index_key(sig, target_context.codegen()) + data = self._cache_file.load(key) + if data is not None: + data = self._impl.rebuild(target_context, data) + return data + + def save_overload(self, sig, data): + """ + Save the data for the given signature in the cache. + """ + with self._guard_against_spurious_io_errors(): + self._save_overload(sig, data) + + def _save_overload(self, sig, data): + if not self._enabled: + return + if not self._impl.check_cachable(data): + return + self._impl.locator.ensure_cache_path() + key = self._index_key(sig, data.codegen) + data = self._impl.reduce(data) + self._cache_file.save(key, data) + + @contextlib.contextmanager + def _guard_against_spurious_io_errors(self): + if os.name == 'nt': + # Guard against permission errors due to accessing the file + # from several processes (see #2028) + try: + yield + except OSError as e: + if e.errno != errno.EACCES: + raise + else: + # No such conditions under non-Windows OSes + yield + + def _index_key(self, sig, codegen): + """ + Compute index key for the given signature and codegen. + It includes a description of the OS, target architecture and hashes of + the bytecode for the function and, if the function has a __closure__, + a hash of the cell_contents. + """ + codebytes = self._py_func.__code__.co_code + if self._py_func.__closure__ is not None: + cvars = tuple([x.cell_contents for x in self._py_func.__closure__]) + # Note: cloudpickle serializes a function differently depending + # on how the process is launched; e.g. 
def make_library_cache(prefix):
    """
    Create a Cache class for additional compilation features to cache their
    result for reuse.  The cache is saved in filename pattern like
    in ``FunctionCache`` but with additional *prefix* as specified.

    Each *prefix* may be used only once per process: it is recorded in the
    module-level ``_lib_cache_prefixes`` set, and requesting an already
    recorded prefix trips the assertion below.

    Returns the newly created ``LibraryCache`` class (not an instance).
    """
    # avoid cache prefix reuse
    # NOTE: being an ``assert``, this check is skipped when Python runs
    # with -O.
    assert prefix not in _lib_cache_prefixes
    _lib_cache_prefixes.add(prefix)

    # Implementation class whose file names carry the requested prefix
    class CustomCodeLibraryCacheImpl(CodeLibraryCacheImpl):
        _filename_prefix = prefix

    class LibraryCache(Cache):
        """
        Implements Cache that saves and loads CodeLibrary objects for additional
        feature for the specified python function.
        """
        _impl_class = CustomCodeLibraryCacheImpl

    return LibraryCache
class BaseCallConv(object):
    """
    Common base of the calling conventions defined in this module.

    It relies on hooks that are not defined here and must be supplied by
    subclasses: ``return_value``, ``_return_errcode_raw``,
    ``_make_call_helper`` and ``get_arguments``.
    """

    def __init__(self, context):
        # Target context used for casts, data models and argument packing
        self.context = context

    def return_optional_value(self, builder, retty, valty, value):
        """
        Emit the return of *value* (of type *valty*) from a function whose
        return type *retty* is an optional.

        Raises NotImplementedError when *valty* is an optional type that
        differs from *retty*.
        """
        if valty == types.none:
            # Value is none
            self.return_native_none(builder)

        elif retty == valty:
            # Value is an optional, need a runtime switch
            optval = self.context.make_helper(builder, retty, value=value)

            validbit = cgutils.as_bool_bit(builder, optval.valid)
            with builder.if_then(validbit):
                retval = self.context.get_return_value(builder, retty.type,
                                                       optval.data)
                self.return_value(builder, retval)

            # Emitted after the conditional block: the "none" return for
            # the case where the valid bit is not set
            self.return_native_none(builder)

        elif not isinstance(valty, types.Optional):
            # Value is not an optional, need a cast
            if valty != retty.type:
                value = self.context.cast(builder, value, fromty=valty,
                                          toty=retty.type)
            retval = self.context.get_return_value(builder, retty.type, value)
            self.return_value(builder, retval)

        else:
            raise NotImplementedError("returning {0} for {1}".format(valty,
                                                                     retty))

    def return_native_none(self, builder):
        """Emit a return with the RETCODE_NONE code."""
        self._return_errcode_raw(builder, RETCODE_NONE)

    def return_exc(self, builder):
        """Emit a return with the RETCODE_EXC code, marked as a raise."""
        self._return_errcode_raw(builder, RETCODE_EXC, mark_exc=True)

    def return_stop_iteration(self, builder):
        """Emit a return with the RETCODE_STOPIT code."""
        self._return_errcode_raw(builder, RETCODE_STOPIT)

    def get_return_type(self, ty):
        """
        Get the actual type of the return argument for Numba type *ty*.
        """
        restype = self.context.data_model_manager[ty].get_return_type()
        return restype.as_pointer()

    def init_call_helper(self, builder):
        """
        Initialize and return a call helper object for the given builder.
        """
        ch = self._make_call_helper(builder)
        # The double-underscore name is mangled by Python, so the helper is
        # stored on the builder as ``_BaseCallConv__call_helper``
        builder.__call_helper = ch
        return ch

    def _get_call_helper(self, builder):
        """Return the helper stored by ``init_call_helper`` on *builder*."""
        return builder.__call_helper

    def raise_error(self, builder, api, status):
        """
        Given a non-ok *status*, raise the corresponding Python exception.
        """
        # Every case below ends with a branch to this common exit block
        bbend = builder.function.append_basic_block()

        with builder.if_then(status.is_user_exc):
            # Unserialize user exception.
            # Make sure another error may not interfere.
            api.err_clear()
            exc = api.unserialize(status.excinfoptr)
            with cgutils.if_likely(builder,
                                   cgutils.is_not_null(builder, exc)):
                api.raise_object(exc)  # steals ref
            builder.branch(bbend)

        with builder.if_then(status.is_stop_iteration):
            api.err_set_none("PyExc_StopIteration")
            builder.branch(bbend)

        with builder.if_then(status.is_python_exc):
            # Error already raised => nothing to do
            builder.branch(bbend)

        # Fallback when none of the status flags above matched
        api.err_set_string("PyExc_SystemError",
                           "unknown error when calling native function")
        builder.branch(bbend)

        # Code emitted by the caller afterwards continues in the exit block
        builder.position_at_end(bbend)

    def decode_arguments(self, builder, argtypes, func):
        """
        Get the decoded (unpacked) Python arguments with *argtypes*
        from LLVM function *func*.  A tuple of LLVM values is returned.
        """
        raw_args = self.get_arguments(func)
        arginfo = self._get_arg_packer(argtypes)
        return arginfo.from_arguments(builder, raw_args)

    def _get_arg_packer(self, argtypes):
        """
        Get an argument packer for the given argument types.
        """
        return self.context.get_arg_packer(argtypes)
_return_errcode_raw(self, builder, code, mark_exc=False): + if isinstance(code, int): + code = _const_int(code) + builder.ret(code) + + def _get_return_status(self, builder, code): + """ + Given a return *code*, get a Status instance. + """ + norm = builder.icmp_signed('==', code, RETCODE_OK) + none = builder.icmp_signed('==', code, RETCODE_NONE) + ok = builder.or_(norm, none) + err = builder.not_(ok) + exc = builder.icmp_signed('==', code, RETCODE_EXC) + is_stop_iteration = builder.icmp_signed('==', code, RETCODE_STOPIT) + is_user_exc = builder.icmp_signed('>=', code, RETCODE_USEREXC) + + status = Status(code=code, + is_ok=ok, + is_error=err, + is_python_exc=exc, + is_none=none, + is_user_exc=is_user_exc, + is_stop_iteration=is_stop_iteration, + excinfoptr=None) + return status + + def get_function_type(self, restype, argtypes): + """ + Get the implemented Function type for *restype* and *argtypes*. + """ + arginfo = self._get_arg_packer(argtypes) + argtypes = list(arginfo.argument_types) + resptr = self.get_return_type(restype) + fnty = ir.FunctionType(errcode_t, [resptr] + argtypes) + return fnty + + def decorate_function(self, fn, args, fe_argtypes, noalias=False): + """ + Set names and attributes of function arguments. + """ + assert not noalias + arginfo = self._get_arg_packer(fe_argtypes) + arginfo.assign_names(self.get_arguments(fn), + ['arg.' + a for a in args]) + fn.args[0].name = ".ret" + return fn + + def get_arguments(self, func): + """ + Get the Python-level arguments of LLVM *func*. + """ + return func.args[1:] + + def call_function(self, builder, callee, resty, argtys, args): + """ + Call the Numba-compiled *callee*. 
class _MinimalCallHelper(object):
    """
    Call helper used by the "minimal" calling convention.

    Each user exception is registered under an integer code; the caller
    later maps a returned code back to the exception through this object.
    """

    def __init__(self):
        # exception id -> (exception type, exception args, location info)
        self.exceptions = {}

    def _add_exception(self, exc, exc_args, locinfo):
        """
        Register a user exception and return the integer id assigned to it.

        Parameters
        ----------
        exc :
            exception type
        exc_args : None or tuple
            exception args
        locinfo : tuple
            location information
        """
        new_id = FIRST_USEREXC + len(self.exceptions)
        self.exceptions[new_id] = (exc, exc_args, locinfo)
        return new_id

    def get_exception(self, exc_id):
        """
        Look up a user exception.  Returns a tuple of
        (exception type, exception args, location information).

        An id that was never registered yields a ``SystemError`` entry
        with an "unknown error" message and no location information.

        Parameters
        ----------
        exc_id : integer
            The ID of the exception to look up
        """
        registry = self.exceptions
        if exc_id in registry:
            return registry[exc_id]
        fallback_args = ("unknown error %d in native function" % exc_id,)
        return SystemError, fallback_args, None
+ """ + _status_ids = itertools.count(1) + + def _make_call_helper(self, builder): + return None + + def return_value(self, builder, retval): + retptr = self._get_return_argument(builder.function) + assert retval.type == retptr.type.pointee, \ + (str(retval.type), str(retptr.type.pointee)) + builder.store(retval, retptr) + self._return_errcode_raw(builder, RETCODE_OK) + + def set_static_user_exc(self, builder, exc, exc_args=None, loc=None, + func_name=None): + if exc is not None and not issubclass(exc, BaseException): + raise TypeError("exc should be None or exception class, got %r" + % (exc,)) + if exc_args is not None and not isinstance(exc_args, tuple): + raise TypeError("exc_args should be None or tuple, got %r" + % (exc_args,)) + # None is indicative of no args, set the exc_args to an empty tuple + # as PyObject_CallObject(exc, exc_args) requires the second argument to + # be a tuple (or nullptr, but doing this makes it consistent) + if exc_args is None: + exc_args = tuple() + + pyapi = self.context.get_python_api(builder) + # Build excinfo struct + if loc is not None: + fname = loc._raw_function_name() + if fname is None: + # could be exec() or REPL, try func_name + fname = func_name + + locinfo = (fname, loc.filename, loc.line) + if None in locinfo: + locinfo = None + else: + locinfo = None + exc = (exc, exc_args, locinfo) + struct_gv = pyapi.serialize_object(exc) + excptr = self._get_excinfo_argument(builder.function) + builder.store(struct_gv, excptr) + + def return_user_exc(self, builder, exc, exc_args=None, loc=None, + func_name=None): + try_info = getattr(builder, '_in_try_block', False) + self.set_static_user_exc(builder, exc, exc_args=exc_args, + loc=loc, func_name=func_name) + trystatus = self.check_try_status(builder) + if try_info: + # This is a hack for old-style impl. + # We will branch directly to the exception handler. 
+ builder.branch(try_info['target']) + else: + # Return from the current function + self._return_errcode_raw(builder, RETCODE_USEREXC, mark_exc=True) + + def _get_try_state(self, builder): + try: + return builder.__eh_try_state + except AttributeError: + ptr = cgutils.alloca_once( + builder, cgutils.intp_t, name='try_state', zfill=True, + ) + builder.__eh_try_state = ptr + return ptr + + def check_try_status(self, builder): + try_state_ptr = self._get_try_state(builder) + try_depth = builder.load(try_state_ptr) + # try_depth > 0 + in_try = builder.icmp_unsigned('>', try_depth, try_depth.type(0)) + + excinfoptr = self._get_excinfo_argument(builder.function) + excinfo = builder.load(excinfoptr) + + return TryStatus(in_try=in_try, excinfo=excinfo) + + def set_try_status(self, builder): + try_state_ptr = self._get_try_state(builder) + # Increment try depth + old = builder.load(try_state_ptr) + new = builder.add(old, old.type(1)) + builder.store(new, try_state_ptr) + + def unset_try_status(self, builder): + try_state_ptr = self._get_try_state(builder) + # Decrement try depth + old = builder.load(try_state_ptr) + new = builder.sub(old, old.type(1)) + builder.store(new, try_state_ptr) + + # Needs to reset the exception state so that the exception handler + # will run normally. 
def return_status_propagate(self, builder, status):
    """
    Forward a callee's failure *status* out of the current function.

    The callee's excinfo pointer is copied into this function's excinfo
    argument.  Then, unless execution is inside a try block, the function
    returns with the callee's error code, flagged as a raise.
    """
    try_status = self.check_try_status(builder)
    excinfo_arg = self._get_excinfo_argument(builder.function)
    builder.store(status.excinfoptr, excinfo_arg)
    outside_try = builder.not_(try_status.in_try)
    with builder.if_then(outside_try):
        self._return_errcode_raw(builder, status.code, mark_exc=True)
+ """ + arginfo = self._get_arg_packer(fe_argtypes) + arginfo.assign_names(self.get_arguments(fn), + ['arg.' + a for a in args]) + retarg = self._get_return_argument(fn) + retarg.name = "retptr" + retarg.add_attribute("nocapture") + retarg.add_attribute("noalias") + excarg = self._get_excinfo_argument(fn) + excarg.name = "excinfo" + excarg.add_attribute("nocapture") + excarg.add_attribute("noalias") + + if noalias: + args = self.get_arguments(fn) + for a in args: + if isinstance(a.type, ir.PointerType): + a.add_attribute("nocapture") + a.add_attribute("noalias") + + # Add metadata to mark functions that may need NRT + # thus disabling aggressive refct pruning in removerefctpass.py + def type_may_always_need_nrt(ty): + # Returns True if it's a non-Array type that is contains MemInfo + if not isinstance(ty, types.Array): + dmm = self.context.data_model_manager + if dmm[ty].contains_nrt_meminfo(): + return True + return False + + args_may_always_need_nrt = any( + map(type_may_always_need_nrt, fe_argtypes) + ) + + if args_may_always_need_nrt: + nmd = fn.module.add_named_metadata( + 'numba_args_may_always_need_nrt', + ) + nmd.add(fn.module.add_metadata([fn])) + + return fn + + def get_arguments(self, func): + """ + Get the Python-level arguments of LLVM *func*. + """ + return func.args[2:] + + def _get_return_argument(self, func): + return func.args[0] + + def _get_excinfo_argument(self, func): + return func.args[1] + + def call_function(self, builder, callee, resty, argtys, args, + attrs=None): + """ + Call the Numba-compiled *callee*. + Parameters: + ----------- + attrs: LLVM style string or iterable of individual attributes, default + is None which specifies no attributes. 
class ErrorModel(object):
    """
    Base class of the floating-point error models.

    Concrete models set the class attribute ``raise_on_fp_zero_division``,
    which decides what ``fp_zero_division`` does.
    """

    def __init__(self, call_conv):
        self.call_conv = call_conv

    def fp_zero_division(self, builder, exc_args=None, loc=None):
        """
        Handle a floating-point division by zero.

        If the model raises on FP zero division, emit a ZeroDivisionError
        through the calling convention and return True.  Otherwise emit
        nothing and return False.
        """
        if not self.raise_on_fp_zero_division:
            return False
        self.call_conv.return_user_exc(builder, ZeroDivisionError, exc_args,
                                       loc)
        return True
+ + Note there's no easy way to set the FPU exception state from LLVM. + Instructions known to set an FP exception can be optimized away: + https://llvm.org/bugs/show_bug.cgi?id=6050 + http://lists.llvm.org/pipermail/llvm-dev/2014-September/076918.html + http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20140929/237997.html + """ + raise_on_fp_zero_division = False + + +error_models = { + 'python': PythonErrorModel, + 'numpy': NumpyErrorModel, + } + + +def create_error_model(model_name, context): + """ + Create an error model instance for the given target context. + """ + return error_models[model_name](context.call_conv) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/callwrapper.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/callwrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..5508aa38e516b92d89f1c97c01c2ec861a612cc0 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/callwrapper.py @@ -0,0 +1,226 @@ +from llvmlite.ir import Constant, IRBuilder +import llvmlite.ir + +from numba.core import types, config, cgutils + + +class _ArgManager(object): + """ + A utility class to handle argument unboxing and cleanup + """ + def __init__(self, context, builder, api, env_manager, endblk, nargs): + self.context = context + self.builder = builder + self.api = api + self.env_manager = env_manager + self.arg_count = 0 # how many function arguments have been processed + self.cleanups = [] + self.nextblk = endblk + + def add_arg(self, obj, ty): + """ + Unbox argument and emit code that handles any error during unboxing. + Args are cleaned up in reverse order of the parameter list, and + cleanup begins as soon as unboxing of any argument fails. E.g. 
failure + on arg2 will result in control flow going through: + + arg2.err -> arg1.err -> arg0.err -> arg.end (returns) + """ + # Unbox argument + native = self.api.to_native_value(ty, obj) + + # If an error occurred, go to the cleanup block for + # the previous argument + with cgutils.if_unlikely(self.builder, native.is_error): + self.builder.branch(self.nextblk) + + # Define the cleanup function for the argument + def cleanup_arg(): + # Native value reflection + self.api.reflect_native_value(ty, native.value, self.env_manager) + + # Native value cleanup + if native.cleanup is not None: + native.cleanup() + + # NRT cleanup + # (happens after the native value cleanup as the latter + # may need the native value) + if self.context.enable_nrt: + self.context.nrt.decref(self.builder, ty, native.value) + + self.cleanups.append(cleanup_arg) + + # Write the on-error cleanup block for this argument + cleanupblk = self.builder.append_basic_block( + "arg%d.err" % self.arg_count) + with self.builder.goto_block(cleanupblk): + cleanup_arg() + # Go to next cleanup block + self.builder.branch(self.nextblk) + + self.nextblk = cleanupblk + self.arg_count += 1 + return native.value + + def emit_cleanup(self): + """ + Emit the cleanup code after returning from the wrapped function. + """ + for dtor in self.cleanups: + dtor() + + +class _GilManager(object): + """ + A utility class to handle releasing the GIL and then re-acquiring it + again. 
+ """ + + def __init__(self, builder, api, argman): + self.builder = builder + self.api = api + self.argman = argman + self.thread_state = api.save_thread() + + def emit_cleanup(self): + self.api.restore_thread(self.thread_state) + self.argman.emit_cleanup() + + +class PyCallWrapper(object): + def __init__(self, context, module, func, fndesc, env, call_helper, + release_gil): + self.context = context + self.module = module + self.func = func + self.fndesc = fndesc + self.env = env + self.release_gil = release_gil + + def build(self): + wrapname = self.fndesc.llvm_cpython_wrapper_name + + # This is the signature of PyCFunctionWithKeywords + # (see CPython's methodobject.h) + pyobj = self.context.get_argument_type(types.pyobject) + wrapty = llvmlite.ir.FunctionType(pyobj, [pyobj, pyobj, pyobj]) + wrapper = llvmlite.ir.Function(self.module, wrapty, name=wrapname) + + builder = IRBuilder(wrapper.append_basic_block('entry')) + + # - `closure` will receive the `self` pointer stored in the + # PyCFunction object (see _dynfunc.c) + # - `args` and `kws` will receive the tuple and dict objects + # of positional and keyword arguments, respectively. 
+ closure, args, kws = wrapper.args + closure.name = 'py_closure' + args.name = 'py_args' + kws.name = 'py_kws' + + api = self.context.get_python_api(builder) + self.build_wrapper(api, builder, closure, args, kws) + + return wrapper, api + + def build_wrapper(self, api, builder, closure, args, kws): + nargs = len(self.fndesc.argtypes) + + objs = [api.alloca_obj() for _ in range(nargs)] + parseok = api.unpack_tuple(args, self.fndesc.qualname, + nargs, nargs, *objs) + + pred = builder.icmp_unsigned( + '==', + parseok, + Constant(parseok.type, None)) + with cgutils.if_unlikely(builder, pred): + builder.ret(api.get_null_object()) + + # Block that returns after erroneous argument unboxing/cleanup + endblk = builder.append_basic_block("arg.end") + with builder.goto_block(endblk): + builder.ret(api.get_null_object()) + + # Get the Environment object + env_manager = self.get_env(api, builder) + + cleanup_manager = _ArgManager(self.context, builder, api, + env_manager, endblk, nargs) + + # Compute the arguments to the compiled Numba function. + innerargs = [] + for obj, ty in zip(objs, self.fndesc.argtypes): + if isinstance(ty, types.Omitted): + # It's an omitted value => ignore dummy Python object + innerargs.append(None) + else: + val = cleanup_manager.add_arg(builder.load(obj), ty) + innerargs.append(val) + + if self.release_gil: + cleanup_manager = _GilManager(builder, api, cleanup_manager) + + # We elect to not inline the top level user function into the call + # wrapper, this incurs an overhead of a function call, however, it + # increases optimisation stability in that the optimised user function + # is what will actually be run and it is this function that all the + # inspection tools "see". Further, this makes optimisation "stable" in + # that calling the user function from e.g. 
C or from this wrapper will + # result in the same code executing, were inlining permitted this may + # not be the case as the inline could trigger additional optimisation + # as the function goes into the wrapper, this resulting in the executing + # instruction stream being different from that of the instruction stream + # present in the user function. + status, retval = self.context.call_conv.call_function( + builder, self.func, self.fndesc.restype, self.fndesc.argtypes, + innerargs, attrs=('noinline',)) + # Do clean up + self.debug_print(builder, "# callwrapper: emit_cleanup") + cleanup_manager.emit_cleanup() + self.debug_print(builder, "# callwrapper: emit_cleanup end") + + # Determine return status + with builder.if_then(status.is_ok, likely=True): + # Ok => return boxed Python value + with builder.if_then(status.is_none): + api.return_none() + + retty = self._simplified_return_type() + obj = api.from_native_return(retty, retval, env_manager) + builder.ret(obj) + + # Error out + self.context.call_conv.raise_error(builder, api, status) + builder.ret(api.get_null_object()) + + def get_env(self, api, builder): + """Get the Environment object which is declared as a global + in the module of the wrapped function. + """ + envname = self.context.get_env_name(self.fndesc) + gvptr = self.context.declare_env_global(builder.module, envname) + envptr = builder.load(gvptr) + + env_body = self.context.get_env_body(builder, envptr) + + api.emit_environment_sentry(envptr, return_pyobject=True, + debug_msg=self.fndesc.env_name) + env_manager = api.get_env_manager(self.env, env_body, envptr) + return env_manager + + def _simplified_return_type(self): + """ + The NPM callconv has already converted simplified optional types. + We can simply use the value type from it. 
+ """ + restype = self.fndesc.restype + # Optional type + if isinstance(restype, types.Optional): + return restype.type + else: + return restype + + def debug_print(self, builder, msg): + if config.DEBUG_JIT: + self.context.debug_print(builder, "DEBUGJIT: {0}".format(msg)) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/ccallback.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/ccallback.py new file mode 100644 index 0000000000000000000000000000000000000000..2fd222db98a4f3f72796c632444b64a8b8a79dff --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/ccallback.py @@ -0,0 +1,134 @@ +""" +Implementation of compiled C callbacks (@cfunc). +""" + + +import ctypes + +from numba.core import utils, compiler, registry +from numba.core.caching import NullCache, FunctionCache +from numba.core.dispatcher import _FunctionCompiler +from numba.core.typing import signature +from numba.core.typing.ctypes_utils import to_ctypes +from numba.core.compiler_lock import global_compiler_lock + + +class _CFuncCompiler(_FunctionCompiler): + + def _customize_flags(self, flags): + flags.no_cpython_wrapper = True + flags.no_cfunc_wrapper = False + # Disable compilation of the IR module, because we first want to + # add the cfunc wrapper. + flags.no_compile = True + # Object mode is not currently supported in C callbacks + # (no reliable way to get the environment) + flags.enable_pyobject = False + if flags.force_pyobject: + raise NotImplementedError("object mode not allowed in C callbacks") + return flags + + +class CFunc(object): + """ + A compiled C callback, as created by the @cfunc decorator. 
+ """ + _targetdescr = registry.cpu_target + + def __init__(self, pyfunc, sig, locals, options, + pipeline_class=compiler.Compiler): + args, return_type = sig + if return_type is None: + raise TypeError("C callback needs an explicit return type") + self.__name__ = pyfunc.__name__ + self.__qualname__ = getattr(pyfunc, '__qualname__', self.__name__) + self.__wrapped__ = pyfunc + + self._pyfunc = pyfunc + self._sig = signature(return_type, *args) + self._compiler = _CFuncCompiler(pyfunc, self._targetdescr, + options, locals, + pipeline_class=pipeline_class) + + self._wrapper_name = None + self._wrapper_address = None + self._cache = NullCache() + self._cache_hits = 0 + + def enable_caching(self): + self._cache = FunctionCache(self._pyfunc) + + @global_compiler_lock + def compile(self): + # Try to load from cache + cres = self._cache.load_overload(self._sig, + self._targetdescr.target_context) + if cres is None: + cres = self._compile_uncached() + self._cache.save_overload(self._sig, cres) + else: + self._cache_hits += 1 + + self._library = cres.library + self._wrapper_name = cres.fndesc.llvm_cfunc_wrapper_name + self._wrapper_address = self._library.get_pointer_to_function( + self._wrapper_name) + + def _compile_uncached(self): + sig = self._sig + + # Compile native function as well as cfunc wrapper + return self._compiler.compile(sig.args, sig.return_type) + + @property + def native_name(self): + """ + The process-wide symbol the C callback is exposed as. + """ + # Note from our point of view, the C callback is the wrapper around + # the native function. + return self._wrapper_name + + @property + def address(self): + """ + The address of the C callback. + """ + return self._wrapper_address + + @utils.cached_property + def cffi(self): + """ + A cffi function pointer representing the C callback. + """ + import cffi + ffi = cffi.FFI() + # cffi compares types by name, so using precise types would risk + # spurious mismatches (such as "int32_t" vs. "int"). 
def __repr__(self):
    """
    Return a short description of the callback that includes the
    qualified name of the wrapped Python function.
    """
    # The format needs exactly one conversion for the 1-tuple operand;
    # with none, the % operator raises TypeError.
    return "<Numba C callback %r>" % (self.__qualname__,)
def copy_struct(dst, src, repl=None):
    """
    Copy structure from *src* to *dst* with replacement from *repl*.

    Each field named in ``src._datamodel._fields`` is set on *dst*, using
    the value from *repl* when the field name is a key there and the value
    read from *src* otherwise.  Any remaining entries of *repl* are then
    set on *dst* as well.  *repl* may be None (no replacements) and is
    never modified.  Returns *dst*.
    """
    # Work on a private copy so the caller's mapping is left untouched
    pending = {} if repl is None else repl.copy()
    # Copy data from src, or use the replacement when one is given
    for name in src._datamodel._fields:
        # The source field is always read, even when a replacement exists
        current = getattr(src, name)
        setattr(dst, name, pending.pop(name, current))
    # Apply the replacements that did not match a source field
    for name, value in pending.items():
        setattr(dst, name, value)
    return dst
+ """ + # The following class members must be overridden by subclass + _fe_type = None + + def __init__(self, context, builder, value=None, ref=None): + self._context = context + self._datamodel = self._context.data_model_manager[self._fe_type] + if not isinstance(self._datamodel, numba.core.datamodel.StructModel): + raise TypeError( + "Not a structure model: {0}".format(self._datamodel)) + self._builder = builder + + self._be_type = self._get_be_type(self._datamodel) + assert not is_pointer(self._be_type) + + outer_ref, ref = self._make_refs(ref) + if ref.type.pointee != self._be_type: + raise AssertionError("bad ref type: expected %s, got %s" + % (self._be_type.as_pointer(), ref.type)) + + if value is not None: + if value.type != outer_ref.type.pointee: + raise AssertionError("bad value type: expected %s, got %s" + % (outer_ref.type.pointee, value.type)) + self._builder.store(value, outer_ref) + + self._value = ref + self._outer_ref = outer_ref + + def _make_refs(self, ref): + """ + Return an (outer ref, value ref) pair. By default, these are + the same pointers, but a derived class may override this. + """ + if ref is None: + ref = alloca_once(self._builder, self._be_type, zfill=True) + return ref, ref + + def _get_be_type(self, datamodel): + raise NotImplementedError + + def _cast_member_to_value(self, index, val): + raise NotImplementedError + + def _cast_member_from_value(self, index, val): + raise NotImplementedError + + def _get_ptr_by_index(self, index): + return gep_inbounds(self._builder, self._value, 0, index) + + def _get_ptr_by_name(self, attrname): + index = self._datamodel.get_field_position(attrname) + return self._get_ptr_by_index(index) + + def __getattr__(self, field): + """ + Load the LLVM value of the named *field*. + """ + if not field.startswith('_'): + return self[self._datamodel.get_field_position(field)] + else: + raise AttributeError(field) + + def __setattr__(self, field, value): + """ + Store the LLVM *value* into the named *field*. 
+ """ + if field.startswith('_'): + return super(_StructProxy, self).__setattr__(field, value) + self[self._datamodel.get_field_position(field)] = value + + def __getitem__(self, index): + """ + Load the LLVM value of the field at *index*. + """ + member_val = self._builder.load(self._get_ptr_by_index(index)) + return self._cast_member_to_value(index, member_val) + + def __setitem__(self, index, value): + """ + Store the LLVM *value* into the field at *index*. + """ + ptr = self._get_ptr_by_index(index) + value = self._cast_member_from_value(index, value) + if value.type != ptr.type.pointee: + if (is_pointer(value.type) and is_pointer(ptr.type.pointee) + and value.type.pointee == ptr.type.pointee.pointee): + # Differ by address-space only + # Auto coerce it + value = self._context.addrspacecast(self._builder, + value, + ptr.type.pointee.addrspace) + else: + raise TypeError("Invalid store of {value.type} to " + "{ptr.type.pointee} in " + "{self._datamodel} " + "(trying to write member #{index})" + .format(value=value, ptr=ptr, self=self, + index=index)) + self._builder.store(value, ptr) + + def __len__(self): + """ + Return the number of fields. + """ + return self._datamodel.field_count + + def _getpointer(self): + """ + Return the LLVM pointer to the underlying structure. + """ + return self._outer_ref + + def _getvalue(self): + """ + Load and return the value of the underlying LLVM structure. + """ + return self._builder.load(self._outer_ref) + + def _setvalue(self, value): + """ + Store the value in this structure. + """ + assert not is_pointer(value.type) + assert value.type == self._be_type, (value.type, self._be_type) + self._builder.store(value, self._value) + + +class ValueStructProxy(_StructProxy): + """ + Create a StructProxy suitable for accessing regular values + (e.g. LLVM values or alloca slots). 
+ """ + def _get_be_type(self, datamodel): + return datamodel.get_value_type() + + def _cast_member_to_value(self, index, val): + return val + + def _cast_member_from_value(self, index, val): + return val + + +class DataStructProxy(_StructProxy): + """ + Create a StructProxy suitable for accessing data persisted in memory. + """ + def _get_be_type(self, datamodel): + return datamodel.get_data_type() + + def _cast_member_to_value(self, index, val): + model = self._datamodel.get_model(index) + return model.from_data(self._builder, val) + + def _cast_member_from_value(self, index, val): + model = self._datamodel.get_model(index) + return model.as_data(self._builder, val) + + +class Structure(object): + """ + A high-level object wrapping a alloca'ed LLVM structure, including + named fields and attribute access. + """ + + # XXX Should this warrant several separate constructors? + def __init__(self, context, builder, value=None, ref=None, cast_ref=False): + self._type = context.get_struct_type(self) + self._context = context + self._builder = builder + if ref is None: + self._value = alloca_once(builder, self._type, zfill=True) + if value is not None: + assert not is_pointer(value.type) + assert value.type == self._type, (value.type, self._type) + builder.store(value, self._value) + else: + assert value is None + assert is_pointer(ref.type) + if self._type != ref.type.pointee: + if cast_ref: + ref = builder.bitcast(ref, self._type.as_pointer()) + else: + raise TypeError( + "mismatching pointer type: got %s, expected %s" + % (ref.type.pointee, self._type)) + self._value = ref + + self._namemap = {} + self._fdmap = [] + self._typemap = [] + base = int32_t(0) + for i, (k, tp) in enumerate(self._fields): + self._namemap[k] = i + self._fdmap.append((base, int32_t(i))) + self._typemap.append(tp) + + def _get_ptr_by_index(self, index): + ptr = self._builder.gep(self._value, self._fdmap[index], inbounds=True) + return ptr + + def _get_ptr_by_name(self, attrname): + return 
self._get_ptr_by_index(self._namemap[attrname]) + + def __getattr__(self, field): + """ + Load the LLVM value of the named *field*. + """ + if not field.startswith('_'): + return self[self._namemap[field]] + else: + raise AttributeError(field) + + def __setattr__(self, field, value): + """ + Store the LLVM *value* into the named *field*. + """ + if field.startswith('_'): + return super(Structure, self).__setattr__(field, value) + self[self._namemap[field]] = value + + def __getitem__(self, index): + """ + Load the LLVM value of the field at *index*. + """ + + return self._builder.load(self._get_ptr_by_index(index)) + + def __setitem__(self, index, value): + """ + Store the LLVM *value* into the field at *index*. + """ + ptr = self._get_ptr_by_index(index) + if ptr.type.pointee != value.type: + fmt = "Type mismatch: __setitem__(%d, ...) expected %r but got %r" + raise AssertionError(fmt % (index, + str(ptr.type.pointee), + str(value.type))) + self._builder.store(value, ptr) + + def __len__(self): + """ + Return the number of fields. + """ + return len(self._namemap) + + def _getpointer(self): + """ + Return the LLVM pointer to the underlying structure. + """ + return self._value + + def _getvalue(self): + """ + Load and return the value of the underlying LLVM structure. + """ + return self._builder.load(self._value) + + def _setvalue(self, value): + """Store the value in this structure""" + assert not is_pointer(value.type) + assert value.type == self._type, (value.type, self._type) + self._builder.store(value, self._value) + + # __iter__ is derived by Python from __len__ and __getitem__ + + +def alloca_once(builder, ty, size=None, name='', zfill=False): + """Allocate stack memory at the entry block of the current function + pointed by ``builder`` with llvm type ``ty``. The optional ``size`` arg + set the number of element to allocate. The default is 1. The optional + ``name`` arg set the symbol name inside the llvm IR for debugging. 
def alloca_once_value(builder, value, name='', zfill=False):
    """
    Like alloca_once(), but passing a *value* instead of a type.  The
    type is inferred and the allocated slot is also initialized with the
    given value.

    *name* and *zfill* are forwarded to alloca_once().  Returns the
    pointer to the allocated slot.
    """
    # Forward *name* so the requested symbol name reaches the alloca
    storage = alloca_once(builder, value.type, name=name, zfill=zfill)
    builder.store(value, storage)
    return storage
def terminate(builder, bbend):
    """
    Branch to *bbend* unless the builder's current basic block already
    has a terminator.
    """
    if builder.basic_block.terminator is not None:
        return
    builder.branch(bbend)
+ + Yields a Loop namedtuple with the following members: + - `index` is the loop index's value + - `do_break` is a no-argument callable to break out of the loop + """ + if intp is None: + intp = count.type + if start is None: + start = intp(0) + stop = count + + bbcond = builder.append_basic_block("for.cond") + bbbody = builder.append_basic_block("for.body") + bbend = builder.append_basic_block("for.end") + + def do_break(): + builder.branch(bbend) + + bbstart = builder.basic_block + builder.branch(bbcond) + + with builder.goto_block(bbcond): + index = builder.phi(intp, name="loop.index") + pred = builder.icmp_signed('<', index, stop) + builder.cbranch(pred, bbbody, bbend) + + with builder.goto_block(bbbody): + yield Loop(index, do_break) + # Update bbbody as a new basic block may have been activated + bbbody = builder.basic_block + incr = increment_index(builder, index) + terminate(builder, bbcond) + + index.add_incoming(start, bbstart) + index.add_incoming(incr, bbbody) + + builder.position_at_end(bbend) + + +@contextmanager +def for_range_slice(builder, start, stop, step, intp=None, inc=True): + """ + Generate LLVM IR for a for-loop based on a slice. Yields a + (index, count) tuple where `index` is the slice index's value + inside the loop, and `count` the iteration count. + + Parameters + ------------- + builder : object + IRBuilder object + start : int + The beginning value of the slice + stop : int + The end value of the slice + step : int + The step value of the slice + intp : + The data type + inc : boolean, optional + Signals whether the step is positive (True) or negative (False). 
+ + Returns + ----------- + None + """ + if intp is None: + intp = start.type + + bbcond = builder.append_basic_block("for.cond") + bbbody = builder.append_basic_block("for.body") + bbend = builder.append_basic_block("for.end") + bbstart = builder.basic_block + builder.branch(bbcond) + + with builder.goto_block(bbcond): + index = builder.phi(intp, name="loop.index") + count = builder.phi(intp, name="loop.count") + if (inc): + pred = builder.icmp_signed('<', index, stop) + else: + pred = builder.icmp_signed('>', index, stop) + builder.cbranch(pred, bbbody, bbend) + + with builder.goto_block(bbbody): + yield index, count + bbbody = builder.basic_block + incr = builder.add(index, step) + next_count = increment_index(builder, count) + terminate(builder, bbcond) + + index.add_incoming(start, bbstart) + index.add_incoming(incr, bbbody) + count.add_incoming(ir.Constant(intp, 0), bbstart) + count.add_incoming(next_count, bbbody) + builder.position_at_end(bbend) + + +@contextmanager +def for_range_slice_generic(builder, start, stop, step): + """ + A helper wrapper for for_range_slice(). This is a context manager which + yields two for_range_slice()-alike context managers, the first for + the positive step case, the second for the negative step case. + + Use: + with for_range_slice_generic(...) as (pos_range, neg_range): + with pos_range as (idx, count): + ... + with neg_range as (idx, count): + ... 
+ """ + intp = start.type + is_pos_step = builder.icmp_signed('>=', step, ir.Constant(intp, 0)) + + pos_for_range = for_range_slice(builder, start, stop, step, intp, inc=True) + neg_for_range = for_range_slice(builder, start, stop, step, intp, inc=False) + + @contextmanager + def cm_cond(cond, inner_cm): + with cond: + with inner_cm as value: + yield value + + with builder.if_else(is_pos_step, likely=True) as (then, otherwise): + yield cm_cond(then, pos_for_range), cm_cond(otherwise, neg_for_range) + + +@contextmanager +def loop_nest(builder, shape, intp, order='C'): + """ + Generate a loop nest walking a N-dimensional array. + Yields a tuple of N indices for use in the inner loop body, + iterating over the *shape* space. + + If *order* is 'C' (the default), indices are incremented inside-out + (i.e. (0,0), (0,1), (0,2), (1,0) etc.). + If *order* is 'F', they are incremented outside-in + (i.e. (0,0), (1,0), (2,0), (0,1) etc.). + This has performance implications when walking an array as it impacts + the spatial locality of memory accesses. + """ + assert order in 'CF' + if not shape: + # 0-d array + yield () + else: + if order == 'F': + _swap = lambda x: x[::-1] + else: + _swap = lambda x: x + with _loop_nest(builder, _swap(shape), intp) as indices: + assert len(indices) == len(shape) + yield _swap(indices) + + +@contextmanager +def _loop_nest(builder, shape, intp): + with for_range(builder, shape[0], intp=intp) as loop: + if len(shape) > 1: + with _loop_nest(builder, shape[1:], intp) as indices: + yield (loop.index,) + indices + else: + yield (loop.index,) + + +def pack_array(builder, values, ty=None): + """ + Pack a sequence of values in a LLVM array. *ty* should be given + if the array may be empty, in which case the type can't be inferred + from the values. 
+ """ + n = len(values) + if ty is None: + ty = values[0].type + ary = ir.ArrayType(ty, n)(ir.Undefined) + for i, v in enumerate(values): + ary = builder.insert_value(ary, v, i) + return ary + + +def pack_struct(builder, values): + """ + Pack a sequence of values into a LLVM struct. + """ + structty = ir.LiteralStructType([v.type for v in values]) + st = structty(ir.Undefined) + for i, v in enumerate(values): + st = builder.insert_value(st, v, i) + return st + + +def unpack_tuple(builder, tup, count=None): + """ + Unpack an array or structure of values, return a Python tuple. + """ + if count is None: + # Assuming *tup* is an aggregate + count = len(tup.type.elements) + vals = [builder.extract_value(tup, i) + for i in range(count)] + return vals + + +def get_item_pointer(context, builder, aryty, ary, inds, wraparound=False, + boundscheck=False): + # Set boundscheck=True for any pointer access that should be + # boundschecked. do_boundscheck() will handle enabling or disabling the + # actual boundschecking based on the user config. + shapes = unpack_tuple(builder, ary.shape, count=aryty.ndim) + strides = unpack_tuple(builder, ary.strides, count=aryty.ndim) + return get_item_pointer2(context, builder, data=ary.data, shape=shapes, + strides=strides, layout=aryty.layout, inds=inds, + wraparound=wraparound, boundscheck=boundscheck) + + +def do_boundscheck(context, builder, ind, dimlen, axis=None): + def _dbg(): + # Remove this when we figure out how to include this information + # in the error message. 
+ if axis is not None: + if isinstance(axis, int): + printf(builder, "debug: IndexError: index %d is out of bounds " + "for axis {} with size %d\n".format(axis), ind, dimlen) + else: + printf(builder, "debug: IndexError: index %d is out of bounds " + "for axis %d with size %d\n", ind, axis, + dimlen) + else: + printf(builder, + "debug: IndexError: index %d is out of bounds for size %d\n", + ind, dimlen) + + msg = "index is out of bounds" + out_of_bounds_upper = builder.icmp_signed('>=', ind, dimlen) + with if_unlikely(builder, out_of_bounds_upper): + if config.FULL_TRACEBACKS: + _dbg() + context.call_conv.return_user_exc(builder, IndexError, (msg,)) + out_of_bounds_lower = builder.icmp_signed('<', ind, ind.type(0)) + with if_unlikely(builder, out_of_bounds_lower): + if config.FULL_TRACEBACKS: + _dbg() + context.call_conv.return_user_exc(builder, IndexError, (msg,)) + + +def get_item_pointer2(context, builder, data, shape, strides, layout, inds, + wraparound=False, boundscheck=False): + # Set boundscheck=True for any pointer access that should be + # boundschecked. do_boundscheck() will handle enabling or disabling the + # actual boundschecking based on the user config. 
+ if wraparound: + # Wraparound + indices = [] + for ind, dimlen in zip(inds, shape): + negative = builder.icmp_signed('<', ind, ind.type(0)) + wrapped = builder.add(dimlen, ind) + selected = builder.select(negative, wrapped, ind) + indices.append(selected) + else: + indices = inds + if boundscheck: + for axis, (ind, dimlen) in enumerate(zip(indices, shape)): + do_boundscheck(context, builder, ind, dimlen, axis) + + if not indices: + # Indexing with empty tuple + return builder.gep(data, [int32_t(0)]) + intp = indices[0].type + # Indexing code + if layout in 'CF': + steps = [] + # Compute steps for each dimension + if layout == 'C': + # C contiguous + for i in range(len(shape)): + last = intp(1) + for j in shape[i + 1:]: + last = builder.mul(last, j) + steps.append(last) + elif layout == 'F': + # F contiguous + for i in range(len(shape)): + last = intp(1) + for j in shape[:i]: + last = builder.mul(last, j) + steps.append(last) + else: + raise Exception("unreachable") + + # Compute index + loc = intp(0) + for i, s in zip(indices, steps): + tmp = builder.mul(i, s) + loc = builder.add(loc, tmp) + ptr = builder.gep(data, [loc]) + return ptr + else: + # Any layout + dimoffs = [builder.mul(s, i) for s, i in zip(strides, indices)] + offset = functools.reduce(builder.add, dimoffs) + return pointer_add(builder, data, offset) + + +def _scalar_pred_against_zero(builder, value, fpred, icond): + nullval = value.type(0) + if isinstance(value.type, (ir.FloatType, ir.DoubleType)): + isnull = fpred(value, nullval) + elif isinstance(value.type, ir.IntType): + isnull = builder.icmp_signed(icond, value, nullval) + else: + raise TypeError("unexpected value type %s" % (value.type,)) + return isnull + + +def is_scalar_zero(builder, value): + """ + Return a predicate representing whether *value* is equal to zero. 
+ """ + return _scalar_pred_against_zero( + builder, value, functools.partial(builder.fcmp_ordered, '=='), '==') + + +def is_not_scalar_zero(builder, value): + """ + Return a predicate representing whether a *value* is not equal to zero. + (not exactly "not is_scalar_zero" because of nans) + """ + return _scalar_pred_against_zero( + builder, value, functools.partial(builder.fcmp_unordered, '!='), '!=') + + +def is_scalar_zero_or_nan(builder, value): + """ + Return a predicate representing whether *value* is equal to either zero + or NaN. + """ + return _scalar_pred_against_zero( + builder, value, functools.partial(builder.fcmp_unordered, '=='), '==') + + +is_true = is_not_scalar_zero +is_false = is_scalar_zero + + +def is_scalar_neg(builder, value): + """ + Is *value* negative? Assumes *value* is signed. + """ + return _scalar_pred_against_zero( + builder, value, functools.partial(builder.fcmp_ordered, '<'), '<') + + +def guard_null(context, builder, value, exc_tuple): + """ + Guard against *value* being null or zero. + *exc_tuple* should be a (exception type, arguments...) tuple. + """ + with builder.if_then(is_scalar_zero(builder, value), likely=False): + exc = exc_tuple[0] + exc_args = exc_tuple[1:] or None + context.call_conv.return_user_exc(builder, exc, exc_args) + + +def guard_memory_error(context, builder, pointer, msg=None): + """ + Guard against *pointer* being NULL (and raise a MemoryError). + """ + assert isinstance(pointer.type, ir.PointerType), pointer.type + exc_args = (msg,) if msg else () + with builder.if_then(is_null(builder, pointer), likely=False): + context.call_conv.return_user_exc(builder, MemoryError, exc_args) + + +@contextmanager +def if_zero(builder, value, likely=False): + """ + Execute the given block if the scalar value is zero. + """ + with builder.if_then(is_scalar_zero(builder, value), likely=likely): + yield + + +guard_zero = guard_null + + +def is_pointer(ltyp): + """ + Whether the LLVM type *typ* is a struct type. 
+ """ + return isinstance(ltyp, ir.PointerType) + + +def get_record_member(builder, record, offset, typ): + pval = gep_inbounds(builder, record, 0, offset) + assert not is_pointer(pval.type.pointee) + return builder.bitcast(pval, typ.as_pointer()) + + +def is_neg_int(builder, val): + return builder.icmp_signed('<', val, val.type(0)) + + +def gep_inbounds(builder, ptr, *inds, **kws): + """ + Same as *gep*, but add the `inbounds` keyword. + """ + return gep(builder, ptr, *inds, inbounds=True, **kws) + + +def gep(builder, ptr, *inds, **kws): + """ + Emit a getelementptr instruction for the given pointer and indices. + The indices can be LLVM values or Python int constants. + """ + name = kws.pop('name', '') + inbounds = kws.pop('inbounds', False) + assert not kws + idx = [] + for i in inds: + if isinstance(i, int): + # NOTE: llvm only accepts int32 inside structs, not int64 + ind = int32_t(i) + else: + ind = i + idx.append(ind) + return builder.gep(ptr, idx, name=name, inbounds=inbounds) + + +def pointer_add(builder, ptr, offset, return_type=None): + """ + Add an integral *offset* to pointer *ptr*, and return a pointer + of *return_type* (or, if omitted, the same type as *ptr*). + + Note the computation is done in bytes, and ignores the width of + the pointed item type. + """ + intptr = builder.ptrtoint(ptr, intp_t) + if isinstance(offset, int): + offset = intp_t(offset) + intptr = builder.add(intptr, offset) + return builder.inttoptr(intptr, return_type or ptr.type) + + +def memset(builder, ptr, size, value): + """ + Fill *size* bytes starting from *ptr* with *value*. + """ + fn = builder.module.declare_intrinsic('llvm.memset', (voidptr_t, size.type)) + ptr = builder.bitcast(ptr, voidptr_t) + if isinstance(value, int): + value = int8_t(value) + builder.call(fn, [ptr, value, size, bool_t(0)]) + + +def memset_padding(builder, ptr): + """ + Fill padding bytes of the pointee with zeros. 
+ """ + # Load existing value + val = builder.load(ptr) + # Fill pointee with zeros + memset(builder, ptr, sizeof(builder, ptr.type), 0) + # Store value back + builder.store(val, ptr) + + +def global_constant(builder_or_module, name, value, linkage='internal'): + """ + Get or create a (LLVM module-)global constant with *name* or *value*. + """ + if isinstance(builder_or_module, ir.Module): + module = builder_or_module + else: + module = builder_or_module.module + data = add_global_variable(module, value.type, name) + data.linkage = linkage + data.global_constant = True + data.initializer = value + return data + + +def divmod_by_constant(builder, val, divisor): + """ + Compute the (quotient, remainder) of *val* divided by the constant + positive *divisor*. The semantics reflects those of Python integer + floor division, rather than C's / LLVM's signed division and modulo. + The difference lies with a negative *val*. + """ + assert divisor > 0 + divisor = val.type(divisor) + one = val.type(1) + + quot = alloca_once(builder, val.type) + + with builder.if_else(is_neg_int(builder, val)) as (if_neg, if_pos): + with if_pos: + # quot = val / divisor + quot_val = builder.sdiv(val, divisor) + builder.store(quot_val, quot) + with if_neg: + # quot = -1 + (val + 1) / divisor + val_plus_one = builder.add(val, one) + quot_val = builder.sdiv(val_plus_one, divisor) + builder.store(builder.sub(quot_val, one), quot) + + # rem = val - quot * divisor + # (should be slightly faster than a separate modulo operation) + quot_val = builder.load(quot) + rem_val = builder.sub(val, builder.mul(quot_val, divisor)) + return quot_val, rem_val + + +def cbranch_or_continue(builder, cond, bbtrue): + """ + Branch conditionally or continue. + + Note: a new block is created and builder is moved to the end of the new + block. 
+ """ + bbcont = builder.append_basic_block('.continue') + builder.cbranch(cond, bbtrue, bbcont) + builder.position_at_end(bbcont) + return bbcont + + +def memcpy(builder, dst, src, count): + """ + Emit a memcpy to the builder. + + Copies each element of dst to src. Unlike the C equivalent, each element + can be any LLVM type. + + Assumes + ------- + * dst.type == src.type + * count is positive + """ + # Note this does seem to be optimized as a raw memcpy() by LLVM + # whenever possible... + assert dst.type == src.type + with for_range(builder, count, intp=count.type) as loop: + out_ptr = builder.gep(dst, [loop.index]) + in_ptr = builder.gep(src, [loop.index]) + builder.store(builder.load(in_ptr), out_ptr) + + +def _raw_memcpy(builder, func_name, dst, src, count, itemsize, align): + size_t = count.type + if isinstance(itemsize, int): + itemsize = ir.Constant(size_t, itemsize) + + memcpy = builder.module.declare_intrinsic(func_name, + [voidptr_t, voidptr_t, size_t]) + is_volatile = false_bit + builder.call(memcpy, [builder.bitcast(dst, voidptr_t), + builder.bitcast(src, voidptr_t), + builder.mul(count, itemsize), + is_volatile]) + + +def raw_memcpy(builder, dst, src, count, itemsize, align=1): + """ + Emit a raw memcpy() call for `count` items of size `itemsize` + from `src` to `dest`. + """ + return _raw_memcpy(builder, 'llvm.memcpy', dst, src, count, itemsize, align) + + +def raw_memmove(builder, dst, src, count, itemsize, align=1): + """ + Emit a raw memmove() call for `count` items of size `itemsize` + from `src` to `dest`. + """ + return _raw_memcpy(builder, 'llvm.memmove', dst, src, count, + itemsize, align) + + +def muladd_with_overflow(builder, a, b, c): + """ + Compute (a * b + c) and return a (result, overflow bit) pair. + The operands must be signed integers. 
+ """ + p = builder.smul_with_overflow(a, b) + prod = builder.extract_value(p, 0) + prod_ovf = builder.extract_value(p, 1) + s = builder.sadd_with_overflow(prod, c) + res = builder.extract_value(s, 0) + ovf = builder.or_(prod_ovf, builder.extract_value(s, 1)) + return res, ovf + + +def printf(builder, format, *args): + """ + Calls printf(). + Argument `format` is expected to be a Python string. + Values to be printed are listed in `args`. + + Note: There is no checking to ensure there is correct number of values + in `args` and there type matches the declaration in the format string. + """ + assert isinstance(format, str) + mod = builder.module + # Make global constant for format string + cstring = voidptr_t + fmt_bytes = make_bytearray((format + '\00').encode('ascii')) + global_fmt = global_constant(mod, "printf_format", fmt_bytes) + fnty = ir.FunctionType(int32_t, [cstring], var_arg=True) + # Insert printf() + try: + fn = mod.get_global('printf') + except KeyError: + fn = ir.Function(mod, fnty, name="printf") + # Call + ptr_fmt = builder.bitcast(global_fmt, cstring) + return builder.call(fn, [ptr_fmt] + list(args)) + + +def snprintf(builder, buffer, bufsz, format, *args): + """Calls libc snprintf(buffer, bufsz, format, ...args) + """ + assert isinstance(format, str) + mod = builder.module + # Make global constant for format string + cstring = voidptr_t + fmt_bytes = make_bytearray((format + '\00').encode('ascii')) + global_fmt = global_constant(mod, "snprintf_format", fmt_bytes) + fnty = ir.FunctionType( + int32_t, [cstring, intp_t, cstring], var_arg=True, + ) + # Actual symbol name of snprintf is different on win32. 
+ symbol = 'snprintf' + if config.IS_WIN32: + symbol = '_' + symbol + # Insert snprintf() + try: + fn = mod.get_global(symbol) + except KeyError: + fn = ir.Function(mod, fnty, name=symbol) + # Call + ptr_fmt = builder.bitcast(global_fmt, cstring) + return builder.call(fn, [buffer, bufsz, ptr_fmt] + list(args)) + + +def snprintf_stackbuffer(builder, bufsz, format, *args): + """Similar to `snprintf()` but the buffer is stack allocated to size *bufsz*. + + Returns the buffer pointer as i8*. + """ + assert isinstance(bufsz, int) + spacety = ir.ArrayType(ir.IntType(8), bufsz) + space = alloca_once(builder, spacety, zfill=True) + buffer = builder.bitcast(space, voidptr_t) + snprintf(builder, buffer, intp_t(bufsz), format, *args) + return buffer + + +def normalize_ir_text(text): + """ + Normalize the given string to latin1 compatible encoding that is + suitable for use in LLVM IR. + """ + # Just re-encoding to latin1 is enough + return text.encode('utf8').decode('latin1') + + +def hexdump(builder, ptr, nbytes): + """Debug print the memory region in *ptr* to *ptr + nbytes* + as hex. + """ + bytes_per_line = 16 + nbytes = builder.zext(nbytes, intp_t) + printf(builder, "hexdump p=%p n=%zu", + ptr, nbytes) + byte_t = ir.IntType(8) + ptr = builder.bitcast(ptr, byte_t.as_pointer()) + # Loop to print the bytes in *ptr* as hex + with for_range(builder, nbytes) as idx: + div_by = builder.urem(idx.index, intp_t(bytes_per_line)) + do_new_line = builder.icmp_unsigned("==", div_by, intp_t(0)) + with builder.if_then(do_new_line): + printf(builder, "\n") + + offset = builder.gep(ptr, [idx.index]) + val = builder.load(offset) + printf(builder, " %02x", val) + printf(builder, "\n") + + +def is_nonelike(ty): + """ returns if 'ty' is none """ + return ( + ty is None or + isinstance(ty, types.NoneType) or + isinstance(ty, types.Omitted) + ) + + +def create_constant_array(ty, val): + """ + Create an LLVM-constant of a fixed-length array from Python values. 
+ + The type provided is the type of the elements. + """ + return ir.Constant(ir.ArrayType(ty, len(val)), val) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/codegen.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/codegen.py new file mode 100644 index 0000000000000000000000000000000000000000..e988fab12634d6a19011f5e6fa612329ef22a7cf --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/codegen.py @@ -0,0 +1,1437 @@ +import warnings +import functools +import locale +import weakref +import ctypes +import html +import textwrap + +import llvmlite.binding as ll +import llvmlite.ir as llvmir + +from abc import abstractmethod, ABCMeta +from numba.core import utils, config, cgutils +from numba.core.llvm_bindings import create_pass_manager_builder +from numba.core.runtime.nrtopt import remove_redundant_nrt_refct +from numba.core.runtime import rtsys +from numba.core.compiler_lock import require_global_compiler_lock +from numba.core.errors import NumbaInvalidConfigWarning +from numba.misc.inspection import disassemble_elf_to_cfg +from numba.misc.llvm_pass_timings import PassTimingsCollection + + +_x86arch = frozenset(['x86', 'i386', 'i486', 'i586', 'i686', 'i786', + 'i886', 'i986']) + + +def _is_x86(triple): + arch = triple.split('-')[0] + return arch in _x86arch + + +def _parse_refprune_flags(): + """Parse refprune flags from the `config`. + + Invalid values are ignored an warn via a `NumbaInvalidConfigWarning` + category. 
+ + Returns + ------- + flags : llvmlite.binding.RefPruneSubpasses + """ + flags = config.LLVM_REFPRUNE_FLAGS.split(',') + if not flags: + return 0 + val = 0 + for item in flags: + item = item.strip() + try: + val |= getattr(ll.RefPruneSubpasses, item.upper()) + except AttributeError: + warnings.warn(f"invalid refprune flags {item!r}", + NumbaInvalidConfigWarning) + return val + + +def dump(header, body, lang): + if config.HIGHLIGHT_DUMPS: + try: + import pygments + except ImportError: + msg = "Please install pygments to see highlighted dumps" + raise ValueError(msg) + else: + from pygments import highlight + from pygments.lexers import GasLexer as gas_lexer + from pygments.lexers import LlvmLexer as llvm_lexer + from pygments.formatters import Terminal256Formatter + from numba.misc.dump_style import by_colorscheme + + lexer_map = {'llvm': llvm_lexer, 'asm': gas_lexer} + lexer = lexer_map[lang] + def printer(arg): + print(highlight(arg, lexer(), + Terminal256Formatter(style=by_colorscheme()))) + else: + printer = print + print('=' * 80) + print(header.center(80, '-')) + printer(body) + print('=' * 80) + + +class _CFG(object): + """ + Wraps the CFG graph for different display method. + + Instance of the class can be stringified (``__repr__`` is defined) to get + the graph in DOT format. The ``.display()`` method plots the graph in + PDF. If in IPython notebook, the returned image can be inlined. + """ + def __init__(self, cres, name, py_func, **kwargs): + self.cres = cres + self.name = name + self.py_func = py_func + fn = cres.get_function(name) + self.dot = ll.get_function_cfg(fn) + self.kwargs = kwargs + + def pretty_printer(self, filename=None, view=None, render_format=None, + highlight=True, + interleave=False, strip_ir=False, show_key=True, + fontsize=10): + """ + "Pretty" prints the DOT graph of the CFG. + For explanation of the parameters see the docstring for + numba.core.dispatcher::inspect_cfg. 
+ """ + import graphviz as gv + import re + import json + import inspect + from llvmlite import binding as ll + from numba.typed import List + from types import SimpleNamespace + from collections import defaultdict + + _default = False + _highlight = SimpleNamespace(incref=_default, + decref=_default, + returns=_default, + raises=_default, + meminfo=_default, + branches=_default, + llvm_intrin_calls=_default, + function_calls=_default,) + _interleave = SimpleNamespace(python=_default, lineinfo=_default) + + def parse_config(_config, kwarg): + """ Parses the kwarg into a consistent format for use in configuring + the Digraph rendering. _config is the configuration instance to + update, kwarg is the kwarg on which to base the updates. + """ + if isinstance(kwarg, bool): + for attr in _config.__dict__: + setattr(_config, attr, kwarg) + elif isinstance(kwarg, dict): + for k, v in kwarg.items(): + if k not in _config.__dict__: + raise ValueError("Unexpected key in kwarg: %s" % k) + if isinstance(v, bool): + setattr(_config, k, v) + else: + msg = "Unexpected value for key: %s, got:%s" + raise ValueError(msg % (k, v)) + elif isinstance(kwarg, set): + for item in kwarg: + if item not in _config.__dict__: + raise ValueError("Unexpected key in kwarg: %s" % item) + else: + setattr(_config, item, True) + else: + msg = "Unhandled configuration type for kwarg %s" + raise ValueError(msg % type(kwarg)) + + parse_config(_highlight, highlight) + parse_config(_interleave, interleave) + + # This is the colour scheme. 
The graphviz HTML label renderer only takes + # names for colours: https://www.graphviz.org/doc/info/shapes.html#html + cs = defaultdict(lambda: 'white') # default bg colour is white + cs['marker'] = 'orange' + cs['python'] = 'yellow' + cs['truebr'] = 'green' + cs['falsebr'] = 'red' + cs['incref'] = 'cyan' + cs['decref'] = 'turquoise' + cs['raise'] = 'lightpink' + cs['meminfo'] = 'lightseagreen' + cs['return'] = 'purple' + cs['llvm_intrin_calls'] = 'rosybrown' + cs['function_calls'] = 'tomato' + + # Get the raw dot format information from LLVM and the LLVM IR + fn = self.cres.get_function(self.name) + #raw_dot = ll.get_function_cfg(fn).replace('\\l...', '') + llvm_str = self.cres.get_llvm_str() + + def get_metadata(llvm_str): + """ Gets the metadata entries from the LLVM IR, these look something + like '!123 = INFORMATION'. Returns a map of metadata key to metadata + value, i.e. from the example {'!123': INFORMATION}""" + md = {} + metadata_entry = re.compile(r'(^[!][0-9]+)(\s+=\s+.*)') + for x in llvm_str.splitlines(): + match = metadata_entry.match(x) + if match is not None: + g = match.groups() + if g is not None: + assert len(g) == 2 + md[g[0]] = g[1] + return md + + md = get_metadata(llvm_str) + + # setup digraph with initial properties + def init_digraph(name, fname, fontsize): + # name and fname are arbitrary graph and file names, they appear in + # some rendering formats, the fontsize determines the output + # fontsize. 
+ + # truncate massive mangled names as file names as it causes OSError + # when trying to render to pdf + cmax = 200 + if len(fname) > cmax: + wstr = (f'CFG output filename "{fname}" exceeds maximum ' + f'supported length, it will be truncated.') + warnings.warn(wstr, NumbaInvalidConfigWarning) + fname = fname[:cmax] + f = gv.Digraph(name, filename=fname) + f.attr(rankdir='TB') + f.attr('node', shape='none', fontsize='%s' % str(fontsize)) + return f + + f = init_digraph(self.name, self.name, fontsize) + + # A lot of regex is needed to parse the raw dot output. This output + # contains a mix of LLVM IR in the labels, and also DOT markup. + + # DOT syntax, matches a "port" (where the tail of an edge starts) + port_match = re.compile('.*{(.*)}.*') + # DOT syntax, matches the "port" value from a found "port_match" + port_jmp_match = re.compile('.*<(.*)>(.*)') + # LLVM syntax, matches a LLVM debug marker + metadata_marker = re.compile(r'.*!dbg\s+(![0-9]+).*') + # LLVM syntax, matches a location entry + location_expr = (r'.*!DILocation\(line:\s+([0-9]+),' + r'\s+column:\s+([0-9]),.*') + location_entry = re.compile(location_expr) + # LLVM syntax, matches LLVMs internal debug value calls + dbg_value = re.compile(r'.*call void @llvm.dbg.value.*') + # LLVM syntax, matches tokens for highlighting + nrt_incref = re.compile(r"@NRT_incref\b") + nrt_decref = re.compile(r"@NRT_decref\b") + nrt_meminfo = re.compile("@NRT_MemInfo") + ll_intrin_calls = re.compile(r".*call.*@llvm\..*") + ll_function_call = re.compile(r".*call.*@.*") + ll_raise = re.compile(r"ret i32.*\!ret_is_raise.*") + ll_return = re.compile("ret i32 [^1],?.*") + + # wrapper function for line wrapping LLVM lines + def wrap(s): + return textwrap.wrap(s, width=120, subsequent_indent='... ') + + # function to fix (sometimes escaped for DOT!) LLVM IR etc that needs to + # be HTML escaped + def clean(s): + # Grab first 300 chars only, 1. this should be enough to identify + # the token and it keeps names short. 2. 
graphviz/dot has a maximum + # buffer size near 585?!, with additional transforms it's hard to + # know if this would be exceeded. 3. hash of the token string is + # written into the rendering to permit exact identification against + # e.g. LLVM IR dump if necessary. + n = 300 + if len(s) > n: + hs = str(hash(s)) + s = '{}...'.format(s[:n], hs) + s = html.escape(s) # deals with &, < and > + s = s.replace('\\{', "{") + s = s.replace('\\}', "}") + s = s.replace('\\', "\") + s = s.replace('%', "%") + s = s.replace('!', "!") + return s + + # These hold the node and edge ids from the raw dot information. They + # are used later to wire up a new DiGraph that has the same structure + # as the raw dot but with new nodes. + node_ids = {} + edge_ids = {} + + # Python source lines, used if python source interleave is requested + if _interleave.python: + src_code, firstlineno = inspect.getsourcelines(self.py_func) + + # This is the dot info from LLVM, it's in DOT form and has continuation + # lines, strip them and then re-parse into `dot_json` form for use in + # producing a formatted output. + raw_dot = ll.get_function_cfg(fn).replace('\\l...', '') + json_bytes = gv.Source(raw_dot).pipe(format='dot_json') + jzon = json.loads(json_bytes.decode('utf-8')) + + idc = 0 + # Walk the "objects" (nodes) in the DOT output + for obj in jzon['objects']: + # These are used to keep tabs on the current line and column numbers + # as per the markers. They are tracked so as to make sure a marker + # is only emitted if there's a change in the marker. + cur_line, cur_col = -1, -1 + label = obj['label'] + name = obj['name'] + gvid = obj['_gvid'] + node_ids[gvid] = name + # Label is DOT format, it needs the head and tail removing and then + # splitting for walking. 
+ label = label[1:-1] + lines = label.split('\\l') + + # Holds the new lines + new_lines = [] + + # Aim is to produce an HTML table a bit like this: + # + # |------------| + # | HEADER | <-- this is the block header + # |------------| + # | LLVM SRC | <-- + # | Marker? | < this is the label/block body + # | Python src?| <-- + # |------------| + # | T | F | <-- this is the "ports", also determines col_span + # -------------- + # + + # This is HTML syntax, its the column span. If there's a switch or a + # branch at the bottom of the node this is rendered as multiple + # columns in a table. First job is to go and render that and work + # out how many columns are needed as that dictates how many columns + # the rest of the source lines must span. In DOT syntax the places + # that edges join nodes are referred to as "ports". Syntax in DOT + # is like `node:port`. + col_span = 1 + + # First see if there is a port entry for this node + port_line = '' + matched = port_match.match(lines[-1]) + sliced_lines = lines + if matched is not None: + # There is a port + ports = matched.groups()[0] + ports_tokens = ports.split('|') + col_span = len(ports_tokens) + # Generate HTML table data cells, one for each port. If the + # ports correspond to a branch then they can optionally + # highlighted based on T/F. + tdfmt = ('{}') + tbl_data = [] + if _highlight.branches: + colors = {'T': cs['truebr'], 'F': cs['falsebr']} + else: + colors = {} + for tok in ports_tokens: + target, value = port_jmp_match.match(tok).groups() + color = colors.get(value, 'white') + tbl_data.append(tdfmt.format(color, target, value)) + port_line = ''.join(tbl_data) + # Drop the last line from the rest of the parse as it's the port + # and just been dealt with. 
+ sliced_lines = lines[:-1] + + # loop peel the block header, it needs a HTML border + fmtheader = ('{}') + new_lines.append(fmtheader.format(cs['default'], col_span, + clean(sliced_lines[0].strip()))) + + # process rest of block creating the table row at a time. + fmt = ('{}') + + def metadata_interleave(l, new_lines): + """ + Search line `l` for metadata associated with python or line info + and inject it into `new_lines` if requested. + """ + matched = metadata_marker.match(l) + if matched is not None: + # there's a metadata marker + g = matched.groups() + if g is not None: + assert len(g) == 1, g + marker = g[0] + debug_data = md.get(marker, None) + if debug_data is not None: + # and the metadata marker has a corresponding piece + # of metadata + ld = location_entry.match(debug_data) + if ld is not None: + # and the metadata is line info... proceed + assert len(ld.groups()) == 2, ld + line, col = ld.groups() + # only emit a new marker if the line number in + # the metadata is "new". + if line != cur_line or col != cur_col: + if _interleave.lineinfo: + mfmt = 'Marker %s, Line %s, column %s' + mark_line = mfmt % (marker, line, col) + ln = fmt.format(cs['marker'], col_span, + clean(mark_line)) + new_lines.append(ln) + if _interleave.python: + # TODO: + # +1 for decorator, this probably needs + # the same thing doing as for the + # error messages where the decorator + # is scanned for, its not always +1! 
+ lidx = int(line) - (firstlineno + 1) + source_line = src_code[lidx + 1] + ln = fmt.format(cs['python'], col_span, + clean(source_line)) + new_lines.append(ln) + return line, col + + for l in sliced_lines[1:]: + + # Drop LLVM debug call entries + if dbg_value.match(l): + continue + + # if requested generate interleaving of markers or python from + # metadata + if _interleave.lineinfo or _interleave.python: + updated_lineinfo = metadata_interleave(l, new_lines) + if updated_lineinfo is not None: + cur_line, cur_col = updated_lineinfo + + # Highlight other LLVM features if requested, HTML BGCOLOR + # property is set by this. + if _highlight.incref and nrt_incref.search(l): + colour = cs['incref'] + elif _highlight.decref and nrt_decref.search(l): + colour = cs['decref'] + elif _highlight.meminfo and nrt_meminfo.search(l): + colour = cs['meminfo'] + elif _highlight.raises and ll_raise.search(l): + # search for raise as its more specific than exit + colour = cs['raise'] + elif _highlight.returns and ll_return.search(l): + colour = cs['return'] + elif _highlight.llvm_intrin_calls and ll_intrin_calls.search(l): + colour = cs['llvm_intrin_calls'] + elif _highlight.function_calls and ll_function_call.search(l): + colour = cs['function_calls'] + else: + colour = cs['default'] + + # Use the default coloring as a flag to force printing if a + # special token print was requested AND LLVM ir stripping is + # required + if colour is not cs['default'] or not strip_ir: + for x in wrap(clean(l)): + new_lines.append(fmt.format(colour, col_span, x)) + + # add in the port line at the end of the block if it was present + # (this was built right at the top of the parse) + if port_line: + new_lines.append('{}'.format(port_line)) + + # If there was data, create a table, else don't! + dat = ''.join(new_lines) + if dat: + tab = (('%s
') % (idc, + dat)) + label = '<{}>'.format(tab) + else: + label = '' + + # finally, add a replacement node for the original with a new marked + # up label. + f.node(name, label=label) + + # Parse the edge data + if 'edges' in jzon: # might be a single block, no edges + for edge in jzon['edges']: + gvid = edge['_gvid'] + tp = edge.get('tailport', None) + edge_ids[gvid] = (edge['head'], edge['tail'], tp) + + # Write in the edge wiring with respect to the new nodes:ports. + for gvid, edge in edge_ids.items(): + tail = node_ids[edge[1]] + head = node_ids[edge[0]] + port = edge[2] + if port is not None: + tail += ':%s' % port + f.edge(tail, head) + + # Add a key to the graph if requested. + if show_key: + key_tab = [] + for k, v in cs.items(): + key_tab.append(('{}').format(v, k)) + # The first < and last > are DOT syntax, rest is DOT HTML. + f.node("Key", label=('<{}
' + 'Key:
>').format(''.join(key_tab))) + + # Render if required + if filename is not None or view is not None: + f.render(filename=filename, view=view, format=render_format) + + # Else pipe out a SVG + return f.pipe(format='svg') + + def display(self, filename=None, format='pdf', view=False): + """ + Plot the CFG. In IPython notebook, the return image object can be + inlined. + + The *filename* option can be set to a specific path for the rendered + output to write to. If *view* option is True, the plot is opened by + the system default application for the image format (PDF). *format* can + be any valid format string accepted by graphviz, default is 'pdf'. + """ + rawbyt = self.pretty_printer(filename=filename, view=view, + render_format=format, **self.kwargs) + return rawbyt.decode('utf-8') + + def _repr_svg_(self): + return self.pretty_printer(**self.kwargs).decode('utf-8') + + def __repr__(self): + return self.dot + + +class CodeLibrary(metaclass=ABCMeta): + """ + An interface for bundling LLVM code together and compiling it. + It is tied to a *codegen* instance (e.g. JITCPUCodegen) that will + determine how the LLVM code is transformed and linked together. + """ + + _finalized = False + _object_caching_enabled = False + _disable_inspection = False + + def __init__(self, codegen: "CPUCodegen", name: str): + self._codegen = codegen + self._name = name + ptc_name = f"{self.__class__.__name__}({self._name!r})" + self._recorded_timings = PassTimingsCollection(ptc_name) + # Track names of the dynamic globals + self._dynamic_globals = [] + + @property + def has_dynamic_globals(self): + self._ensure_finalized() + return len(self._dynamic_globals) > 0 + + @property + def recorded_timings(self): + return self._recorded_timings + + @property + def codegen(self): + """ + The codegen object owning this library. 
+ """ + return self._codegen + + @property + def name(self): + return self._name + + def __repr__(self): + return "" % (self.name, id(self)) + + def _raise_if_finalized(self): + if self._finalized: + raise RuntimeError("operation impossible on finalized object %r" + % (self,)) + + def _ensure_finalized(self): + if not self._finalized: + self.finalize() + + def create_ir_module(self, name): + """ + Create an LLVM IR module for use by this library. + """ + self._raise_if_finalized() + ir_module = self._codegen._create_empty_module(name) + return ir_module + + @abstractmethod + def add_linking_library(self, library): + """ + Add a library for linking into this library, without losing + the original library. + """ + + @abstractmethod + def add_ir_module(self, ir_module): + """ + Add an LLVM IR module's contents to this library. + """ + + @abstractmethod + def finalize(self): + """ + Finalize the library. After this call, nothing can be added anymore. + Finalization involves various stages of code optimization and + linking. + """ + + @abstractmethod + def get_function(self, name): + """ + Return the function named ``name``. + """ + + @abstractmethod + def get_llvm_str(self): + """ + Get the human-readable form of the LLVM module. + """ + + @abstractmethod + def get_asm_str(self): + """ + Get the human-readable assembly. 
+ """ + + # + # Object cache hooks and serialization + # + + def enable_object_caching(self): + self._object_caching_enabled = True + self._compiled_object = None + self._compiled = False + + def _get_compiled_object(self): + if not self._object_caching_enabled: + raise ValueError("object caching not enabled in %s" % (self,)) + if self._compiled_object is None: + raise RuntimeError("no compiled object yet for %s" % (self,)) + return self._compiled_object + + def _set_compiled_object(self, value): + if not self._object_caching_enabled: + raise ValueError("object caching not enabled in %s" % (self,)) + if self._compiled: + raise ValueError("library already compiled: %s" % (self,)) + self._compiled_object = value + self._disable_inspection = True + + +class CPUCodeLibrary(CodeLibrary): + + def __init__(self, codegen, name): + super().__init__(codegen, name) + self._linking_libraries = [] # maintain insertion order + self._final_module = ll.parse_assembly( + str(self._codegen._create_empty_module(self.name))) + self._final_module.name = cgutils.normalize_ir_text(self.name) + self._shared_module = None + + def _optimize_functions(self, ll_module): + """ + Internal: run function-level optimizations inside *ll_module*. + """ + # Enforce data layout to enable layout-specific optimizations + ll_module.data_layout = self._codegen._data_layout + with self._codegen._function_pass_manager(ll_module) as fpm: + # Run function-level optimizations to reduce memory usage and improve + # module-level optimization. + for func in ll_module.functions: + k = f"Function passes on {func.name!r}" + with self._recorded_timings.record(k): + fpm.initialize() + fpm.run(func) + fpm.finalize() + + def _optimize_final_module(self): + """ + Internal: optimize this library's final module. 
+ """ + cheap_name = "Module passes (cheap optimization for refprune)" + with self._recorded_timings.record(cheap_name): + # A cheaper optimisation pass is run first to try and get as many + # refops into the same function as possible via inlining + self._codegen._mpm_cheap.run(self._final_module) + # Refop pruning is then run on the heavily inlined function + if not config.LLVM_REFPRUNE_PASS: + self._final_module = remove_redundant_nrt_refct(self._final_module) + full_name = "Module passes (full optimization)" + with self._recorded_timings.record(full_name): + # The full optimisation suite is then run on the refop pruned IR + self._codegen._mpm_full.run(self._final_module) + + def _get_module_for_linking(self): + """ + Internal: get a LLVM module suitable for linking multiple times + into another library. Exported functions are made "linkonce_odr" + to allow for multiple definitions, inlining, and removal of + unused exports. + + See discussion in https://github.com/numba/numba/pull/890 + """ + self._ensure_finalized() + if self._shared_module is not None: + return self._shared_module + mod = self._final_module + to_fix = [] + nfuncs = 0 + for fn in mod.functions: + nfuncs += 1 + if not fn.is_declaration and fn.linkage == ll.Linkage.external: + to_fix.append(fn.name) + if nfuncs == 0: + # This is an issue which can occur if loading a module + # from an object file and trying to link with it, so detect it + # here to make debugging easier. 
+ raise RuntimeError("library unfit for linking: " + "no available functions in %s" + % (self,)) + if to_fix: + mod = mod.clone() + for name in to_fix: + # NOTE: this will mark the symbol WEAK if serialized + # to an ELF file + mod.get_function(name).linkage = 'linkonce_odr' + self._shared_module = mod + return mod + + def add_linking_library(self, library): + library._ensure_finalized() + self._linking_libraries.append(library) + + def add_ir_module(self, ir_module): + self._raise_if_finalized() + assert isinstance(ir_module, llvmir.Module) + ir = cgutils.normalize_ir_text(str(ir_module)) + ll_module = ll.parse_assembly(ir) + ll_module.name = ir_module.name + ll_module.verify() + self.add_llvm_module(ll_module) + + def add_llvm_module(self, ll_module): + self._optimize_functions(ll_module) + # TODO: we shouldn't need to recreate the LLVM module object + if not config.LLVM_REFPRUNE_PASS: + ll_module = remove_redundant_nrt_refct(ll_module) + self._final_module.link_in(ll_module) + + def finalize(self): + require_global_compiler_lock() + + # Report any LLVM-related problems to the user + self._codegen._check_llvm_bugs() + + self._raise_if_finalized() + + if config.DUMP_FUNC_OPT: + dump("FUNCTION OPTIMIZED DUMP %s" % self.name, + self.get_llvm_str(), 'llvm') + + # Link libraries for shared code + seen = set() + for library in self._linking_libraries: + if library not in seen: + seen.add(library) + self._final_module.link_in( + library._get_module_for_linking(), preserve=True, + ) + + # Optimize the module after all dependences are linked in above, + # to allow for inlining. 
+ self._optimize_final_module() + + self._final_module.verify() + self._finalize_final_module() + + def _finalize_dynamic_globals(self): + # Scan for dynamic globals + for gv in self._final_module.global_variables: + if gv.name.startswith('numba.dynamic.globals'): + self._dynamic_globals.append(gv.name) + + def _verify_declare_only_symbols(self): + # Verify that no declare-only function compiled by numba. + for fn in self._final_module.functions: + # We will only check for symbol name starting with '_ZN5numba' + if fn.is_declaration and fn.name.startswith('_ZN5numba'): + msg = 'Symbol {} not linked properly' + raise AssertionError(msg.format(fn.name)) + + def _finalize_final_module(self): + """ + Make the underlying LLVM module ready to use. + """ + self._finalize_dynamic_globals() + self._verify_declare_only_symbols() + + # Remember this on the module, for the object cache hooks + self._final_module.__library = weakref.proxy(self) + + # It seems add_module() must be done only here and not before + # linking in other modules, otherwise get_pointer_to_function() + # could fail. + cleanup = self._codegen._add_module(self._final_module) + if cleanup: + weakref.finalize(self, cleanup) + self._finalize_specific() + + self._finalized = True + + if config.DUMP_OPTIMIZED: + dump("OPTIMIZED DUMP %s" % self.name, self.get_llvm_str(), 'llvm') + + if config.DUMP_ASSEMBLY: + dump("ASSEMBLY %s" % self.name, self.get_asm_str(), 'asm') + + def get_defined_functions(self): + """ + Get all functions defined in the library. The library must have + been finalized. + """ + mod = self._final_module + for fn in mod.functions: + if not fn.is_declaration: + yield fn + + def get_function(self, name): + return self._final_module.get_function(name) + + def _sentry_cache_disable_inspection(self): + if self._disable_inspection: + warnings.warn('Inspection disabled for cached code. 
' + 'Invalid result is returned.') + + def get_llvm_str(self): + self._sentry_cache_disable_inspection() + return str(self._final_module) + + def get_asm_str(self): + self._sentry_cache_disable_inspection() + return str(self._codegen._tm.emit_assembly(self._final_module)) + + def get_function_cfg(self, name, py_func=None, **kwargs): + """ + Get control-flow graph of the LLVM function + """ + self._sentry_cache_disable_inspection() + return _CFG(self, name, py_func, **kwargs) + + def get_disasm_cfg(self, mangled_name): + """ + Get the CFG of the disassembly of the ELF object at symbol mangled_name. + + Requires python package: r2pipe + Requires radare2 binary on $PATH. + Notebook rendering requires python package: graphviz + Optionally requires a compiler toolchain (via pycc) to link the ELF to + get better disassembly results. + """ + elf = self._get_compiled_object() + return disassemble_elf_to_cfg(elf, mangled_name) + + @classmethod + def _dump_elf(cls, buf): + """ + Dump the symbol table of an ELF file. + Needs pyelftools (https://github.com/eliben/pyelftools) + """ + from elftools.elf.elffile import ELFFile + from elftools.elf import descriptions + from io import BytesIO + f = ELFFile(BytesIO(buf)) + print("ELF file:") + for sec in f.iter_sections(): + if sec['sh_type'] == 'SHT_SYMTAB': + symbols = sorted(sec.iter_symbols(), key=lambda sym: sym.name) + print(" symbols:") + for sym in symbols: + if not sym.name: + continue + print(" - %r: size=%d, value=0x%x, type=%s, bind=%s" + % (sym.name.decode(), + sym['st_size'], + sym['st_value'], + descriptions.describe_symbol_type(sym['st_info']['type']), + descriptions.describe_symbol_bind(sym['st_info']['bind']), + )) + print() + + @classmethod + def _object_compiled_hook(cls, ll_module, buf): + """ + `ll_module` was compiled into object code `buf`. 
+ """ + try: + self = ll_module.__library + except AttributeError: + return + if self._object_caching_enabled: + self._compiled = True + self._compiled_object = buf + + @classmethod + def _object_getbuffer_hook(cls, ll_module): + """ + Return a cached object code for `ll_module`. + """ + try: + self = ll_module.__library + except AttributeError: + return + if self._object_caching_enabled and self._compiled_object: + buf = self._compiled_object + self._compiled_object = None + return buf + + def serialize_using_bitcode(self): + """ + Serialize this library using its bitcode as the cached representation. + """ + self._ensure_finalized() + return (self.name, 'bitcode', self._final_module.as_bitcode()) + + def serialize_using_object_code(self): + """ + Serialize this library using its object code as the cached + representation. We also include its bitcode for further inlining + with other libraries. + """ + self._ensure_finalized() + data = (self._get_compiled_object(), + self._get_module_for_linking().as_bitcode()) + return (self.name, 'object', data) + + @classmethod + def _unserialize(cls, codegen, state): + name, kind, data = state + self = codegen.create_library(name) + assert isinstance(self, cls) + if kind == 'bitcode': + # No need to re-run optimizations, just make the module ready + self._final_module = ll.parse_bitcode(data) + self._finalize_final_module() + return self + elif kind == 'object': + object_code, shared_bitcode = data + self.enable_object_caching() + self._set_compiled_object(object_code) + self._shared_module = ll.parse_bitcode(shared_bitcode) + self._finalize_final_module() + # Load symbols from cache + self._codegen._engine._load_defined_symbols(self._shared_module) + return self + else: + raise ValueError("unsupported serialization kind %r" % (kind,)) + + +class AOTCodeLibrary(CPUCodeLibrary): + + def emit_native_object(self): + """ + Return this library as a native object (a bytestring) -- for example + ELF under Linux. 
+ + This function implicitly calls .finalize(). + """ + self._ensure_finalized() + return self._codegen._tm.emit_object(self._final_module) + + def emit_bitcode(self): + """ + Return this library as LLVM bitcode (a bytestring). + + This function implicitly calls .finalize(). + """ + self._ensure_finalized() + return self._final_module.as_bitcode() + + def _finalize_specific(self): + pass + + +class JITCodeLibrary(CPUCodeLibrary): + + def get_pointer_to_function(self, name): + """ + Generate native code for function named *name* and return a pointer + to the start of the function (as an integer). + + This function implicitly calls .finalize(). + + Returns + ------- + pointer : int + - zero (null) if no symbol of *name* is defined by this code + library. + - non-zero if the symbol is defined. + """ + self._ensure_finalized() + ee = self._codegen._engine + if not ee.is_symbol_defined(name): + return 0 + else: + return self._codegen._engine.get_function_address(name) + + def _finalize_specific(self): + self._codegen._scan_and_fix_unresolved_refs(self._final_module) + with self._recorded_timings.record("Finalize object"): + self._codegen._engine.finalize_object() + + +class RuntimeLinker(object): + """ + For tracking unresolved symbols generated at runtime due to recursion. + """ + PREFIX = '.numba.unresolved$' + + def __init__(self): + self._unresolved = utils.UniqueDict() + self._defined = set() + self._resolved = [] + + def scan_unresolved_symbols(self, module, engine): + """ + Scan and track all unresolved external symbols in the module and + allocate memory for it. 
+ """ + prefix = self.PREFIX + + for gv in module.global_variables: + if gv.name.startswith(prefix): + sym = gv.name[len(prefix):] + # Avoid remapping to existing GV + if engine.is_symbol_defined(gv.name): + continue + # Allocate a memory space for the pointer + abortfn = rtsys.library.get_pointer_to_function("nrt_unresolved_abort") + ptr = ctypes.c_void_p(abortfn) + engine.add_global_mapping(gv, ctypes.addressof(ptr)) + self._unresolved[sym] = ptr + + def scan_defined_symbols(self, module): + """ + Scan and track all defined symbols. + """ + for fn in module.functions: + if not fn.is_declaration: + self._defined.add(fn.name) + + def resolve(self, engine): + """ + Fix unresolved symbols if they are defined. + """ + # An iterator to get all unresolved but available symbols + pending = [name for name in self._unresolved if name in self._defined] + # Resolve pending symbols + for name in pending: + # Get runtime address + fnptr = engine.get_function_address(name) + # Fix all usage + ptr = self._unresolved[name] + ptr.value = fnptr + self._resolved.append((name, ptr)) # keep ptr alive + # Delete resolved + del self._unresolved[name] + +def _proxy(old): + @functools.wraps(old) + def wrapper(self, *args, **kwargs): + return old(self._ee, *args, **kwargs) + return wrapper + + +class JitEngine(object): + """Wraps an ExecutionEngine to provide custom symbol tracking. + Since the symbol tracking is incomplete (doesn't consider + loaded code object), we are not putting it in llvmlite. + """ + def __init__(self, ee): + self._ee = ee + # Track symbol defined via codegen'd Module + # but not any cached object. + # NOTE: `llvm::ExecutionEngine` will catch duplicated symbols and + # we are not going to protect against that. A proper duplicated + # symbol detection will need a more logic to check for the linkage + # (e.g. like `weak` linkage symbol can override). 
This + # `_defined_symbols` set will be just enough to tell if a symbol + # exists and will not cause the `EE` symbol lookup to `exit(1)` + # when symbol-not-found. + self._defined_symbols = set() + + def is_symbol_defined(self, name): + """Is the symbol defined in this session? + """ + return name in self._defined_symbols + + def _load_defined_symbols(self, mod): + """Extract symbols from the module + """ + for gsets in (mod.functions, mod.global_variables): + self._defined_symbols |= {gv.name for gv in gsets + if not gv.is_declaration} + + def add_module(self, module): + """Override ExecutionEngine.add_module + to keep info about defined symbols. + """ + self._load_defined_symbols(module) + return self._ee.add_module(module) + + def add_global_mapping(self, gv, addr): + """Override ExecutionEngine.add_global_mapping + to keep info about defined symbols. + """ + self._defined_symbols.add(gv.name) + return self._ee.add_global_mapping(gv, addr) + + # + # The remaining methods are re-export of the ExecutionEngine APIs + # + set_object_cache = _proxy(ll.ExecutionEngine.set_object_cache) + finalize_object = _proxy(ll.ExecutionEngine.finalize_object) + get_function_address = _proxy(ll.ExecutionEngine.get_function_address) + get_global_value_address = _proxy( + ll.ExecutionEngine.get_global_value_address + ) + + +class Codegen(metaclass=ABCMeta): + """ + Base Codegen class. It is expected that subclasses set the class attribute + ``_library_class``, indicating the CodeLibrary class for the target. + + Subclasses should also initialize: + + ``self._data_layout``: the data layout for the target. + ``self._target_data``: the binding layer ``TargetData`` for the target. + """ + + @abstractmethod + def _create_empty_module(self, name): + """ + Create a new empty module suitable for the target. + """ + + @abstractmethod + def _add_module(self, module): + """ + Add a module to the execution engine. Ownership of the module is + transferred to the engine. 
+ """ + + @property + def target_data(self): + """ + The LLVM "target data" object for this codegen instance. + """ + return self._target_data + + def create_library(self, name, **kwargs): + """ + Create a :class:`CodeLibrary` object for use with this codegen + instance. + """ + return self._library_class(self, name, **kwargs) + + def unserialize_library(self, serialized): + return self._library_class._unserialize(self, serialized) + + +class CPUCodegen(Codegen): + + def __init__(self, module_name): + initialize_llvm() + + self._data_layout = None + self._llvm_module = ll.parse_assembly( + str(self._create_empty_module(module_name))) + self._llvm_module.name = "global_codegen_module" + self._rtlinker = RuntimeLinker() + self._init(self._llvm_module) + + def _init(self, llvm_module): + assert list(llvm_module.global_variables) == [], "Module isn't empty" + + target = ll.Target.from_triple(ll.get_process_triple()) + tm_options = dict(opt=config.OPT) + self._tm_features = self._customize_tm_features() + self._customize_tm_options(tm_options) + tm = target.create_target_machine(**tm_options) + engine = ll.create_mcjit_compiler(llvm_module, tm) + + if config.ENABLE_PROFILING: + engine.enable_jit_events() + + self._tm = tm + self._engine = JitEngine(engine) + self._target_data = engine.target_data + self._data_layout = str(self._target_data) + self._mpm_cheap = self._module_pass_manager(loop_vectorize=False, + slp_vectorize=False, + opt=0, + cost="cheap") + self._mpm_full = self._module_pass_manager() + + self._engine.set_object_cache(self._library_class._object_compiled_hook, + self._library_class._object_getbuffer_hook) + + def _create_empty_module(self, name): + ir_module = llvmir.Module(cgutils.normalize_ir_text(name)) + ir_module.triple = ll.get_process_triple() + if self._data_layout: + ir_module.data_layout = self._data_layout + return ir_module + + def _module_pass_manager(self, **kwargs): + pm = ll.create_module_pass_manager() + self._tm.add_analysis_passes(pm) 
+ cost = kwargs.pop("cost", None) + with self._pass_manager_builder(**kwargs) as pmb: + pmb.populate(pm) + # If config.OPT==0 do not include these extra passes to help with + # vectorization. + if cost is not None and cost == "cheap" and config.OPT != 0: + # This knocks loops into rotated form early to reduce the likelihood + # of vectorization failing due to unknown PHI nodes. + pm.add_loop_rotate_pass() + # LLVM 11 added LFTR to the IV Simplification pass, this interacted + # badly with the existing use of the InstructionCombiner here and + # ended up with PHI nodes that prevented vectorization from + # working. The desired vectorization effects can be achieved + # with this in LLVM 11 (and also < 11) but at a potentially + # slightly higher cost: + pm.add_licm_pass() + pm.add_cfg_simplification_pass() + if config.LLVM_REFPRUNE_PASS: + pm.add_refprune_pass(_parse_refprune_flags()) + return pm + + def _function_pass_manager(self, llvm_module, **kwargs): + pm = ll.create_function_pass_manager(llvm_module) + self._tm.add_analysis_passes(pm) + with self._pass_manager_builder(**kwargs) as pmb: + pmb.populate(pm) + if config.LLVM_REFPRUNE_PASS: + pm.add_refprune_pass(_parse_refprune_flags()) + return pm + + def _pass_manager_builder(self, **kwargs): + """ + Create a PassManagerBuilder. + + Note: a PassManagerBuilder seems good only for one use, so you + should call this method each time you want to populate a module + or function pass manager. Otherwise some optimizations will be + missed... + """ + opt_level = kwargs.pop('opt', config.OPT) + loop_vectorize = kwargs.pop('loop_vectorize', config.LOOP_VECTORIZE) + slp_vectorize = kwargs.pop('slp_vectorize', config.SLP_VECTORIZE) + + pmb = create_pass_manager_builder(opt=opt_level, + loop_vectorize=loop_vectorize, + slp_vectorize=slp_vectorize, + **kwargs) + + return pmb + + def _check_llvm_bugs(self): + """ + Guard against some well-known LLVM bug(s). 
+ """ + # Check the locale bug at https://github.com/numba/numba/issues/1569 + # Note we can't cache the result as locale settings can change + # across a process's lifetime. Also, for this same reason, + # the check here is a mere heuristic (there may be a race condition + # between now and actually compiling IR). + ir = """ + define double @func() + { + ret double 1.23e+01 + } + """ + mod = ll.parse_assembly(ir) + ir_out = str(mod) + if "12.3" in ir_out or "1.23" in ir_out: + # Everything ok + return + if "1.0" in ir_out: + loc = locale.getlocale() + raise RuntimeError( + "LLVM will produce incorrect floating-point code " + "in the current locale %s.\nPlease read " + "https://numba.readthedocs.io/en/stable/user/faq.html#llvm-locale-bug " + "for more information." + % (loc,)) + raise AssertionError("Unexpected IR:\n%s\n" % (ir_out,)) + + def magic_tuple(self): + """ + Return a tuple unambiguously describing the codegen behaviour. + """ + return (self._llvm_module.triple, self._get_host_cpu_name(), + self._tm_features) + + def _scan_and_fix_unresolved_refs(self, module): + self._rtlinker.scan_unresolved_symbols(module, self._engine) + self._rtlinker.scan_defined_symbols(module) + self._rtlinker.resolve(self._engine) + + def insert_unresolved_ref(self, builder, fnty, name): + voidptr = llvmir.IntType(8).as_pointer() + ptrname = self._rtlinker.PREFIX + name + llvm_mod = builder.module + try: + fnptr = llvm_mod.get_global(ptrname) + except KeyError: + # Not defined? 
+ fnptr = llvmir.GlobalVariable(llvm_mod, voidptr, name=ptrname) + fnptr.linkage = 'external' + return builder.bitcast(builder.load(fnptr), fnty.as_pointer()) + + def _get_host_cpu_name(self): + return (ll.get_host_cpu_name() + if config.CPU_NAME is None + else config.CPU_NAME) + + def _get_host_cpu_features(self): + if config.CPU_FEATURES is not None: + return config.CPU_FEATURES + return get_host_cpu_features() + + +class AOTCPUCodegen(CPUCodegen): + """ + A codegen implementation suitable for Ahead-Of-Time compilation + (e.g. generation of object files). + """ + + _library_class = AOTCodeLibrary + + def __init__(self, module_name, cpu_name=None): + # By default, use generic cpu model for the arch + self._cpu_name = cpu_name or '' + CPUCodegen.__init__(self, module_name) + + def _customize_tm_options(self, options): + cpu_name = self._cpu_name + if cpu_name == 'host': + cpu_name = self._get_host_cpu_name() + options['cpu'] = cpu_name + options['reloc'] = 'pic' + options['codemodel'] = 'default' + options['features'] = self._tm_features + + def _customize_tm_features(self): + # ISA features are selected according to the requested CPU model + # in _customize_tm_options() + return '' + + def _add_module(self, module): + pass + + +class JITCPUCodegen(CPUCodegen): + """ + A codegen implementation suitable for Just-In-Time compilation. + """ + + _library_class = JITCodeLibrary + + def _customize_tm_options(self, options): + # As long as we don't want to ship the code to another machine, + # we can specialize for this CPU. 
+ options['cpu'] = self._get_host_cpu_name() + # LLVM 7 change: # https://reviews.llvm.org/D47211#inline-425406 + # JIT needs static relocation on x86* + # native target is already initialized from base class __init__ + arch = ll.Target.from_default_triple().name + if arch.startswith('x86'): # one of x86 or x86_64 + reloc_model = 'static' + elif arch.startswith('ppc'): + reloc_model = 'pic' + else: + reloc_model = 'default' + options['reloc'] = reloc_model + options['codemodel'] = 'jitdefault' + + # Set feature attributes (such as ISA extensions) + # This overrides default feature selection by CPU model above + options['features'] = self._tm_features + + # Deal with optional argument to ll.Target.create_target_machine + sig = utils.pysignature(ll.Target.create_target_machine) + if 'jit' in sig.parameters: + # Mark that this is making a JIT engine + options['jit'] = True + + def _customize_tm_features(self): + # For JIT target, we will use LLVM to get the feature map + return self._get_host_cpu_features() + + def _add_module(self, module): + self._engine.add_module(module) + # XXX: disabling remove module due to MCJIT engine leakage in + # removeModule. The removeModule causes consistent access + # violation with certain test combinations. + # # Early bind the engine method to avoid keeping a reference to self. + # return functools.partial(self._engine.remove_module, module) + + def set_env(self, env_name, env): + """Set the environment address. + + Update the GlobalVariable named *env_name* to the address of *env*. + """ + gvaddr = self._engine.get_global_value_address(env_name) + envptr = (ctypes.c_void_p * 1).from_address(gvaddr) + envptr[0] = ctypes.c_void_p(id(env)) + + +def initialize_llvm(): + """Safe to use multiple times. + """ + ll.initialize() + ll.initialize_native_target() + ll.initialize_native_asmprinter() + + +def get_host_cpu_features(): + """Get host CPU features using LLVM. + + The features may be modified due to user setting. 
+ See numba.config.ENABLE_AVX. + """ + try: + features = ll.get_host_cpu_features() + except RuntimeError: + return '' + else: + if not config.ENABLE_AVX: + # Disable all features with name starting with 'avx' + for k in features: + if k.startswith('avx'): + features[k] = False + + # Set feature attributes + return features.flatten() diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/compiler.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/compiler.py new file mode 100644 index 0000000000000000000000000000000000000000..1ff3762803aa97828017272a7673f4bfd8445c86 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/compiler.py @@ -0,0 +1,790 @@ +from collections import namedtuple +import copy +import warnings +from numba.core.tracing import event + +from numba.core import (utils, errors, typing, interpreter, bytecode, postproc, + config, callconv, cpu) +from numba.parfors.parfor import ParforDiagnostics +from numba.core.errors import CompilerError +from numba.core.environment import lookup_environment + +from numba.core.compiler_machinery import PassManager + +from numba.core.untyped_passes import (ExtractByteCode, TranslateByteCode, + FixupArgs, IRProcessing, DeadBranchPrune, + RewriteSemanticConstants, + InlineClosureLikes, GenericRewrites, + WithLifting, InlineInlinables, + FindLiterallyCalls, + MakeFunctionToJitFunction, + CanonicalizeLoopExit, + CanonicalizeLoopEntry, LiteralUnroll, + ReconstructSSA, + LiteralPropagationSubPipelinePass, + ) + +from numba.core.typed_passes import (NopythonTypeInference, AnnotateTypes, + NopythonRewrites, PreParforPass, + ParforPass, DumpParforDiagnostics, + IRLegalization, NoPythonBackend, + InlineOverloads, PreLowerStripPhis, + NativeLowering, + NoPythonSupportedFeatureValidation, + ) + +from numba.core.object_mode_passes import (ObjectModeFrontEnd, + ObjectModeBackEnd) +from numba.core.targetconfig import TargetConfig, Option, ConfigStack + + +class Flags(TargetConfig): + 
enable_looplift = Option( + type=bool, + default=False, + doc="Enable loop-lifting", + ) + enable_pyobject = Option( + type=bool, + default=False, + doc="Enable pyobject mode (in general)", + ) + enable_pyobject_looplift = Option( + type=bool, + default=False, + doc="Enable pyobject mode inside lifted loops", + ) + enable_ssa = Option( + type=bool, + default=True, + doc="Enable SSA", + ) + force_pyobject = Option( + type=bool, + default=False, + doc="Force pyobject mode inside the whole function", + ) + release_gil = Option( + type=bool, + default=False, + doc="Release GIL inside the native function", + ) + no_compile = Option( + type=bool, + default=False, + doc="TODO", + ) + debuginfo = Option( + type=bool, + default=False, + doc="TODO", + ) + boundscheck = Option( + type=bool, + default=False, + doc="TODO", + ) + forceinline = Option( + type=bool, + default=False, + doc="Force inlining of the function. Overrides _dbg_optnone.", + ) + no_cpython_wrapper = Option( + type=bool, + default=False, + doc="TODO", + ) + no_cfunc_wrapper = Option( + type=bool, + default=False, + doc="TODO", + ) + auto_parallel = Option( + type=cpu.ParallelOptions, + default=cpu.ParallelOptions(False), + doc="""Enable automatic parallel optimization, can be fine-tuned by +taking a dictionary of sub-options instead of a boolean, see parfor.py for +detail""", + ) + nrt = Option( + type=bool, + default=False, + doc="TODO", + ) + no_rewrites = Option( + type=bool, + default=False, + doc="TODO", + ) + error_model = Option( + type=str, + default="python", + doc="TODO", + ) + fastmath = Option( + type=cpu.FastMathOptions, + default=cpu.FastMathOptions(False), + doc="TODO", + ) + noalias = Option( + type=bool, + default=False, + doc="TODO", + ) + inline = Option( + type=cpu.InlineOptions, + default=cpu.InlineOptions("never"), + doc="TODO", + ) + # Defines a new target option for tracking the "target backend". + # This will be the XYZ in @jit(_target=XYZ). 
+ target_backend = Option( + type=str, + default="cpu", # if not set, default to CPU + doc="backend" + ) + + dbg_extend_lifetimes = Option( + type=bool, + default=False, + doc=("Extend variable lifetime for debugging. " + "This automatically turns on with debug=True."), + ) + + dbg_optnone = Option( + type=bool, + default=False, + doc=("Disable optimization for debug. " + "Equivalent to adding optnone attribute in the LLVM Function.") + ) + + +DEFAULT_FLAGS = Flags() +DEFAULT_FLAGS.nrt = True + + +CR_FIELDS = ["typing_context", + "target_context", + "entry_point", + "typing_error", + "type_annotation", + "signature", + "objectmode", + "lifted", + "fndesc", + "library", + "call_helper", + "environment", + "metadata", + # List of functions to call to initialize on unserialization + # (i.e cache load). + "reload_init", + "referenced_envs", + ] + + +class CompileResult(namedtuple("_CompileResult", CR_FIELDS)): + """ + A structure holding results from the compilation of a function. + """ + + __slots__ = () + + def _reduce(self): + """ + Reduce a CompileResult to picklable components. 
+ """ + libdata = self.library.serialize_using_object_code() + # Make it (un)picklable efficiently + typeann = str(self.type_annotation) + fndesc = self.fndesc + # Those don't need to be pickled and may fail + fndesc.typemap = fndesc.calltypes = None + # Include all referenced environments + referenced_envs = self._find_referenced_environments() + return (libdata, self.fndesc, self.environment, self.signature, + self.objectmode, self.lifted, typeann, self.reload_init, + tuple(referenced_envs)) + + def _find_referenced_environments(self): + """Returns a list of referenced environments + """ + mod = self.library._final_module + # Find environments + referenced_envs = [] + for gv in mod.global_variables: + gvn = gv.name + if gvn.startswith("_ZN08NumbaEnv"): + env = lookup_environment(gvn) + if env is not None: + if env.can_cache(): + referenced_envs.append(env) + return referenced_envs + + @classmethod + def _rebuild(cls, target_context, libdata, fndesc, env, + signature, objectmode, lifted, typeann, + reload_init, referenced_envs): + if reload_init: + # Re-run all + for fn in reload_init: + fn() + + library = target_context.codegen().unserialize_library(libdata) + cfunc = target_context.get_executable(library, fndesc, env) + cr = cls(target_context=target_context, + typing_context=target_context.typing_context, + library=library, + environment=env, + entry_point=cfunc, + fndesc=fndesc, + type_annotation=typeann, + signature=signature, + objectmode=objectmode, + lifted=lifted, + typing_error=None, + call_helper=None, + metadata=None, # Do not store, arbitrary & potentially large! 
def compile_isolated(func, args, return_type=None, flags=DEFAULT_FLAGS,
                     locals=None):
    """
    Compile the function in an isolated environment (typing and target
    context).
    Good for testing.

    A fresh ``typing.Context`` and a ``cpu.CPUContext`` built on it are
    created for the call, registered as the nested context of ``cpu_target``
    and handed to ``compile_extra`` together with the remaining arguments.

    Parameters
    ----------
    func :
        the Python function to compile; forwarded to ``compile_extra``
    args :
        argument types; forwarded to ``compile_extra``
    return_type :
        return type; forwarded to ``compile_extra`` (``None`` by default)
    flags :
        compiler flags; ``DEFAULT_FLAGS`` when omitted
    locals : dict or None
        forwarded to ``compile_extra``; ``None`` (the default) means a new
        empty dict for this call

    Returns
    -------
    Whatever ``compile_extra`` returns.
    """
    from numba.core.registry import cpu_target

    if locals is None:
        # Use a fresh dict per call: a ``{}`` default would be one object
        # shared by every call that omits the argument.
        locals = {}

    typingctx = typing.Context()
    targetctx = cpu.CPUContext(typingctx, target='cpu')
    # Register the contexts so that nested @jit or @overload calls made
    # during compilation see them.
    with cpu_target.nested_context(typingctx, targetctx):
        return compile_extra(typingctx, targetctx, func, args, return_type,
                             flags, locals)
class StateDict(dict):
    """
    A ``dict`` whose items can also be read and written as attributes.

    ``state.x`` falls back to ``state['x']`` when normal attribute lookup
    finds nothing, and ``state.x = v`` stores ``v`` under the key ``'x'``.
    """

    def __getattr__(self, attr):
        # Only invoked when regular attribute lookup has already failed, so
        # dict methods keep working; a missing key is reported as a missing
        # attribute.
        try:
            value = self[attr]
        except KeyError:
            raise AttributeError(attr)
        return value

    def __setattr__(self, attr, value):
        # Every attribute assignment becomes an item assignment.
        self[attr] = value
+ """ + subtargetoptions = {} + if flags.debuginfo: + subtargetoptions['enable_debuginfo'] = True + if flags.boundscheck: + subtargetoptions['enable_boundscheck'] = True + if flags.nrt: + subtargetoptions['enable_nrt'] = True + if flags.auto_parallel: + subtargetoptions['auto_parallel'] = flags.auto_parallel + if flags.fastmath: + subtargetoptions['fastmath'] = flags.fastmath + error_model = callconv.create_error_model(flags.error_model, targetctx) + subtargetoptions['error_model'] = error_model + + return targetctx.subtarget(**subtargetoptions) + + +class CompilerBase(object): + """ + Stores and manages states for the compiler + """ + + def __init__(self, typingctx, targetctx, library, args, return_type, flags, + locals): + # Make sure the environment is reloaded + config.reload_config() + typingctx.refresh() + targetctx.refresh() + + self.state = StateDict() + + self.state.typingctx = typingctx + self.state.targetctx = _make_subtarget(targetctx, flags) + self.state.library = library + self.state.args = args + self.state.return_type = return_type + self.state.flags = flags + self.state.locals = locals + + # Results of various steps of the compilation pipeline + self.state.bc = None + self.state.func_id = None + self.state.func_ir = None + self.state.lifted = None + self.state.lifted_from = None + self.state.typemap = None + self.state.calltypes = None + self.state.type_annotation = None + # holds arbitrary inter-pipeline stage meta data + self.state.metadata = {} + self.state.reload_init = [] + # hold this for e.g. 
with_lifting, null out on exit + self.state.pipeline = self + + # parfor diagnostics info, add to metadata + self.state.parfor_diagnostics = ParforDiagnostics() + self.state.metadata['parfor_diagnostics'] = \ + self.state.parfor_diagnostics + self.state.metadata['parfors'] = {} + + self.state.status = _CompileStatus( + can_fallback=self.state.flags.enable_pyobject + ) + + def compile_extra(self, func): + self.state.func_id = bytecode.FunctionIdentity.from_function(func) + ExtractByteCode().run_pass(self.state) + + self.state.lifted = () + self.state.lifted_from = None + return self._compile_bytecode() + + def compile_ir(self, func_ir, lifted=(), lifted_from=None): + self.state.func_id = func_ir.func_id + self.state.lifted = lifted + self.state.lifted_from = lifted_from + self.state.func_ir = func_ir + self.state.nargs = self.state.func_ir.arg_count + + FixupArgs().run_pass(self.state) + return self._compile_ir() + + def define_pipelines(self): + """Child classes override this to customize the pipelines in use. 
    def _compile_core(self):
        """
        Populate and run compiler pipeline

        The pipelines returned by ``define_pipelines()`` are tried in order
        inside a ``ConfigStack`` context entered with a copy of the state's
        flags.  The loop stops at the first pipeline after which
        ``self.state.cr`` is set, or which raises
        ``_EarlyPipelineCompletion``.

        Returns the ``result`` carried by ``_EarlyPipelineCompletion`` when a
        pipeline completed early, otherwise ``self.state.cr``.

        Raises ``CompilerError`` when the loop finishes without a ``break``.
        An exception raised by a pipeline is re-raised when that pipeline is
        the last one, or when ``utils.use_new_style_errors()`` is true and
        the exception is not an ``errors.NumbaError``; otherwise it is stored
        in ``self.state.status.fail_reason`` and the next pipeline is tried.
        """
        with ConfigStack().enter(self.state.flags.copy()):
            pms = self.define_pipelines()
            for pm in pms:
                pipeline_name = pm.pipeline_name
                func_name = "%s.%s" % (self.state.func_id.modname,
                                       self.state.func_id.func_qualname)

                event("Pipeline: %s for %s" % (pipeline_name, func_name))
                # NOTE: the dict is replaced on every iteration, so it only
                # holds the entry of the pipeline currently being run.
                self.state.metadata['pipeline_times'] = {pipeline_name:
                                                         pm.exec_times}
                # Failures of the last pipeline cannot fall through to
                # another one and are re-raised below.
                is_final_pipeline = pm == pms[-1]
                res = None
                try:
                    pm.run(self.state)
                    if self.state.cr is not None:
                        # This pipeline produced a compile result.
                        break
                except _EarlyPipelineCompletion as e:
                    # The pipeline finished early and carries its own result.
                    res = e.result
                    break
                except Exception as e:
                    if (utils.use_new_style_errors() and not
                            isinstance(e, errors.NumbaError)):
                        raise e

                    # Remember why this pipeline failed before moving on.
                    self.state.status.fail_reason = e
                    if is_final_pipeline:
                        raise e
            else:
                # Reached only when the loop ended without a ``break``.
                raise CompilerError("All available pipelines exhausted")

            # Pipeline is done, remove self reference to release refs to user
            # code
            # NOTE(review): this line is not reached when an exception
            # propagates out of the loop above.
            self.state.pipeline = None

            # organise a return
            if res is not None:
                # Early pipeline completion
                return res
            else:
                assert self.state.cr is not None
                return self.state.cr
it contains the "classic" default + pipelines as pre-canned PassManager instances: + - nopython + - objectmode + - interpreted + - typed + - untyped + - nopython lowering + """ + @staticmethod + def define_nopython_pipeline(state, name='nopython'): + """Returns an nopython mode pipeline based PassManager + """ + # compose pipeline from untyped, typed and lowering parts + dpb = DefaultPassBuilder + pm = PassManager(name) + untyped_passes = dpb.define_untyped_pipeline(state) + pm.passes.extend(untyped_passes.passes) + + typed_passes = dpb.define_typed_pipeline(state) + pm.passes.extend(typed_passes.passes) + + lowering_passes = dpb.define_nopython_lowering_pipeline(state) + pm.passes.extend(lowering_passes.passes) + + pm.finalize() + return pm + + @staticmethod + def define_nopython_lowering_pipeline(state, name='nopython_lowering'): + pm = PassManager(name) + # legalise + pm.add_pass(NoPythonSupportedFeatureValidation, + "ensure features that are in use are in a valid form") + pm.add_pass(IRLegalization, + "ensure IR is legal prior to lowering") + # Annotate only once legalized + pm.add_pass(AnnotateTypes, "annotate types") + # lower + pm.add_pass(NativeLowering, "native lowering") + pm.add_pass(NoPythonBackend, "nopython mode backend") + pm.add_pass(DumpParforDiagnostics, "dump parfor diagnostics") + pm.finalize() + return pm + + @staticmethod + def define_typed_pipeline(state, name="typed"): + """Returns the typed part of the nopython pipeline""" + pm = PassManager(name) + # typing + pm.add_pass(NopythonTypeInference, "nopython frontend") + + # strip phis + pm.add_pass(PreLowerStripPhis, "remove phis nodes") + + # optimisation + pm.add_pass(InlineOverloads, "inline overloaded functions") + if state.flags.auto_parallel.enabled: + pm.add_pass(PreParforPass, "Preprocessing for parfors") + if not state.flags.no_rewrites: + pm.add_pass(NopythonRewrites, "nopython rewrites") + if state.flags.auto_parallel.enabled: + pm.add_pass(ParforPass, "convert to parfors") + + 
pm.finalize() + return pm + + @staticmethod + def define_untyped_pipeline(state, name='untyped'): + """Returns an untyped part of the nopython pipeline""" + pm = PassManager(name) + if state.func_ir is None: + pm.add_pass(TranslateByteCode, "analyzing bytecode") + pm.add_pass(FixupArgs, "fix up args") + pm.add_pass(IRProcessing, "processing IR") + pm.add_pass(WithLifting, "Handle with contexts") + + # inline closures early in case they are using nonlocal's + # see issue #6585. + pm.add_pass(InlineClosureLikes, + "inline calls to locally defined closures") + + # pre typing + if not state.flags.no_rewrites: + pm.add_pass(RewriteSemanticConstants, "rewrite semantic constants") + pm.add_pass(DeadBranchPrune, "dead branch pruning") + pm.add_pass(GenericRewrites, "nopython rewrites") + + # convert any remaining closures into functions + pm.add_pass(MakeFunctionToJitFunction, + "convert make_function into JIT functions") + # inline functions that have been determined as inlinable and rerun + # branch pruning, this needs to be run after closures are inlined as + # the IR repr of a closure masks call sites if an inlinable is called + # inside a closure + pm.add_pass(InlineInlinables, "inline inlinable functions") + if not state.flags.no_rewrites: + pm.add_pass(DeadBranchPrune, "dead branch pruning") + + pm.add_pass(FindLiterallyCalls, "find literally calls") + pm.add_pass(LiteralUnroll, "handles literal_unroll") + + if state.flags.enable_ssa: + pm.add_pass(ReconstructSSA, "ssa") + + pm.add_pass(LiteralPropagationSubPipelinePass, "Literal propagation") + + pm.finalize() + return pm + + @staticmethod + def define_objectmode_pipeline(state, name='object'): + """Returns an object-mode pipeline based PassManager + """ + pm = PassManager(name) + if state.func_ir is None: + pm.add_pass(TranslateByteCode, "analyzing bytecode") + pm.add_pass(FixupArgs, "fix up args") + else: + # Reaches here if it's a fallback from nopython mode. + # Strip the phi nodes. 
def compile_extra(typingctx, targetctx, func, args, return_type, flags,
                  locals, library=None, pipeline_class=Compiler):
    """Compiler entry point

    Builds a ``pipeline_class`` instance from the given contexts and options
    and asks it to compile ``func``.

    Parameters
    ----------
    typingctx :
        typing context
    targetctx :
        target context
    func : function
        the python function to be compiled
    args : tuple, list
        argument types
    return_type :
        Use ``None`` to indicate void return
    flags : numba.compiler.Flags
        compiler flags
    locals :
        passed unchanged to the pipeline constructor
    library : numba.codegen.CodeLibrary
        Used to store the compiled code.
        If it is ``None``, a new CodeLibrary is used.
    pipeline_class : type like numba.compiler.CompilerBase
        compiler pipeline

    Returns
    -------
    The value returned by the pipeline's ``compile_extra(func)``.
    """
    return pipeline_class(
        typingctx, targetctx, library, args, return_type, flags, locals,
    ).compile_extra(func)
+ """ + + # This is a special branch that should only run on IR from a lifted loop + if is_lifted_loop: + # This code is pessimistic and costly, but it is a not often trodden + # path and it will go away once IR is made immutable. The problem is + # that the rewrite passes can mutate the IR into a state that makes + # it possible for invalid tokens to be transmitted to lowering which + # then trickle through into LLVM IR and causes RuntimeErrors as LLVM + # cannot compile it. As a result the following approach is taken: + # 1. Create some new flags that copy the original ones but switch + # off rewrites. + # 2. Compile with 1. to get a compile result + # 3. Try and compile another compile result but this time with the + # original flags (and IR being rewritten). + # 4. If 3 was successful, use the result, else use 2. + + # create flags with no rewrites + norw_flags = copy.deepcopy(flags) + norw_flags.no_rewrites = True + + def compile_local(the_ir, the_flags): + pipeline = pipeline_class(typingctx, targetctx, library, + args, return_type, the_flags, locals) + return pipeline.compile_ir(func_ir=the_ir, lifted=lifted, + lifted_from=lifted_from) + + # compile with rewrites off, IR shouldn't be mutated irreparably + norw_cres = compile_local(func_ir.copy(), norw_flags) + + # try and compile with rewrites on if no_rewrites was not set in the + # original flags, IR might get broken but we've got a CompileResult + # that's usable from above. 
class _CompilerLock(object):
    """Lock preventing concurrent compiler execution.

    Wraps a ``threading.RLock`` and brackets the time between acquisition
    and release with the ``"numba:compiler_lock"`` event.  Instances can be
    used as a context manager or as a decorator (see ``__call__``).
    """

    def __init__(self):
        self._lock = threading.RLock()

    def acquire(self):
        """Start the lock event, then acquire the underlying lock."""
        ev.start_event("numba:compiler_lock")
        self._lock.acquire()

    def release(self):
        """Release the underlying lock, then end the lock event."""
        self._lock.release()
        ev.end_event("numba:compiler_lock")

    def __enter__(self):
        self.acquire()

    def __exit__(self, exc_type, exc_val, traceback):
        # The exception details are ignored; the lock is always released
        # and the exception (if any) propagates.
        self.release()

    def is_locked(self):
        """Return whether the lock is held, as reported by ``_is_owned``.

        Uses the lock object's own ``_is_owned`` when it has a callable one
        and falls back to ``self._is_owned`` otherwise.
        """
        # Supply a default so that a lock object without ``_is_owned``
        # reaches the fallback instead of raising AttributeError.
        is_owned = getattr(self._lock, '_is_owned', None)
        if not callable(is_owned):
            is_owned = self._is_owned
        return is_owned()

    def __call__(self, func):
        """Decorate *func* so that it runs while holding this lock."""
        @functools.wraps(func)
        def _acquire_compile_lock(*args, **kwargs):
            with self:
                return func(*args, **kwargs)
        return _acquire_compile_lock

    def _is_owned(self):
        # This method is borrowed from threading.Condition.
        # Return True if lock is owned by current_thread.
        # This method is called only if _lock doesn't have _is_owned().
        # NOTE(review): a non-blocking acquire succeeds on a re-entrant lock
        # already held by the calling thread, so this probe would report
        # False in that case -- confirm before relying on the fallback with
        # a re-entrant lock.
        if self._lock.acquire(0):
            self._lock.release()
            return False
        else:
            return True
    def run_finalizer(self, *args, **kwargs):
        """
        Runs the finalization sequence for the pass, will run after
        `run_pass`.

        This base implementation does nothing and returns False.
        """
        return False
+ """ + + def __init__(self): + self._required = set() + self._preserved = set() + + def get_required_set(self): + return self._required + + def get_preserved_set(self): + return self._preserved + + def add_required(self, pss): + self._required.add(pss) + + def add_preserved(self, pss): + self._preserved.add(pss) + + def __str__(self): + return "required: %s\n" % self._required + + +_DEBUG = False + + +def debug_print(*args, **kwargs): + if _DEBUG: + print(*args, **kwargs) + + +pass_timings = namedtuple('pass_timings', 'init run finalize') + + +class PassManager(object): + """ + The PassManager is a named instance of a particular compilation pipeline + """ + # TODO: Eventually enable this, it enforces self consistency after each pass + _ENFORCING = False + + def __init__(self, pipeline_name): + """ + Create a new pipeline with name "pipeline_name" + """ + self.passes = [] + self.exec_times = OrderedDict() + self._finalized = False + self._analysis = None + self._print_after = None + self.pipeline_name = pipeline_name + + def _validate_pass(self, pass_cls): + if (not (isinstance(pass_cls, str) or + (inspect.isclass(pass_cls) and + issubclass(pass_cls, CompilerPass)))): + msg = ("Pass must be referenced by name or be a subclass of a " + "CompilerPass. Have %s" % pass_cls) + raise TypeError(msg) + if isinstance(pass_cls, str): + pass_cls = _pass_registry.find_by_name(pass_cls) + else: + if not _pass_registry.is_registered(pass_cls): + raise ValueError("Pass %s is not registered" % pass_cls) + + def add_pass(self, pss, description=""): + """ + Append a pass to the PassManager's compilation pipeline + """ + self._validate_pass(pss) + func_desc_tuple = (pss, description) + self.passes.append(func_desc_tuple) + self._finalized = False + + def add_pass_after(self, pass_cls, location): + """ + Add a pass `pass_cls` to the PassManager's compilation pipeline after + the pass `location`. 
+ """ + assert self.passes + self._validate_pass(pass_cls) + self._validate_pass(location) + for idx, (x, _) in enumerate(self.passes): + if x == location: + break + else: + raise ValueError("Could not find pass %s" % location) + self.passes.insert(idx + 1, (pass_cls, str(pass_cls))) + # if a pass has been added, it's not finalized + self._finalized = False + + def _debug_init(self): + # determine after which passes IR dumps should take place + def parse(conf_item): + print_passes = [] + if conf_item != "none": + if conf_item == "all": + print_passes = [x.name() for (x, _) in self.passes] + else: + # we don't validate whether the named passes exist in this + # pipeline the compiler may be used reentrantly and + # different pipelines may contain different passes + splitted = conf_item.split(',') + print_passes = [x.strip() for x in splitted] + return print_passes + ret = (parse(config.DEBUG_PRINT_AFTER), + parse(config.DEBUG_PRINT_BEFORE), + parse(config.DEBUG_PRINT_WRAP),) + return ret + + def finalize(self): + """ + Finalize the PassManager, after which no more passes may be added + without re-finalization. + """ + self._analysis = self.dependency_analysis() + self._print_after, self._print_before, self._print_wrap = \ + self._debug_init() + self._finalized = True + + @property + def finalized(self): + return self._finalized + + def _patch_error(self, desc, exc): + """ + Patches the error to show the stage that it arose in. + """ + newmsg = "{desc}\n{exc}".format(desc=desc, exc=exc) + exc.args = (newmsg,) + return exc + + @global_compiler_lock # this need a lock, likely calls LLVM + def _runPass(self, index, pss, internal_state): + mutated = False + + def check(func, compiler_state): + mangled = func(compiler_state) + if mangled not in (True, False): + msg = ("CompilerPass implementations should return True/False. 
" + "CompilerPass with name '%s' did not.") + raise ValueError(msg % pss.name()) + return mangled + + def debug_print(pass_name, print_condition, printable_condition): + if pass_name in print_condition: + fid = internal_state.func_id + args = (fid.modname, fid.func_qualname, self.pipeline_name, + printable_condition, pass_name) + print(("%s.%s: %s: %s %s" % args).center(120, '-')) + if internal_state.func_ir is not None: + internal_state.func_ir.dump() + else: + print("func_ir is None") + + # debug print before this pass? + debug_print(pss.name(), self._print_before + self._print_wrap, "BEFORE") + + # wire in the analysis info so it's accessible + pss.analysis = self._analysis + + qualname = internal_state.func_id.func_qualname + + ev_details = dict( + name=f"{pss.name()} [{qualname}]", + qualname=qualname, + module=internal_state.func_id.modname, + flags=pformat(internal_state.flags.values()), + args=str(internal_state.args), + return_type=str(internal_state.return_type), + ) + with ev.trigger_event("numba:run_pass", data=ev_details): + with SimpleTimer() as init_time: + mutated |= check(pss.run_initialization, internal_state) + with SimpleTimer() as pass_time: + mutated |= check(pss.run_pass, internal_state) + with SimpleTimer() as finalize_time: + mutated |= check(pss.run_finalizer, internal_state) + + # Check that if the pass is an instance of a FunctionPass that it hasn't + # emitted ir.Dels. 
+ if isinstance(pss, FunctionPass): + enforce_no_dels(internal_state.func_ir) + + if self._ENFORCING: + # TODO: Add in self consistency enforcement for + # `func_ir._definitions` etc + if _pass_registry.get(pss.__class__).mutates_CFG: + if mutated: # block level changes, rebuild all + PostProcessor(internal_state.func_ir).run() + else: # CFG level changes rebuild CFG + internal_state.func_ir.blocks = transforms.canonicalize_cfg( + internal_state.func_ir.blocks) + # Check the func_ir has exactly one Scope instance + if not legalize_single_scope(internal_state.func_ir.blocks): + raise errors.CompilerError( + f"multiple scope in func_ir detected in {pss}", + ) + # inject runtimes + pt = pass_timings(init_time.elapsed, pass_time.elapsed, + finalize_time.elapsed) + self.exec_times["%s_%s" % (index, pss.name())] = pt + + # debug print after this pass? + debug_print(pss.name(), self._print_after + self._print_wrap, "AFTER") + + def run(self, state): + """ + Run the defined pipelines on the state. 
+ """ + from numba.core.compiler import _EarlyPipelineCompletion + if not self.finalized: + raise RuntimeError("Cannot run non-finalised pipeline") + + # walk the passes and run them + for idx, (pss, pass_desc) in enumerate(self.passes): + try: + event("-- %s" % pass_desc) + pass_inst = _pass_registry.get(pss).pass_inst + if isinstance(pass_inst, CompilerPass): + self._runPass(idx, pass_inst, state) + else: + raise BaseException("Legacy pass in use") + except _EarlyPipelineCompletion as e: + raise e + except Exception as e: + if (utils.use_new_style_errors() and not + isinstance(e, errors.NumbaError)): + raise e + msg = "Failed in %s mode pipeline (step: %s)" % \ + (self.pipeline_name, pass_desc) + patched_exception = self._patch_error(msg, e) + raise patched_exception + + def dependency_analysis(self): + """ + Computes dependency analysis + """ + deps = dict() + for (pss, _) in self.passes: + x = _pass_registry.get(pss).pass_inst + au = AnalysisUsage() + x.get_analysis_usage(au) + deps[type(x)] = au + + requires_map = dict() + for k, v in deps.items(): + requires_map[k] = v.get_required_set() + + def resolve_requires(key, rmap): + def walk(lkey, rmap): + dep_set = rmap[lkey] if lkey in rmap else set() + if dep_set: + for x in dep_set: + dep_set |= (walk(x, rmap)) + return dep_set + else: + return set() + ret = set() + for k in key: + ret |= walk(k, rmap) + return ret + + dep_chain = dict() + for k, v in requires_map.items(): + dep_chain[k] = set(v) | (resolve_requires(v, requires_map)) + + return dep_chain + + +pass_info = namedtuple('pass_info', 'pass_inst mutates_CFG analysis_only') + + +class PassRegistry(object): + """ + Pass registry singleton class. 
pass_info = namedtuple('pass_info', 'pass_inst mutates_CFG analysis_only')


class PassRegistry(object):
    """
    Pass registry singleton class.

    Maps each registered pass class to a ``pass_info`` record holding an
    instance of the pass together with its ``mutates_CFG`` and
    ``analysis_only`` flags.
    """

    # Next pass id to hand out.
    _id = 0

    # pass class -> pass_info
    _registry = dict()

    @staticmethod
    def _pass_name(pass_inst):
        """Return the name of *pass_inst* as a string.

        ``name`` is invoked as a callable by ``register`` below, so it has to
        be called before comparing it with a string; a plain attribute is
        accepted as well.
        """
        name = pass_inst.name
        return name() if callable(name) else name

    def register(self, mutates_CFG, analysis_only):
        """Return a class decorator that registers the decorated pass.

        The decorated class must not be registered already and its name must
        not clash with an already registered pass. It is assigned a
        ``pass_id`` and instantiated once (with no arguments) for storage in
        the registry.
        """
        def make_festive(pass_class):
            assert not self.is_registered(pass_class)
            assert not self._does_pass_name_alias(pass_class.name())
            pass_class.pass_id = self._id
            self._id += 1
            self._registry[pass_class] = pass_info(pass_class(), mutates_CFG,
                                                   analysis_only)
            return pass_class
        return make_festive

    def is_registered(self, clazz):
        """Return True if the pass class *clazz* has been registered."""
        return clazz in self._registry

    def get(self, clazz):
        """Return the ``pass_info`` record of the registered class *clazz*."""
        assert self.is_registered(clazz)
        return self._registry[clazz]

    def _does_pass_name_alias(self, check):
        """Return True if a registered pass is already named *check*."""
        return any(self._pass_name(info.pass_inst) == check
                   for info in self._registry.values())

    def find_by_name(self, class_name):
        """Return the ``pass_info`` of the pass named *class_name*.

        Raises ``ValueError`` if no registered pass has that name.
        """
        assert isinstance(class_name, str)
        for info in self._registry.values():
            if self._pass_name(info.pass_inst) == class_name:
                return info
        raise ValueError("No pass with name %s is registered" % class_name)

    def dump(self):
        """Print every registered pass class with its ``pass_info``."""
        for k, v in self._registry.items():
            print("%s: %s" % (k, v))
def _parse_cc(text):
    """
    Parse CUDA compute capability version string.

    Returns ``None`` for an empty/missing *text*, otherwise a
    ``(major, minor)`` tuple of ints taken from the leading
    ``"major.minor"`` part of the string. Raises ``ValueError`` when the
    string does not start with that pattern.
    """
    if not text:
        return None
    matched = re.match(r'(\d+)\.(\d+)', text)
    if not matched:
        raise ValueError("Compute capability must be specified as a "
                         "string of \"major.minor\" where major "
                         "and minor are decimals")
    major, minor = matched.groups()
    return int(major), int(minor)


def _os_supports_avx():
    """
    Whether the current OS supports AVX, regardless of the CPU.

    This is necessary because the user may be running a very old Linux
    kernel (e.g. CentOS 5) on a recent CPU.
    """
    x86_machines = ('i386', 'i586', 'i686', 'x86_64')
    if not (sys.platform.startswith('linux')
            and platform.machine() in x86_machines):
        # Only x86 Linux is inspected; everything else is assumed fine.
        return True
    # Executing the CPUID instruction may report AVX available even though
    # the kernel doesn't support it, so parse /proc/cpuinfo instead.
    try:
        cpuinfo = open('/proc/cpuinfo', 'r')
    except OSError:
        # If /proc isn't available, assume yes
        return True
    with cpuinfo:
        for entry in cpuinfo:
            key, _, flags = entry.partition(':')
            if key.strip() == 'flags' and 'avx' in flags.split():
                return True
        return False
# Choose how to handle captured errors
def _validate_captured_errors_style(style_str):
    """Return *style_str* as a string if it names a known error style.

    Raises ``ValueError`` for anything other than ``'new_style'`` or
    ``'old_style'``.
    """
    rendered_style = str(style_str)
    if rendered_style not in ('new_style', 'old_style'):
        msg = ("Invalid style in NUMBA_CAPTURED_ERRORS: "
               f"{rendered_style}")
        raise ValueError(msg)
    else:
        return rendered_style


class _EnvReloader(object):
    """Loads the configuration and publishes it as module globals.

    Settings come from the optional config file named by ``_config_fname``
    and from ``NUMBA_*`` environment variables; environment variables take
    precedence over the file.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget the last seen environment and force a full reload."""
        self.old_environ = {}
        self.update(force=True)

    def update(self, force=False):
        """Re-read the configuration sources and apply them if they changed.

        The settings are re-processed only when *force* is true or when the
        collected ``NUMBA_*`` values differ from the previous call;
        ``validate`` runs on every call.
        """
        new_environ = {}

        # first check if there's a .numba_config.yaml and use values from that
        if os.path.exists(_config_fname) and os.path.isfile(_config_fname):
            if not _HAVE_YAML:
                msg = ("A Numba config file is found but YAML parsing "
                       "capabilities appear to be missing. "
                       "To use this feature please install `pyyaml`. e.g. "
                       "`conda install pyyaml`.")
                warnings.warn(msg)
            else:
                with open(_config_fname, 'rt') as f:
                    y_conf = yaml.safe_load(f)
                if y_conf is not None:
                    for k, v in y_conf.items():
                        new_environ['NUMBA_' + k.upper()] = v

        # clobber file based config with any locally defined env vars
        for name, value in os.environ.items():
            if name.startswith('NUMBA_'):
                new_environ[name] = value
        # We update the config variables if at least one NUMBA environment
        # variable was modified.  This lets the user modify values
        # directly in the config module without having them overwritten when
        # reload_config() is called by the compiler.
        if force or self.old_environ != new_environ:
            self.process_environ(new_environ)
            # Store a copy
            self.old_environ = dict(new_environ)

        self.validate()

    def validate(self):
        """Check settings that depend on optional packages.

        If the NVIDIA CUDA binding was requested but ``cuda`` cannot be
        imported, a warning is issued and the setting is switched off.
        """
        global CUDA_USE_NVIDIA_BINDING

        if CUDA_USE_NVIDIA_BINDING:  # noqa: F821
            try:
                import cuda  # noqa: F401
            except ImportError as ie:
                msg = ("CUDA Python bindings requested (the environment "
                       "variable NUMBA_CUDA_USE_NVIDIA_BINDING is set), "
                       f"but they are not importable: {ie.msg}.")
                warnings.warn(msg)

                CUDA_USE_NVIDIA_BINDING = False

            if CUDA_PER_THREAD_DEFAULT_STREAM:  # noqa: F821
                warnings.warn("PTDS support is handled by CUDA Python when "
                              "using the NVIDIA binding. Please set the "
                              "environment variable "
                              "CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM to 1 "
                              "instead.")

    @staticmethod
    def _read_setting(environ, name, ctor, default):
        """Return the parsed value of *name* in *environ*, or its default.

        *default* is either a plain value or a zero-argument callable
        producing one; it is only resolved when needed. A value that *ctor*
        fails to parse emits a ``RuntimeWarning`` and falls back to the
        resolved default (never to the callable itself).
        """
        def fallback():
            return default() if callable(default) else default

        value = environ.get(name)
        if value is None:
            return fallback()
        try:
            return ctor(value)
        except Exception:
            warnings.warn("environ %s defined but failed to parse '%s'" %
                          (name, value), RuntimeWarning)
            return fallback()

    def process_environ(self, environ):
        """Parse all settings out of *environ* and publish them.

        Every local variable of this method whose name is upper case is
        copied into the module globals at the end, so helper names in here
        must stay lower case.
        """
        def _readenv(name, ctor, default):
            return self._read_setting(environ, name, ctor, default)

        def optional_str(x):
            return str(x) if x is not None else None

        # developer mode produces full tracebacks, disables help instructions
        DEVELOPER_MODE = _readenv("NUMBA_DEVELOPER_MODE", int, 0)

        # disable performance warnings, will switch off the generation of
        # warnings of the class NumbaPerformanceWarning
        DISABLE_PERFORMANCE_WARNINGS = _readenv(
            "NUMBA_DISABLE_PERFORMANCE_WARNINGS", int, 0)

        # Flag to enable full exception reporting
        FULL_TRACEBACKS = _readenv(
            "NUMBA_FULL_TRACEBACKS", int, DEVELOPER_MODE)

        # Show help text when an error occurs
        SHOW_HELP = _readenv("NUMBA_SHOW_HELP", int, 0)

        # The color scheme to use for error messages, default is no color
        # just bold fonts in use.
        COLOR_SCHEME = _readenv("NUMBA_COLOR_SCHEME", str, "no_color")

        # Whether to globally enable bounds checking. The default None means
        # to use the value of the flag to @njit. 0 or 1 overrides the flag
        # globally.
        BOUNDSCHECK = _readenv("NUMBA_BOUNDSCHECK", int, None)

        # Whether to always warn about potential uninitialized variables
        # because static controlflow analysis cannot find a definition
        # in one or more of the incoming paths.
        ALWAYS_WARN_UNINIT_VAR = _readenv(
            "NUMBA_ALWAYS_WARN_UNINIT_VAR", int, 0,
        )

        # Whether to warn about kernel launches where the grid size will
        # under utilize the GPU due to low occupancy. On by default.
        CUDA_LOW_OCCUPANCY_WARNINGS = _readenv(
            "NUMBA_CUDA_LOW_OCCUPANCY_WARNINGS", int, 1)

        # Whether to use the official CUDA Python API Bindings
        CUDA_USE_NVIDIA_BINDING = _readenv(
            "NUMBA_CUDA_USE_NVIDIA_BINDING", int, 0)

        # Debug flag to control compiler debug print
        DEBUG = _readenv("NUMBA_DEBUG", int, 0)

        # DEBUG print IR after pass names
        DEBUG_PRINT_AFTER = _readenv("NUMBA_DEBUG_PRINT_AFTER", str, "none")

        # DEBUG print IR before pass names
        DEBUG_PRINT_BEFORE = _readenv("NUMBA_DEBUG_PRINT_BEFORE", str, "none")

        # DEBUG print IR before and after pass names
        DEBUG_PRINT_WRAP = _readenv("NUMBA_DEBUG_PRINT_WRAP", str, "none")

        # Highlighting in intermediate dumps
        HIGHLIGHT_DUMPS = _readenv("NUMBA_HIGHLIGHT_DUMPS", int, 0)

        # JIT Debug flag to trigger IR instruction print
        DEBUG_JIT = _readenv("NUMBA_DEBUG_JIT", int, 0)

        # Enable debugging of front-end operation
        # (up to and including IR generation)
        DEBUG_FRONTEND = _readenv("NUMBA_DEBUG_FRONTEND", int, 0)

        # Enable debug prints in nrtdynmod
        DEBUG_NRT = _readenv("NUMBA_DEBUG_NRT", int, 0)

        # How many recently deserialized functions to retain regardless
        # of external references
        FUNCTION_CACHE_SIZE = _readenv("NUMBA_FUNCTION_CACHE_SIZE", int, 128)

        # Maximum tuple size that parfors will unpack and pass to
        # internal gufunc.
        PARFOR_MAX_TUPLE_SIZE = _readenv("NUMBA_PARFOR_MAX_TUPLE_SIZE",
                                         int, 100)

        # Enable logging of cache operation
        DEBUG_CACHE = _readenv("NUMBA_DEBUG_CACHE", int, DEBUG)

        # Redirect cache directory
        # Contains path to the directory
        CACHE_DIR = _readenv("NUMBA_CACHE_DIR", str, "")

        # Enable tracing support
        TRACE = _readenv("NUMBA_TRACE", int, 0)

        # Enable chrome tracing support
        CHROME_TRACE = _readenv("NUMBA_CHROME_TRACE", str, "")

        # Enable debugging of type inference
        DEBUG_TYPEINFER = _readenv("NUMBA_DEBUG_TYPEINFER", int, 0)

        # Configure compilation target to use the specified CPU name
        # and CPU feature as the host information.
        # Note: this overrides "host" option for AOT compilation.
        CPU_NAME = _readenv("NUMBA_CPU_NAME", optional_str, None)
        CPU_FEATURES = _readenv("NUMBA_CPU_FEATURES", optional_str,
                                ("" if str(CPU_NAME).lower() == 'generic'
                                 else None))
        # Optimization level
        OPT = _readenv("NUMBA_OPT", int, 3)

        # Force dump of Python bytecode
        DUMP_BYTECODE = _readenv("NUMBA_DUMP_BYTECODE", int, DEBUG_FRONTEND)

        # Force dump of control flow graph
        DUMP_CFG = _readenv("NUMBA_DUMP_CFG", int, DEBUG_FRONTEND)

        # Force dump of Numba IR
        DUMP_IR = _readenv("NUMBA_DUMP_IR", int,
                           DEBUG_FRONTEND)

        # Force dump of Numba IR in SSA form
        DUMP_SSA = _readenv("NUMBA_DUMP_SSA", int,
                            DEBUG_FRONTEND or DEBUG_TYPEINFER)

        # print debug info of analysis and optimization on array operations
        DEBUG_ARRAY_OPT = _readenv("NUMBA_DEBUG_ARRAY_OPT", int, 0)

        # insert debug stmts to print information at runtime
        DEBUG_ARRAY_OPT_RUNTIME = _readenv(
            "NUMBA_DEBUG_ARRAY_OPT_RUNTIME", int, 0)

        # print stats about parallel for-loops
        DEBUG_ARRAY_OPT_STATS = _readenv("NUMBA_DEBUG_ARRAY_OPT_STATS", int, 0)

        # prints user friendly information about parallel
        PARALLEL_DIAGNOSTICS = _readenv("NUMBA_PARALLEL_DIAGNOSTICS", int, 0)

        # print debug info of inline closure pass
        DEBUG_INLINE_CLOSURE = _readenv("NUMBA_DEBUG_INLINE_CLOSURE", int, 0)

        # Force dump of LLVM IR
        DUMP_LLVM = _readenv("NUMBA_DUMP_LLVM", int, DEBUG)

        # Force dump of Function optimized LLVM IR
        DUMP_FUNC_OPT = _readenv("NUMBA_DUMP_FUNC_OPT", int, DEBUG)

        # Force dump of Optimized LLVM IR
        DUMP_OPTIMIZED = _readenv("NUMBA_DUMP_OPTIMIZED", int, DEBUG)

        # Force disable loop vectorize
        # Loop vectorizer is disabled on 32-bit win32 due to a bug (#649)
        LOOP_VECTORIZE = _readenv("NUMBA_LOOP_VECTORIZE", int,
                                  not (IS_WIN32 and IS_32BITS))

        # Switch on superword-level parallelism vectorization, default is on.
        SLP_VECTORIZE = _readenv("NUMBA_SLP_VECTORIZE", int, 1)

        # Force dump of generated assembly
        DUMP_ASSEMBLY = _readenv("NUMBA_DUMP_ASSEMBLY", int, DEBUG)

        # Force dump of type annotation
        ANNOTATE = _readenv("NUMBA_DUMP_ANNOTATION", int, 0)

        # Dump IR in such a way as to aid in "diff"ing.
        DIFF_IR = _readenv("NUMBA_DIFF_IR", int, 0)

        # Dump type annotation in html format
        def fmt_html_path(path):
            if path is None:
                return path
            else:
                return os.path.abspath(path)

        HTML = _readenv("NUMBA_DUMP_HTML", fmt_html_path, None)

        # x86-64 specific
        # Enable AVX on supported platforms where it won't degrade performance.
        def avx_default():
            if not _os_supports_avx():
                return False
            else:
                # There are various performance issues with AVX and LLVM
                # on some CPUs (list at
                # http://llvm.org/bugs/buglist.cgi?quicksearch=avx).
                # For now we'd rather disable it, since it can pessimize code
                cpu_name = ll.get_host_cpu_name()
                return cpu_name not in ('corei7-avx', 'core-avx-i',
                                        'sandybridge', 'ivybridge')

        ENABLE_AVX = _readenv("NUMBA_ENABLE_AVX", int, avx_default)

        # if set and SVML is available, it will be disabled
        # By default, it's disabled on 32-bit platforms.
        DISABLE_INTEL_SVML = _readenv(
            "NUMBA_DISABLE_INTEL_SVML", int, IS_32BITS)

        # Disable jit for debugging
        DISABLE_JIT = _readenv("NUMBA_DISABLE_JIT", int, 0)

        # choose parallel backend to use
        THREADING_LAYER_PRIORITY = _readenv(
            "NUMBA_THREADING_LAYER_PRIORITY",
            lambda string: string.split(),
            ['tbb', 'omp', 'workqueue'],
        )
        THREADING_LAYER = _readenv("NUMBA_THREADING_LAYER", str, 'default')

        CAPTURED_ERRORS = _readenv("NUMBA_CAPTURED_ERRORS",
                                   _validate_captured_errors_style,
                                   'old_style')

        # CUDA Configs

        # Whether to warn about kernel launches where a host array
        # is used as a parameter, forcing a copy to and from the device.
        # On by default.
        CUDA_WARN_ON_IMPLICIT_COPY = _readenv(
            "NUMBA_CUDA_WARN_ON_IMPLICIT_COPY", int, 1)

        # Force CUDA compute capability to a specific version
        FORCE_CUDA_CC = _readenv("NUMBA_FORCE_CUDA_CC", _parse_cc, None)

        # The default compute capability to target when compiling to PTX.
        CUDA_DEFAULT_PTX_CC = _readenv("NUMBA_CUDA_DEFAULT_PTX_CC", _parse_cc,
                                       (5, 3))

        # Disable CUDA support
        DISABLE_CUDA = _readenv("NUMBA_DISABLE_CUDA",
                                int, int(MACHINE_BITS == 32))

        # Enable CUDA simulator
        ENABLE_CUDASIM = _readenv("NUMBA_ENABLE_CUDASIM", int, 0)

        # CUDA logging level
        # Any level name from the *logging* module.  Case insensitive.
        # Defaults to CRITICAL if not set or invalid.
        # Note: This setting only applies when logging is not configured.
        #       Any existing logging configuration is preserved.
        CUDA_LOG_LEVEL = _readenv("NUMBA_CUDA_LOG_LEVEL", str, '')

        # Include argument values in the CUDA Driver API logs
        CUDA_LOG_API_ARGS = _readenv("NUMBA_CUDA_LOG_API_ARGS", int, 0)

        # Maximum number of pending CUDA deallocations (default: 10)
        CUDA_DEALLOCS_COUNT = _readenv("NUMBA_CUDA_MAX_PENDING_DEALLOCS_COUNT",
                                       int, 10)

        # Maximum ratio of pending CUDA deallocations to capacity (default: 0.2)
        CUDA_DEALLOCS_RATIO = _readenv("NUMBA_CUDA_MAX_PENDING_DEALLOCS_RATIO",
                                       float, 0.2)

        CUDA_ARRAY_INTERFACE_SYNC = _readenv("NUMBA_CUDA_ARRAY_INTERFACE_SYNC",
                                             int, 1)

        # Path of the directory that the CUDA driver libraries are located
        CUDA_DRIVER = _readenv("NUMBA_CUDA_DRIVER", str, '')

        # Buffer size for logs produced by CUDA driver operations (e.g.
        # linking)
        CUDA_LOG_SIZE = _readenv("NUMBA_CUDA_LOG_SIZE", int, 1024)

        # Whether to generate verbose log messages when JIT linking
        CUDA_VERBOSE_JIT_LOG = _readenv("NUMBA_CUDA_VERBOSE_JIT_LOG", int, 1)

        # Whether the default stream is the per-thread default stream
        CUDA_PER_THREAD_DEFAULT_STREAM = _readenv(
            "NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM", int, 0)

        # Location of the CUDA include files
        if IS_WIN32:
            cuda_path = os.environ.get('CUDA_PATH')
            if cuda_path:
                default_cuda_include_path = os.path.join(cuda_path, "include")
            else:
                default_cuda_include_path = "cuda_include_not_found"
        else:
            default_cuda_include_path = os.path.join(os.sep, 'usr', 'local',
                                                     'cuda', 'include')
        CUDA_INCLUDE_PATH = _readenv("NUMBA_CUDA_INCLUDE_PATH", str,
                                     default_cuda_include_path)

        # Threading settings

        # The default number of threads to use.
        def num_threads_default():
            try:
                sched_getaffinity = os.sched_getaffinity
            except AttributeError:
                pass
            else:
                return max(1, len(sched_getaffinity(0)))

            cpu_count = os.cpu_count()
            if cpu_count is not None:
                return max(1, cpu_count)

            return 1

        NUMBA_DEFAULT_NUM_THREADS = num_threads_default()

        # Numba thread pool size (defaults to number of CPUs on the system).
        _NUMBA_NUM_THREADS = _readenv("NUMBA_NUM_THREADS", int,
                                      NUMBA_DEFAULT_NUM_THREADS)
        if ('NUMBA_NUM_THREADS' in globals()
                and globals()['NUMBA_NUM_THREADS'] != _NUMBA_NUM_THREADS):

            from numba.np.ufunc import parallel
            if parallel._is_initialized:
                # The module global holds the value currently in effect;
                # _NUMBA_NUM_THREADS is the newly requested one.
                raise RuntimeError("Cannot set NUMBA_NUM_THREADS to a "
                                   "different value once the threads have been "
                                   "launched (currently have %s, "
                                   "trying to set %s)" %
                                   (globals()['NUMBA_NUM_THREADS'],
                                    _NUMBA_NUM_THREADS))

        NUMBA_NUM_THREADS = _NUMBA_NUM_THREADS
        # Drop the temporary so the injection loop below does not publish it.
        del _NUMBA_NUM_THREADS

        # Profiling support

        # Indicates if a profiler is detected. Only VTune can be detected
        # for now
        RUNNING_UNDER_PROFILER = 'VS_PROFILER' in os.environ

        # Enables jit events in LLVM to support profiling of dynamic code
        ENABLE_PROFILING = _readenv(
            "NUMBA_ENABLE_PROFILING", int, int(RUNNING_UNDER_PROFILER))

        # Debug Info

        # The default value for the `debug` flag
        DEBUGINFO_DEFAULT = _readenv("NUMBA_DEBUGINFO", int, ENABLE_PROFILING)
        CUDA_DEBUGINFO_DEFAULT = _readenv("NUMBA_CUDA_DEBUGINFO", int, 0)

        EXTEND_VARIABLE_LIFETIMES = _readenv("NUMBA_EXTEND_VARIABLE_LIFETIMES",
                                             int, 0)

        # gdb binary location
        def which_gdb(path_or_bin):
            gdb = shutil.which(path_or_bin)
            return gdb if gdb is not None else path_or_bin

        GDB_BINARY = _readenv("NUMBA_GDB_BINARY", which_gdb, 'gdb')

        # CUDA Memory management
        CUDA_MEMORY_MANAGER = _readenv("NUMBA_CUDA_MEMORY_MANAGER", str,
                                       'default')

        # Experimental refprune pass
        LLVM_REFPRUNE_PASS = _readenv(
            "NUMBA_LLVM_REFPRUNE_PASS", int, 1,
        )
        LLVM_REFPRUNE_FLAGS = _readenv(
            "NUMBA_LLVM_REFPRUNE_FLAGS", str,
            "all" if LLVM_REFPRUNE_PASS else "",
        )

        # Timing support.

        # LLVM_PASS_TIMINGS enables LLVM recording of pass timings.
        LLVM_PASS_TIMINGS = _readenv(
            "NUMBA_LLVM_PASS_TIMINGS", int, 0,
        )

        # Inject the configuration values into the module globals
        for name, value in locals().copy().items():
            if name.isupper():
                globals()[name] = value
class ConstantInference(object):
    """
    A constant inference engine for a given interpreter.
    Inference inspects the IR to try and compute a compile-time constant for
    a variable.

    This shouldn't be used directly, instead call Interpreter.infer_constant().
    """

    def __init__(self, func_ir):
        # Only a weak proxy is kept: some user-visible objects may be held
        # alive in the cache and a strong reference would create a cycle.
        self._func_ir = weakref.proxy(func_ir)
        # name -> (True, value) or (False, (exception class, exception args))
        self._cache = {}

    def infer_constant(self, name, loc=None):
        """
        Infer a constant value for the given variable *name*.
        If no value can be inferred, numba.errors.ConstantInferenceError
        is raised.

        Both successes and failures are cached per *name*; a cached failure
        is re-raised on each call, with *loc* attached when the exception
        class derives from NumbaError.
        """
        entry = self._cache.get(name)
        if entry is None:
            try:
                entry = (True, self._do_infer(name))
            except ConstantInferenceError as err:
                # Keep the class and args only, so that no traceback is
                # kept alive by the cache.
                entry = (False, (err.__class__, err.args))
            self._cache[name] = entry

        succeeded, payload = entry
        if succeeded:
            return payload

        exc_class, exc_args = payload
        if issubclass(exc_class, NumbaError):
            raise exc_class(*exc_args, loc=loc)
        else:
            raise exc_class(*exc_args)

    def _fail(self, val):
        # `val` is the ir.Var name rather than the offending use of it, hence
        # no location here: `infer_constant` captures the class and message
        # and raises again with the location of the expression that caused
        # the constant inference error.
        raise ConstantInferenceError(
            "Constant inference not possible for: %s" % (val,), loc=None)

    def _do_infer(self, name):
        """Infer the constant behind *name* from its definition in the IR."""
        if not isinstance(name, str):
            raise TypeError("infer_constant() called with non-str %r"
                            % (name,))
        try:
            definition = self._func_ir.get_definition(name)
        except KeyError:
            raise ConstantInferenceError(
                "no single definition for %r" % (name,))
        try:
            return definition.infer_constant()
        except ConstantInferenceError:
            # The definition cannot answer by itself; expressions get a
            # second chance through the supported cases of _infer_expr.
            if isinstance(definition, ir.Expr):
                return self._infer_expr(definition)
            self._fail(definition)

    def _infer_expr(self, expr):
        """Infer an expression: handle supported cases."""
        op = expr.op
        if op == 'call':
            callee = self.infer_constant(expr.func.name, loc=expr.loc)
            return self._infer_call(callee, expr)
        if op == 'getattr':
            owner = self.infer_constant(expr.value.name, loc=expr.loc)
            return self._infer_getattr(owner, expr)
        if op in ('build_list', 'build_tuple'):
            elements = [self.infer_constant(item.name, loc=expr.loc)
                        for item in expr.items]
            return elements if op == 'build_list' else tuple(elements)
        self._fail(expr)

    def _infer_call(self, func, expr):
        """Infer a call to ``slice`` or to an exception class."""
        if expr.kws or expr.vararg:
            self._fail(expr)
        # Check supported callables
        is_slice = func in (slice,)
        is_exc_class = (isinstance(func, type)
                        and issubclass(func, BaseException))
        if not (is_slice or is_exc_class):
            self._fail(expr)

        call_args = [self.infer_constant(arg.name, loc=expr.loc)
                     for arg in expr.args]
        if is_slice:
            return func(*call_args)
        # If the exception class is user defined it may implement a ctor
        # that does not pass the args to the super. Therefore return the
        # raw class and the args so this can be instantiated at the call
        # site in the way the user source expects it to be.
        return func, call_args

    def _infer_getattr(self, value, expr):
        """Look up ``expr.attr`` on *value* when it is a class or module."""
        if isinstance(value, (ModuleType, type)):
            # Allow looking up a constant on a class or module
            missing = object()
            found = getattr(value, expr.attr, missing)
            if found is not missing:
                return found
        self._fail(expr)
NEW_BLOCKERS = frozenset(['SETUP_LOOP', 'FOR_ITER', 'SETUP_WITH'])


class CFBlock(object):
    """A block of the control flow graph, identified by its offset."""

    def __init__(self, offset):
        self.offset = offset
        self.body = []
        # A map of jumps to outgoing blocks (successors):
        #   { offset of outgoing block -> number of stack pops }
        self.outgoing_jumps = {}
        # A map of jumps to incoming blocks (predecessors):
        #   { offset of incoming block -> number of stack pops }
        self.incoming_jumps = {}
        self.terminating = False

    def __repr__(self):
        outgoing = sorted(self.outgoing_jumps)
        incoming = sorted(self.incoming_jumps)
        return "block(offset:%d, outgoing: %s, incoming: %s)" % (
            self.offset, outgoing, incoming)

    def __iter__(self):
        # Iterating a block walks its body.
        return iter(self.body)


class Loop(collections.namedtuple("Loop",
                                  ("entries", "exits", "header", "body"))):
    """
    A control flow loop, as detected by a CFGraph object.
    """

    __slots__ = ()

    # Equality and hashing look at the header only: it is enough to tell
    # whether two loops are really the same, assuming they belong to the
    # same graph.
    # (note: in practice, only one loop instance is created per graph
    #  loop, so identity would be fine)

    def __eq__(self, other):
        if not isinstance(other, Loop):
            return False
        return other.header == self.header

    def __hash__(self):
        return hash(self.header)


class _DictOfContainers(collections.defaultdict):
    """A defaultdict with customized equality checks that ignore empty values.

    Non-empty value is checked by: `bool(value_item) == True`.
    """

    def __eq__(self, other):
        if not isinstance(other, _DictOfContainers):
            return NotImplemented
        return self._non_empty_items() == other._non_empty_items()

    def __ne__(self, other):
        outcome = self.__eq__(other)
        # Propagate NotImplemented untouched, negate a real answer.
        return outcome if outcome is NotImplemented else not outcome

    def _non_empty_items(self):
        """Return the sorted ``(key, value)`` pairs whose value is truthy."""
        kept = []
        for key, values in sorted(self.items()):
            if values:
                kept.append((key, values))
        return kept
+ """ + + def __init__(self): + self._nodes = set() + self._preds = _DictOfContainers(set) + self._succs = _DictOfContainers(set) + self._edge_data = {} + self._entry_point = None + + def add_node(self, node): + """ + Add *node* to the graph. This is necessary before adding any + edges from/to the node. *node* can be any hashable object. + """ + self._nodes.add(node) + + def add_edge(self, src, dest, data=None): + """ + Add an edge from node *src* to node *dest*, with optional + per-edge *data*. + If such an edge already exists, it is replaced (duplicate edges + are not possible). + """ + if src not in self._nodes: + raise ValueError("Cannot add edge as src node %s not in nodes %s" % + (src, self._nodes)) + if dest not in self._nodes: + raise ValueError("Cannot add edge as dest node %s not in nodes %s" % + (dest, self._nodes)) + self._add_edge(src, dest, data) + + def successors(self, src): + """ + Yield (node, data) pairs representing the successors of node *src*. + (*data* will be None if no data was specified when adding the edge) + """ + for dest in self._succs[src]: + yield dest, self._edge_data[src, dest] + + def predecessors(self, dest): + """ + Yield (node, data) pairs representing the predecessors of node *dest*. + (*data* will be None if no data was specified when adding the edge) + """ + for src in self._preds[dest]: + yield src, self._edge_data[src, dest] + + def set_entry_point(self, node): + """ + Set the entry point of the graph to *node*. + """ + assert node in self._nodes + self._entry_point = node + + def process(self): + """ + Compute essential properties of the control flow graph. The graph + must have been fully populated, and its entry point specified. Other + graph properties are computed on-demand. + """ + if self._entry_point is None: + raise RuntimeError("no entry point defined!") + self._eliminate_dead_blocks() + + def dominators(self): + """ + Return a dictionary of {node -> set(nodes)} mapping each node to + the nodes dominating it. 
+ + A node D dominates a node N when any path leading to N must go through D + """ + return self._doms + + def post_dominators(self): + """ + Return a dictionary of {node -> set(nodes)} mapping each node to + the nodes post-dominating it. + + A node P post-dominates a node N when any path starting from N must go + through P. + """ + return self._post_doms + + def immediate_dominators(self): + """ + Return a dictionary of {node -> node} mapping each node to its + immediate dominator (idom). + + The idom(B) is the closest strict dominator of V + """ + return self._idom + + def dominance_frontier(self): + """ + Return a dictionary of {node -> set(nodes)} mapping each node to + the nodes in its dominance frontier. + + The dominance frontier _df(N) is the set of all nodes that are + immediate successors to blocks dominated by N but which aren't + strictly dominated by N + """ + return self._df + + def dominator_tree(self): + """ + return a dictionary of {node -> set(nodes)} mapping each node to + the set of nodes it immediately dominates + + The domtree(B) is the closest strict set of nodes that B dominates + """ + return self._domtree + + @utils.cached_property + def _exit_points(self): + return self._find_exit_points() + + @utils.cached_property + def _doms(self): + return self._find_dominators() + + @utils.cached_property + def _back_edges(self): + return self._find_back_edges() + + @utils.cached_property + def _topo_order(self): + return self._find_topo_order() + + @utils.cached_property + def _descs(self): + return self._find_descendents() + + @utils.cached_property + def _loops(self): + return self._find_loops() + + @utils.cached_property + def _in_loops(self): + return self._find_in_loops() + + @utils.cached_property + def _post_doms(self): + return self._find_post_dominators() + + @utils.cached_property + def _idom(self): + return self._find_immediate_dominators() + + @utils.cached_property + def _df(self): + return self._find_dominance_frontier() + + 
@utils.cached_property + def _domtree(self): + return self._find_dominator_tree() + + def descendents(self, node): + """ + Return the set of descendents of the given *node*, in topological + order (ignoring back edges). + """ + return self._descs[node] + + def entry_point(self): + """ + Return the entry point node. + """ + assert self._entry_point is not None + return self._entry_point + + def exit_points(self): + """ + Return the computed set of exit nodes (may be empty). + """ + return self._exit_points + + def backbone(self): + """ + Return the set of nodes constituting the graph's backbone. + (i.e. the nodes that every path starting from the entry point + must go through). By construction, it is non-empty: it contains + at least the entry point. + """ + return self._post_doms[self._entry_point] + + def loops(self): + """ + Return a dictionary of {node -> loop} mapping each loop header + to the loop (a Loop instance) starting with it. + """ + return self._loops + + def in_loops(self, node): + """ + Return the list of Loop objects the *node* belongs to, + from innermost to outermost. + """ + return [self._loops[x] for x in self._in_loops.get(node, ())] + + def dead_nodes(self): + """ + Return the set of dead nodes (eliminated from the graph). + """ + return self._dead_nodes + + def nodes(self): + """ + Return the set of live nodes. + """ + return self._nodes + + def topo_order(self): + """ + Return the sequence of nodes in topological order (ignoring back + edges). + """ + return self._topo_order + + def topo_sort(self, nodes, reverse=False): + """ + Iterate over the *nodes* in topological order (ignoring back edges). + The sort isn't guaranteed to be stable. + """ + nodes = set(nodes) + it = self._topo_order + if reverse: + it = reversed(it) + for n in it: + if n in nodes: + yield n + + def dump(self, file=None): + """ + Dump extensive debug information. 
+ """ + import pprint + file = file or sys.stdout + if 1: + print("CFG adjacency lists:", file=file) + self._dump_adj_lists(file) + print("CFG dominators:", file=file) + pprint.pprint(self._doms, stream=file) + print("CFG post-dominators:", file=file) + pprint.pprint(self._post_doms, stream=file) + print("CFG back edges:", sorted(self._back_edges), file=file) + print("CFG loops:", file=file) + pprint.pprint(self._loops, stream=file) + print("CFG node-to-loops:", file=file) + pprint.pprint(self._in_loops, stream=file) + print("CFG backbone:", file=file) + pprint.pprint(self.backbone(), stream=file) + + def render_dot(self, filename="numba_cfg.dot"): + """Render the controlflow graph with GraphViz DOT via the + ``graphviz`` python binding. + + Returns + ------- + g : graphviz.Digraph + Use `g.view()` to open the graph in the default PDF application. + """ + + try: + import graphviz as gv + except ImportError: + raise ImportError( + "The feature requires `graphviz` but it is not available. " + "Please install with `pip install graphviz`" + ) + g = gv.Digraph(filename=filename) + # Populate the nodes + for n in self._nodes: + g.node(str(n)) + # Populate the edges + for n in self._nodes: + for edge in self._succs[n]: + g.edge(str(n), str(edge)) + return g + + # Internal APIs + + def _add_edge(self, from_, to, data=None): + # This internal version allows adding edges to/from unregistered + # (ghost) nodes. 
+ self._preds[to].add(from_) + self._succs[from_].add(to) + self._edge_data[from_, to] = data + + def _remove_node_edges(self, node): + for succ in self._succs.pop(node, ()): + self._preds[succ].remove(node) + del self._edge_data[node, succ] + for pred in self._preds.pop(node, ()): + self._succs[pred].remove(node) + del self._edge_data[pred, node] + + def _dfs(self, entries=None): + if entries is None: + entries = (self._entry_point,) + seen = set() + stack = list(entries) + while stack: + node = stack.pop() + if node not in seen: + yield node + seen.add(node) + for succ in self._succs[node]: + stack.append(succ) + + def _eliminate_dead_blocks(self): + """ + Eliminate all blocks not reachable from the entry point, and + stash them into self._dead_nodes. + """ + live = set() + for node in self._dfs(): + live.add(node) + self._dead_nodes = self._nodes - live + self._nodes = live + # Remove all edges leading from dead nodes + for dead in self._dead_nodes: + self._remove_node_edges(dead) + + def _find_exit_points(self): + """ + Compute the graph's exit points. 
+ """ + exit_points = set() + for n in self._nodes: + if not self._succs.get(n): + exit_points.add(n) + return exit_points + + def _find_postorder(self): + succs = self._succs + back_edges = self._back_edges + post_order = [] + seen = set() + + post_order = [] + + # DFS + def dfs_rec(node): + if node not in seen: + seen.add(node) + stack.append((post_order.append, node)) + for dest in succs[node]: + if (node, dest) not in back_edges: + stack.append((dfs_rec, dest)) + + stack = [(dfs_rec, self._entry_point)] + while stack: + cb, data = stack.pop() + cb(data) + + return post_order + + def _find_immediate_dominators(self): + # The algorithm implemented computes the immediate dominator + # for each node in the CFG which is equivalent to build a dominator tree + # Based on the implementation from NetworkX + # library - nx.immediate_dominators + # https://github.com/networkx/networkx/blob/858e7cb183541a78969fed0cbcd02346f5866c02/networkx/algorithms/dominance.py # noqa: E501 + # References: + # Keith D. Cooper, Timothy J. 
Harvey, and Ken Kennedy + # A Simple, Fast Dominance Algorithm + # https://www.cs.rice.edu/~keith/EMBED/dom.pdf + def intersect(u, v): + while u != v: + while idx[u] < idx[v]: + u = idom[u] + while idx[u] > idx[v]: + v = idom[v] + return u + + entry = self._entry_point + preds_table = self._preds + + order = self._find_postorder() + idx = {e: i for i, e in enumerate(order)} # index of each node + idom = {entry : entry} + order.pop() + order.reverse() + + changed = True + while changed: + changed = False + for u in order: + new_idom = functools.reduce(intersect, + (v for v in preds_table[u] + if v in idom)) + if u not in idom or idom[u] != new_idom: + idom[u] = new_idom + changed = True + + return idom + + def _find_dominator_tree(self): + idom = self._idom + domtree = _DictOfContainers(set) + + for u, v in idom.items(): + # v dominates u + if u not in domtree: + domtree[u] = set() + if u != v: + domtree[v].add(u) + + return domtree + + def _find_dominance_frontier(self): + idom = self._idom + preds_table = self._preds + df = {u: set() for u in idom} + + for u in idom: + if len(preds_table[u]) < 2: + continue + for v in preds_table[u]: + while v != idom[u]: + df[v].add(u) + v = idom[v] + + return df + + def _find_dominators_internal(self, post=False): + # See theoretical description in + # http://en.wikipedia.org/wiki/Dominator_%28graph_theory%29 + # The algorithm implemented here uses a todo-list as described + # in http://pages.cs.wisc.edu/~fischer/cs701.f08/finding.loops.html + if post: + entries = set(self._exit_points) + preds_table = self._succs + succs_table = self._preds + else: + entries = set([self._entry_point]) + preds_table = self._preds + succs_table = self._succs + + if not entries: + raise RuntimeError("no entry points: dominator algorithm " + "cannot be seeded") + + doms = {} + for e in entries: + doms[e] = set([e]) + + todo = [] + for n in self._nodes: + if n not in entries: + doms[n] = set(self._nodes) + todo.append(n) + + while todo: + n = 
todo.pop() + if n in entries: + continue + new_doms = set([n]) + preds = preds_table[n] + if preds: + new_doms |= functools.reduce(set.intersection, + [doms[p] for p in preds]) + if new_doms != doms[n]: + assert len(new_doms) < len(doms[n]) + doms[n] = new_doms + todo.extend(succs_table[n]) + return doms + + def _find_dominators(self): + return self._find_dominators_internal(post=False) + + def _find_post_dominators(self): + # To handle infinite loops correctly, we need to add a dummy + # exit point, and link members of infinite loops to it. + dummy_exit = object() + self._exit_points.add(dummy_exit) + for loop in self._loops.values(): + if not loop.exits: + for b in loop.body: + self._add_edge(b, dummy_exit) + pdoms = self._find_dominators_internal(post=True) + # Fix the _post_doms table to make no reference to the dummy exit + del pdoms[dummy_exit] + for doms in pdoms.values(): + doms.discard(dummy_exit) + self._remove_node_edges(dummy_exit) + self._exit_points.remove(dummy_exit) + return pdoms + + # Finding loops and back edges: see + # http://pages.cs.wisc.edu/~fischer/cs701.f08/finding.loops.html + + def _find_back_edges(self, stats=None): + """ + Find back edges. An edge (src, dest) is a back edge if and + only if *dest* dominates *src*. + """ + # Prepare stats to capture execution information + if stats is not None: + if not isinstance(stats, dict): + raise TypeError(f"*stats* must be a dict; got {type(stats)}") + stats.setdefault('iteration_count', 0) + + # Uses a simple DFS to find back-edges. + # The new algorithm is faster than the the previous dominator based + # algorithm. 
+ back_edges = set() + # stack: keeps track of the traversal path + stack = [] + # succs_state: keep track of unvisited successors of a node + succs_state = {} + entry_point = self.entry_point() + + checked = set() + + def push_state(node): + stack.append(node) + succs_state[node] = [dest for dest in self._succs[node]] + + push_state(entry_point) + + # Keep track for iteration count for debugging + iter_ct = 0 + while stack: + iter_ct += 1 + tos = stack[-1] + tos_succs = succs_state[tos] + # Are there successors not checked? + if tos_succs: + # Check the next successor + cur_node = tos_succs.pop() + # Is it in our traversal path? + if cur_node in stack: + # Yes, it's a backedge + back_edges.add((tos, cur_node)) + elif cur_node not in checked: + # Push + push_state(cur_node) + else: + # Checked all successors. Pop + stack.pop() + checked.add(tos) + + if stats is not None: + stats['iteration_count'] += iter_ct + return back_edges + + def _find_topo_order(self): + succs = self._succs + back_edges = self._back_edges + post_order = [] + seen = set() + + def _dfs_rec(node): + if node not in seen: + seen.add(node) + for dest in succs[node]: + if (node, dest) not in back_edges: + _dfs_rec(dest) + post_order.append(node) + + _dfs_rec(self._entry_point) + post_order.reverse() + return post_order + + def _find_descendents(self): + descs = {} + for node in reversed(self._topo_order): + descs[node] = node_descs = set() + for succ in self._succs[node]: + if (node, succ) not in self._back_edges: + node_descs.add(succ) + node_descs.update(descs[succ]) + return descs + + def _find_loops(self): + """ + Find the loops defined by the graph's back edges. + """ + bodies = {} + for src, dest in self._back_edges: + # The destination of the back edge is the loop header + header = dest + # Build up the loop body from the back edge's source node, + # up to the source header. 
+ body = set([header]) + queue = [src] + while queue: + n = queue.pop() + if n not in body: + body.add(n) + queue.extend(self._preds[n]) + # There can be several back edges to a given loop header; + # if so, merge the resulting body fragments. + if header in bodies: + bodies[header].update(body) + else: + bodies[header] = body + + # Create a Loop object for each header. + loops = {} + for header, body in bodies.items(): + entries = set() + exits = set() + for n in body: + entries.update(self._preds[n] - body) + exits.update(self._succs[n] - body) + loop = Loop(header=header, body=body, entries=entries, exits=exits) + loops[header] = loop + return loops + + def _find_in_loops(self): + loops = self._loops + # Compute the loops to which each node belongs. + in_loops = dict((n, []) for n in self._nodes) + # Sort loops from longest to shortest + # This ensures that outer loops will come before inner loops + for loop in sorted(loops.values(), key=lambda loop: len(loop.body)): + for n in loop.body: + in_loops[n].append(loop.header) + return in_loops + + def _dump_adj_lists(self, file): + adj_lists = dict((src, sorted(list(dests))) + for src, dests in self._succs.items()) + import pprint + pprint.pprint(adj_lists, stream=file) + + def __eq__(self, other): + if not isinstance(other, CFGraph): + raise NotImplementedError + + for x in ['_nodes', '_edge_data', '_entry_point', '_preds', '_succs']: + this = getattr(self, x, None) + that = getattr(other, x, None) + if this != that: + return False + return True + + def __ne__(self, other): + return not self.__eq__(other) + + +class ControlFlowAnalysis(object): + """ + Attributes + ---------- + - bytecode + + - blocks + + - blockseq + + - doms: dict of set + Dominators + + - backbone: set of block offsets + The set of block that is common to all possible code path. 
+ + """ + def __init__(self, bytecode): + self.bytecode = bytecode + self.blocks = {} + self.liveblocks = {} + self.blockseq = [] + self.doms = None + self.backbone = None + # Internal temp states + self._force_new_block = True + self._curblock = None + self._blockstack = [] + self._loops = [] + self._withs = [] + + def iterblocks(self): + """ + Return all blocks in sequence of occurrence + """ + for i in self.blockseq: + yield self.blocks[i] + + def iterliveblocks(self): + """ + Return all live blocks in sequence of occurrence + """ + for i in self.blockseq: + if i in self.liveblocks: + yield self.blocks[i] + + def incoming_blocks(self, block): + """ + Yield (incoming block, number of stack pops) pairs for *block*. + """ + for i, pops in block.incoming_jumps.items(): + if i in self.liveblocks: + yield self.blocks[i], pops + + def dump(self, file=None): + self.graph.dump(file=None) + + def run(self): + for inst in self._iter_inst(): + fname = "op_%s" % inst.opname + fn = getattr(self, fname, None) + if fn is not None: + fn(inst) + elif inst.is_jump: + # this catches e.g. try... 
except + l = Loc(self.bytecode.func_id.filename, inst.lineno) + if inst.opname in {"SETUP_EXCEPT", "SETUP_FINALLY"}: + msg = "'try' block not supported until python3.7 or later" + else: + msg = "Use of unsupported opcode (%s) found" % inst.opname + raise UnsupportedError(msg, loc=l) + else: + # Non-jump instructions are ignored + pass # intentionally + + # Close all blocks + for cur, nxt in zip(self.blockseq, self.blockseq[1:]): + blk = self.blocks[cur] + if not blk.outgoing_jumps and not blk.terminating: + blk.outgoing_jumps[nxt] = 0 + + graph = CFGraph() + for b in self.blocks: + graph.add_node(b) + for b in self.blocks.values(): + for out, pops in b.outgoing_jumps.items(): + graph.add_edge(b.offset, out, pops) + graph.set_entry_point(min(self.blocks)) + graph.process() + self.graph = graph + + # Fill incoming + for b in self.blocks.values(): + for out, pops in b.outgoing_jumps.items(): + self.blocks[out].incoming_jumps[b.offset] = pops + + # Find liveblocks + self.liveblocks = dict((i, self.blocks[i]) + for i in self.graph.nodes()) + + for lastblk in reversed(self.blockseq): + if lastblk in self.liveblocks: + break + else: + raise AssertionError("No live block that exits!?") + + # Find backbone + backbone = self.graph.backbone() + # Filter out in loop blocks (Assuming no other cyclic control blocks) + # This is to unavoid variable defined in loops to be considered as + # function scope. + inloopblocks = set() + + for b in self.blocks.keys(): + if self.graph.in_loops(b): + inloopblocks.add(b) + + self.backbone = backbone - inloopblocks + + def jump(self, target, pops=0): + """ + Register a jump (conditional or not) to *target* offset. + *pops* is the number of stack pops implied by the jump (default 0). 
+ """ + self._curblock.outgoing_jumps[target] = pops + + def _iter_inst(self): + for inst in self.bytecode: + if self._use_new_block(inst): + self._guard_with_as(inst) + self._start_new_block(inst) + self._curblock.body.append(inst.offset) + yield inst + + def _use_new_block(self, inst): + if inst.offset in self.bytecode.labels: + res = True + elif inst.opname in NEW_BLOCKERS: + res = True + else: + res = self._force_new_block + + self._force_new_block = False + return res + + def _start_new_block(self, inst): + self._curblock = CFBlock(inst.offset) + self.blocks[inst.offset] = self._curblock + self.blockseq.append(inst.offset) + + def _guard_with_as(self, current_inst): + """Checks if the next instruction after a SETUP_WITH is something other + than a POP_TOP, if it is something else it'll be some sort of store + which is not supported (this corresponds to `with CTXMGR as VAR(S)`).""" + if current_inst.opname == "SETUP_WITH": + next_op = self.bytecode[current_inst.next].opname + if next_op != "POP_TOP": + msg = ("The 'with (context manager) as " + "(variable):' construct is not " + "supported.") + raise UnsupportedError(msg) + + def op_SETUP_LOOP(self, inst): + end = inst.get_jump_target() + self._blockstack.append(end) + self._loops.append((inst.offset, end)) + # TODO: Looplifting requires the loop entry be its own block. + # Forcing a new block here is the simplest solution for now. + # But, we should consider other less ad-hoc ways. + self.jump(inst.next) + self._force_new_block = True + + def op_SETUP_WITH(self, inst): + end = inst.get_jump_target() + self._blockstack.append(end) + self._withs.append((inst.offset, end)) + # TODO: WithLifting requires the loop entry be its own block. + # Forcing a new block here is the simplest solution for now. + # But, we should consider other less ad-hoc ways. 
+ self.jump(inst.next) + self._force_new_block = True + + def op_POP_BLOCK(self, inst): + self._blockstack.pop() + + def op_FOR_ITER(self, inst): + self.jump(inst.get_jump_target()) + self.jump(inst.next) + self._force_new_block = True + + def _op_ABSOLUTE_JUMP_IF(self, inst): + self.jump(inst.get_jump_target()) + self.jump(inst.next) + self._force_new_block = True + + op_POP_JUMP_IF_FALSE = _op_ABSOLUTE_JUMP_IF + op_POP_JUMP_IF_TRUE = _op_ABSOLUTE_JUMP_IF + op_JUMP_IF_FALSE = _op_ABSOLUTE_JUMP_IF + op_JUMP_IF_TRUE = _op_ABSOLUTE_JUMP_IF + + def _op_ABSOLUTE_JUMP_OR_POP(self, inst): + self.jump(inst.get_jump_target()) + self.jump(inst.next, pops=1) + self._force_new_block = True + + op_JUMP_IF_FALSE_OR_POP = _op_ABSOLUTE_JUMP_OR_POP + op_JUMP_IF_TRUE_OR_POP = _op_ABSOLUTE_JUMP_OR_POP + + def op_JUMP_ABSOLUTE(self, inst): + self.jump(inst.get_jump_target()) + self._force_new_block = True + + def op_JUMP_FORWARD(self, inst): + self.jump(inst.get_jump_target()) + self._force_new_block = True + + def op_RETURN_VALUE(self, inst): + self._curblock.terminating = True + self._force_new_block = True + + def op_RAISE_VARARGS(self, inst): + self._curblock.terminating = True + self._force_new_block = True + + def op_BREAK_LOOP(self, inst): + self.jump(self._blockstack[-1]) + self._force_new_block = True diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/cpu.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..a27deb1365c6e4603c5dbfd174fc171a8552d9ec --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/cpu.py @@ -0,0 +1,379 @@ +import sys +import platform + +import llvmlite.binding as ll +from llvmlite import ir + +from numba import _dynfunc +from numba.core.callwrapper import PyCallWrapper +from numba.core.base import BaseContext, PYOBJECT +from numba.core import utils, types, config, cgutils, callconv, codegen, externals, fastmathpass, intrinsics +from 
numba.core.utils import cached_property +from numba.core.options import TargetOptions, include_default_options +from numba.core.runtime import rtsys +from numba.core.compiler_lock import global_compiler_lock +import numba.core.entrypoints +from numba.core.cpu_options import (ParallelOptions, FastMathOptions, + InlineOptions) +from numba.np import ufunc_db + +# Keep those structures in sync with _dynfunc.c. + +class ClosureBody(cgutils.Structure): + _fields = [('env', types.pyobject)] + + +class EnvBody(cgutils.Structure): + _fields = [ + ('globals', types.pyobject), + ('consts', types.pyobject), + ] + + +class CPUContext(BaseContext): + """ + Changes BaseContext calling convention + """ + allow_dynamic_globals = True + + def __init__(self, typingctx, target='cpu'): + super().__init__(typingctx, target) + + # Overrides + def create_module(self, name): + return self._internal_codegen._create_empty_module(name) + + @global_compiler_lock + def init(self): + self.is32bit = (utils.MACHINE_BITS == 32) + self._internal_codegen = codegen.JITCPUCodegen("numba.exec") + + # Add ARM ABI functions from libgcc_s + if platform.machine() == 'armv7l': + ll.load_library_permanently('libgcc_s.so.1') + + # Map external C functions. 
+ externals.c_math_functions.install(self) + + # Initialize NRT runtime + rtsys.initialize(self) + + # Add lower_extension attribute + self.lower_extensions = {} + from numba.parfors.parfor_lowering import _lower_parfor_parallel + from numba.parfors.parfor import Parfor + # Specify how to lower Parfor nodes using the lower_extensions + self.lower_extensions[Parfor] = _lower_parfor_parallel + + def load_additional_registries(self): + # Add implementations that work via import + from numba.cpython import (builtins, charseq, enumimpl, hashing, heapq, + iterators, listobj, numbers, rangeobj, + setobj, slicing, tupleobj, unicode,) + from numba.core import optional + from numba.misc import gdb_hook, literal + from numba.np import linalg, polynomial, arraymath, arrayobj + from numba.np.random import generator_core, generator_methods + from numba.typed import typeddict, dictimpl + from numba.typed import typedlist, listobject + from numba.experimental import jitclass, function_type + from numba.np import npdatetime + + # Add target specific implementations + from numba.np import npyimpl + from numba.cpython import cmathimpl, mathimpl, printimpl, randomimpl + from numba.misc import cffiimpl + from numba.experimental.jitclass.base import ClassBuilder as \ + jitclassimpl + self.install_registry(cmathimpl.registry) + self.install_registry(cffiimpl.registry) + self.install_registry(mathimpl.registry) + self.install_registry(npyimpl.registry) + self.install_registry(printimpl.registry) + self.install_registry(randomimpl.registry) + self.install_registry(jitclassimpl.class_impl_registry) + + # load 3rd party extensions + numba.core.entrypoints.init_all() + + @property + def target_data(self): + return self._internal_codegen.target_data + + def with_aot_codegen(self, name, **aot_options): + aot_codegen = codegen.AOTCPUCodegen(name, **aot_options) + return self.subtarget(_internal_codegen=aot_codegen, + aot_mode=True) + + def codegen(self): + return self._internal_codegen + + 
@cached_property + def call_conv(self): + return callconv.CPUCallConv(self) + + def get_env_body(self, builder, envptr): + """ + From the given *envptr* (a pointer to a _dynfunc.Environment object), + get a EnvBody allowing structured access to environment fields. + """ + body_ptr = cgutils.pointer_add( + builder, envptr, _dynfunc._impl_info['offsetof_env_body']) + return EnvBody(self, builder, ref=body_ptr, cast_ref=True) + + def get_env_manager(self, builder): + envgv = self.declare_env_global(builder.module, + self.get_env_name(self.fndesc)) + envarg = builder.load(envgv) + pyapi = self.get_python_api(builder) + pyapi.emit_environment_sentry( + envarg, debug_msg=self.fndesc.env_name, + ) + env_body = self.get_env_body(builder, envarg) + return pyapi.get_env_manager(self.environment, env_body, envarg) + + def get_generator_state(self, builder, genptr, return_type): + """ + From the given *genptr* (a pointer to a _dynfunc.Generator object), + get a pointer to its state area. + """ + return cgutils.pointer_add( + builder, genptr, _dynfunc._impl_info['offsetof_generator_state'], + return_type=return_type) + + def build_list(self, builder, list_type, items): + """ + Build a list from the Numba *list_type* and its initial *items*. + """ + from numba.cpython import listobj + return listobj.build_list(self, builder, list_type, items) + + def build_set(self, builder, set_type, items): + """ + Build a set from the Numba *set_type* and its initial *items*. 
+ """ + from numba.cpython import setobj + return setobj.build_set(self, builder, set_type, items) + + def build_map(self, builder, dict_type, item_types, items): + from numba.typed import dictobject + + return dictobject.build_map(self, builder, dict_type, item_types, items) + + + def post_lowering(self, mod, library): + if self.fastmath: + fastmathpass.rewrite_module(mod, self.fastmath) + + if self.is32bit: + # 32-bit machine needs to replace all 64-bit div/rem to avoid + # calls to compiler-rt + intrinsics.fix_divmod(mod) + + library.add_linking_library(rtsys.library) + + def create_cpython_wrapper(self, library, fndesc, env, call_helper, + release_gil=False): + wrapper_module = self.create_module("wrapper") + fnty = self.call_conv.get_function_type(fndesc.restype, fndesc.argtypes) + wrapper_callee = ir.Function(wrapper_module, fnty, fndesc.llvm_func_name) + builder = PyCallWrapper(self, wrapper_module, wrapper_callee, + fndesc, env, call_helper=call_helper, + release_gil=release_gil) + builder.build() + library.add_ir_module(wrapper_module) + + def create_cfunc_wrapper(self, library, fndesc, env, call_helper): + wrapper_module = self.create_module("cfunc_wrapper") + fnty = self.call_conv.get_function_type(fndesc.restype, fndesc.argtypes) + wrapper_callee = ir.Function(wrapper_module, fnty, fndesc.llvm_func_name) + + ll_argtypes = [self.get_value_type(ty) for ty in fndesc.argtypes] + ll_return_type = self.get_value_type(fndesc.restype) + wrapty = ir.FunctionType(ll_return_type, ll_argtypes) + wrapfn = ir.Function(wrapper_module, wrapty, fndesc.llvm_cfunc_wrapper_name) + builder = ir.IRBuilder(wrapfn.append_basic_block('entry')) + + status, out = self.call_conv.call_function( + builder, wrapper_callee, fndesc.restype, fndesc.argtypes, + wrapfn.args, attrs=('noinline',)) + + with builder.if_then(status.is_error, likely=False): + # If (and only if) an error occurred, acquire the GIL + # and use the interpreter to write out the exception. 
+ pyapi = self.get_python_api(builder) + gil_state = pyapi.gil_ensure() + self.call_conv.raise_error(builder, pyapi, status) + cstr = self.insert_const_string(builder.module, repr(self)) + strobj = pyapi.string_from_string(cstr) + pyapi.err_write_unraisable(strobj) + pyapi.decref(strobj) + pyapi.gil_release(gil_state) + + builder.ret(out) + library.add_ir_module(wrapper_module) + + def get_executable(self, library, fndesc, env): + """ + Returns + ------- + (cfunc, fnptr) + + - cfunc + callable function (Can be None) + - fnptr + callable function address + - env + an execution environment (from _dynfunc) + """ + # Code generation + baseptr = library.get_pointer_to_function(fndesc.llvm_func_name) + fnptr = library.get_pointer_to_function(fndesc.llvm_cpython_wrapper_name) + + # Note: we avoid reusing the original docstring to avoid encoding + # issues on Python 2, see issue #1908 + doc = "compiled wrapper for %r" % (fndesc.qualname,) + cfunc = _dynfunc.make_function(fndesc.lookup_module(), + fndesc.qualname.split('.')[-1], + doc, fnptr, env, + # objects to keepalive with the function + (library,) + ) + library.codegen.set_env(self.get_env_name(fndesc), env) + return cfunc + + def calc_array_sizeof(self, ndim): + ''' + Calculate the size of an array struct on the CPU target + ''' + aryty = types.Array(types.int32, ndim, 'A') + return self.get_abi_sizeof(self.get_value_type(aryty)) + + # Overrides + def get_ufunc_info(self, ufunc_key): + return ufunc_db.get_ufunc_info(ufunc_key) + + +# ---------------------------------------------------------------------------- +# TargetOptions + +_options_mixin = include_default_options( + "nopython", + "forceobj", + "looplift", + "_nrt", + "debug", + "boundscheck", + "nogil", + "no_rewrites", + "no_cpython_wrapper", + "no_cfunc_wrapper", + "parallel", + "fastmath", + "error_model", + "inline", + "forceinline", + # Add "target_backend" as a accepted option for the CPU in @jit(...) 
+ "target_backend", + "_dbg_extend_lifetimes", + "_dbg_optnone", +) + +class CPUTargetOptions(_options_mixin, TargetOptions): + def finalize(self, flags, options): + if not flags.is_set("enable_pyobject"): + flags.enable_pyobject = True + + if not flags.is_set("enable_looplift"): + flags.enable_looplift = True + + flags.inherit_if_not_set("nrt", default=True) + + if not flags.is_set("debuginfo"): + flags.debuginfo = config.DEBUGINFO_DEFAULT + + if not flags.is_set("dbg_extend_lifetimes"): + if flags.debuginfo: + # auto turn on extend-lifetimes if debuginfo is on and + # dbg_extend_lifetimes is not set + flags.dbg_extend_lifetimes = True + else: + # set flag using env-var config + flags.dbg_extend_lifetimes = config.EXTEND_VARIABLE_LIFETIMES + + if not flags.is_set("boundscheck"): + flags.boundscheck = flags.debuginfo + + flags.enable_pyobject_looplift = True + + flags.inherit_if_not_set("fastmath") + + flags.inherit_if_not_set("error_model", default="python") + + # Add "target_backend" as a option that inherits from the caller + flags.inherit_if_not_set("target_backend") + + flags.inherit_if_not_set("forceinline") + + if flags.forceinline: + # forceinline turns off optnone, just like clang. 
+ flags.optnone = False + +# ---------------------------------------------------------------------------- +# Internal + +def remove_refct_calls(func): + """ + Remove redundant incref/decref within on a per block basis + """ + for bb in func.basic_blocks: + remove_null_refct_call(bb) + remove_refct_pairs(bb) + + +def remove_null_refct_call(bb): + """ + Remove refct api calls to NULL pointer + """ + pass + ## Skipped for now + # for inst in bb.instructions: + # if isinstance(inst, ir.CallInstr): + # fname = inst.called_function.name + # if fname == "Py_IncRef" or fname == "Py_DecRef": + # arg = inst.args[0] + # print(type(arg)) + # if isinstance(arg, lc.ConstantPointerNull): + # inst.erase_from_parent() + + +def remove_refct_pairs(bb): + """ + Remove incref decref pairs on the same variable + """ + + didsomething = True + + while didsomething: + didsomething = False + + increfs = {} + decrefs = {} + + # Mark + for inst in bb.instructions: + if isinstance(inst, ir.CallInstr): + fname = inst.called_function.name + if fname == "Py_IncRef": + arg = inst.operands[0] + increfs[arg] = inst + elif fname == "Py_DecRef": + arg = inst.operands[0] + decrefs[arg] = inst + + # Sweep + for val in increfs.keys(): + if val in decrefs: + increfs[val].erase_from_parent() + decrefs[val].erase_from_parent() + didsomething = True diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/cpu_options.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/cpu_options.py new file mode 100644 index 0000000000000000000000000000000000000000..e2136c3194b44542072d69d9f5d8cd2736171886 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/cpu_options.py @@ -0,0 +1,181 @@ +""" +Defines CPU Options for use in the CPU target +""" +from abc import ABCMeta, abstractmethod + + +class AbstractOptionValue(metaclass=ABCMeta): + """Abstract base class for custom option values. + """ + @abstractmethod + def encode(self) -> str: + """Returns an encoding of the values + """ + ... 
+ + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self.encode()})" + + +class FastMathOptions(AbstractOptionValue): + """ + Options for controlling fast math optimization. + """ + + def __init__(self, value): + # https://releases.llvm.org/7.0.0/docs/LangRef.html#fast-math-flags + valid_flags = { + 'fast', + 'nnan', 'ninf', 'nsz', 'arcp', + 'contract', 'afn', 'reassoc', + } + + if isinstance(value, FastMathOptions): + self.flags = value.flags.copy() + elif value is True: + self.flags = {'fast'} + elif value is False: + self.flags = set() + elif isinstance(value, set): + invalid = value - valid_flags + if invalid: + raise ValueError("Unrecognized fastmath flags: %s" % invalid) + self.flags = value + elif isinstance(value, dict): + invalid = set(value.keys()) - valid_flags + if invalid: + raise ValueError("Unrecognized fastmath flags: %s" % invalid) + self.flags = {v for v, enable in value.items() if enable} + else: + msg = "Expected fastmath option(s) to be either a bool, dict or set" + raise ValueError(msg) + + def __bool__(self): + return bool(self.flags) + + __nonzero__ = __bool__ + + def encode(self) -> str: + return str(self.flags) + + def __eq__(self, other): + if type(other) is type(self): + return self.flags == other.flags + return NotImplemented + + +class ParallelOptions(AbstractOptionValue): + """ + Options for controlling auto parallelization. 
+ """ + __slots__ = ("enabled", "comprehension", "reduction", "inplace_binop", + "setitem", "numpy", "stencil", "fusion", "prange") + + def __init__(self, value): + if isinstance(value, bool): + self.enabled = value + self.comprehension = value + self.reduction = value + self.inplace_binop = value + self.setitem = value + self.numpy = value + self.stencil = value + self.fusion = value + self.prange = value + elif isinstance(value, dict): + self.enabled = True + self.comprehension = value.pop('comprehension', True) + self.reduction = value.pop('reduction', True) + self.inplace_binop = value.pop('inplace_binop', True) + self.setitem = value.pop('setitem', True) + self.numpy = value.pop('numpy', True) + self.stencil = value.pop('stencil', True) + self.fusion = value.pop('fusion', True) + self.prange = value.pop('prange', True) + if value: + msg = "Unrecognized parallel options: %s" % value.keys() + raise NameError(msg) + elif isinstance(value, ParallelOptions): + self.enabled = value.enabled + self.comprehension = value.comprehension + self.reduction = value.reduction + self.inplace_binop = value.inplace_binop + self.setitem = value.setitem + self.numpy = value.numpy + self.stencil = value.stencil + self.fusion = value.fusion + self.prange = value.prange + else: + msg = "Expect parallel option to be either a bool or a dict" + raise ValueError(msg) + + def _get_values(self): + """Get values as dictionary. 
class InlineOptions(AbstractOptionValue):
    """
    Options for controlling inlining.

    The wrapped value is either the string ``'always'``, the string
    ``'never'``, or an object exposing ``__call__`` (treated as a cost
    model).
    """

    def __init__(self, value):
        # Strings must be one of the two keywords; anything else has to
        # expose ``__call__``.
        if isinstance(value, str):
            accepted = value in ('always', 'never')
        else:
            accepted = hasattr(value, '__call__')

        if not accepted:
            msg = ("kwarg 'inline' must be one of the strings 'always' or "
                   "'never', or it can be a callable that returns True/False. "
                   "Found value %s" % value)
            raise ValueError(msg)

        self._inline = value

    @property
    def is_never_inline(self):
        """
        True if the option is the string 'never'
        """
        return self._inline == 'never'

    @property
    def is_always_inline(self):
        """
        True if the option is the string 'always'
        """
        return self._inline == 'always'

    @property
    def has_cost_model(self):
        """
        True if the option is neither 'always' nor 'never'
        """
        return not self.is_always_inline and not self.is_never_inline

    @property
    def value(self):
        """
        The raw value given at construction
        """
        return self._inline

    def __eq__(self, other):
        # Compare by raw value, but only against the exact same class.
        if type(other) is not type(self):
            return NotImplemented
        return self.value == other.value

    def encode(self) -> str:
        """Return ``repr()`` of the raw value."""
        return repr(self._inline)
def run_on_block(self, blk):
    """
    Build and return the ``BlockInfo`` for the CFA block *blk*.

    The entry stack offset and the entry syntax-block list are derived
    from the already-analysed predecessors (those with a smaller offset),
    any edge callbacks registered for a (predecessor, blk) edge are run,
    and finally every instruction of the block is dispatched to its
    ``op_*`` handler.

    Raises ``RuntimeError`` if a predecessor implies a negative entry
    stack offset.  Disagreement between predecessors only emits a
    ``RuntimeWarning``; the first predecessor's values are kept.
    """
    # This list is shared with the BlockInfo and filled in below.
    incoming_blocks = []
    info = BlockInfo(blk, blk.offset, incoming_blocks)
    edge_callbacks = []

    for ib, pops in self.cfa.incoming_blocks(blk):
        # By nature of Python bytecode, there will be no incoming
        # variables from subsequent blocks.  This is an easy way
        # of breaking the potential circularity of the problem.
        if ib.offset >= blk.offset:
            continue
        # From here on ``ib`` is the predecessor's BlockInfo, not the
        # CFA block.
        ib = self.infos[ib.offset]
        incoming_blocks.append(ib)
        # Collect the callback (if any) registered for this exact edge.
        if (ib.offset, blk.offset) in self.edge_process:
            edge_callbacks.append(self.edge_process[(ib.offset, blk.offset)])

        # Compute stack offset at block entry
        # The stack effect of our predecessors should be known
        assert ib.stack_offset is not None, ib
        new_offset = ib.stack_offset + ib.stack_effect - pops
        if new_offset < 0:
            raise RuntimeError("computed negative stack offset for %s"
                               % blk)
        if info.stack_offset is None:
            info.stack_offset = new_offset
        elif info.stack_offset != new_offset:
            warnings.warn("inconsistent stack offset for %s" % blk,
                          RuntimeWarning)

        # Compute syntax blocks at block entry
        assert ib.syntax_blocks is not None, ib
        if info.syntax_blocks is None:
            # Shallow copy so this block's pushes/pops do not affect the
            # predecessor's list.
            info.syntax_blocks = ib.syntax_blocks[:]
        elif info.syntax_blocks != ib.syntax_blocks:
            warnings.warn("inconsistent entry syntax blocks for %s" % blk,
                          RuntimeWarning)

    if info.stack_offset is None:
        # No incoming blocks => assume it's the entry block
        info.stack_offset = 0
        info.syntax_blocks = []
    info.stack_effect = 0

    # Edge callbacks run before any instruction of this block is processed.
    for callback in edge_callbacks:
        callback(info)

    # Iterating the block yields bytecode offsets.
    for offset in blk:
        inst = self.bytecode[offset]
        self.dispatch(info, inst)
    return info
def op_ROT_FOUR(self, info, inst):
    """
    Rotate the top four stack items: the top item moves down to the
    fourth position and the three items below it each move up by one.
    """
    popped = [info.pop() for _ in range(4)]
    top, below = popped[0], popped[1:]
    # The former top goes back first (deepest); the other three follow in
    # their original bottom-to-top order.
    info.push(top)
    for val in reversed(below):
        info.push(val)
def op_BUILD_MAP(self, info, inst):
    """
    BUILD_MAP(count): pop ``count`` key/value pairs off the stack and push
    a temporary standing for the resulting map.

    The recorded ``items`` are ``(key, value)`` tuples in the order the
    pairs were pushed.
    """
    res = info.make_temp()
    npairs = inst.arg
    pairs = []
    # BUILD_MAP takes pairs from the stack; the value sits above its key.
    for _ in range(npairs):
        val = info.pop()
        key = info.pop()
        pairs.append((key, val))
    # Pairs were collected top-down; restore push order.
    pairs.reverse()
    info.append(inst, items=pairs, size=npairs, res=res)
    info.push(res)
def op_BUILD_SET(self, info, inst):
    """
    BUILD_SET(count): pop ``count`` items and push a temporary standing
    for the resulting set.  ``items`` are recorded in push order.
    """
    nitems = inst.arg
    # Note: related python bug http://bugs.python.org/issue26020
    items = [info.pop() for _ in range(nitems)]
    items.reverse()
    res = info.make_temp()
    info.append(inst, items=items, res=res)
    info.push(res)
def op_FOR_ITER(self, info, inst):
    """
    FOR_ITER: record the iteration step on the iterator at top of stack
    and push the temporary that holds the yielded value.

    Also registers an edge callback, keyed by (this block's offset, the
    instruction's jump target), that pops two entries from the target
    block's info.
    """
    iterator = info.tos
    pair, indval, pred = (info.make_temp() for _ in range(3))
    info.append(inst, iterator=iterator, pair=pair, indval=indval, pred=pred)
    info.push(indval)

    # Setup for stack POP (twice) at loop exit (before processing
    # instruction at jump target)
    def pop_info(target_info):
        for _ in range(2):
            target_info.pop()

    edge = (info.block.offset, inst.get_jump_target())
    self.edge_process[edge] = pop_info
def op_BUILD_CONST_KEY_MAP(self, info, inst):
    """
    BUILD_CONST_KEY_MAP(count): pop the keys object from the top of the
    stack, then ``count`` values, and push a temporary for the result.

    One extra temporary per value is allocated and recorded as
    ``keytmps``; values are recorded in push order.
    """
    nvalues = inst.arg
    keys = info.pop()
    vals = [info.pop() for _ in range(nvalues)]
    vals.reverse()
    keytmps = []
    for _ in range(nvalues):
        keytmps.append(info.make_temp())
    res = info.make_temp()
    info.append(inst, keys=keys, keytmps=keytmps, values=vals, res=res)
    info.push(res)
def op_SLICE_3(self, info, inst):
    """
    TOS = TOS2[TOS1:TOS]

    Pops stop, start and base (in that order), allocates the result,
    slice and index temporaries, and pushes the result.
    """
    stop, start, base = (info.pop() for _ in range(3))
    res, slicevar, indexvar = (info.make_temp() for _ in range(3))
    info.append(inst, base=base, start=start, stop=stop, res=res,
                slicevar=slicevar, indexvar=indexvar)
    info.push(res)
def op_STORE_SLICE_1(self, info, inst):
    """
    TOS1[TOS:] = TOS2

    Pops the start index, the base object and the stored value (in that
    order) and allocates the slice, index and none temporaries.  Nothing
    is pushed back.
    """
    start, base, value = (info.pop() for _ in range(3))
    slicevar, indexvar, nonevar = (info.make_temp() for _ in range(3))
    info.append(inst, base=base, start=start, slicevar=slicevar,
                value=value, indexvar=indexvar, nonevar=nonevar)
def op_BUILD_SLICE(self, info, inst):
    """
    slice(TOS1, TOS) or slice(TOS2, TOS1, TOS)

    ``inst.arg`` selects the two- or three-operand form; with two
    operands the recorded ``step`` is None.  Any other operand count
    raises ``Exception("unreachable")`` before the stack is touched.
    """
    argc = inst.arg
    if argc not in (2, 3):
        raise Exception("unreachable")

    # Popped top-down; reversing yields (start, stop[, step]).
    operands = [info.pop() for _ in range(argc)]
    operands.reverse()
    if argc == 2:
        start, stop = operands
        step = None
    else:
        start, stop, step = operands

    slicevar = info.make_temp()
    res = info.make_temp()
    info.append(inst, start=start, stop=stop, step=step, res=res,
                slicevar=slicevar)
    info.push(res)
def op_MAKE_FUNCTION(self, info, inst, MAKE_CLOSURE=False):
    """
    MAKE_FUNCTION(flags): pop the function name and code object, then one
    extra operand for each flag bit set in ``inst.arg``, and push a
    temporary for the new function.

    Flag bits are consumed in this order: 0x8 closure, 0x4 annotations,
    0x2 kwdefaults, 0x1 defaults.  Operands whose bit is clear are
    recorded as None.  ``MAKE_CLOSURE`` is accepted but not used here.
    """
    name = info.pop()
    code = info.pop()

    flags = inst.arg
    optional = {}
    for bit, field in ((0x8, 'closure'), (0x4, 'annotations'),
                       (0x2, 'kwdefaults'), (0x1, 'defaults')):
        optional[field] = info.pop() if flags & bit else None

    res = info.make_temp()
    info.append(inst, name=name, code=code, closure=optional['closure'],
                annotations=optional['annotations'],
                kwdefaults=optional['kwdefaults'],
                defaults=optional['defaults'], res=res)
    info.push(res)
class BlockInfo(object):
    """
    Per-block state of the stack-to-register analysis: the simulated
    stack, the recorded instructions, and the links to predecessor blocks
    that supply values this block pops but did not push itself.
    """

    def __init__(self, block, offset, incoming_blocks):
        self.block = block
        self.offset = offset
        # The list of incoming BlockInfo objects (obtained by control
        # flow analysis).
        self.incoming_blocks = incoming_blocks
        # Outgoing variables from this block:
        #   { outgoing phi name -> var name }
        self.outgoing_phis = {}
        self.stack = []
        self.insts = []
        self.tempct = 0
        self._term = None
        self.syntax_blocks = None
        self.stack_offset = None
        self.stack_effect = 0

    def __repr__(self):
        return "<%s at offset %d>" % (type(self).__name__, self.offset)

    def dump(self):
        """Print the offset, the simulated stack and the recorded insts."""
        print("offset", self.offset, "{")
        print(" stack: ", end='')
        pprint(self.stack)
        pprint(self.insts)
        print("}")

    def make_temp(self, prefix=''):
        """
        Return a fresh temporary name built from *prefix*, the block
        offset and a per-block counter.
        """
        self.tempct += 1
        return '$%s%s.%s' % (prefix, self.offset, self.tempct)

    def push(self, val):
        """Push *val* and account for it in the block's stack effect."""
        self.stack_effect += 1
        self.stack.append(val)

    def pop(self, discard=False):
        """
        Pop a variable from the stack, or request it from incoming blocks
        if the stack is empty.
        If *discard* is true, the variable isn't meant to be used anymore,
        which allows reducing the number of temporaries created (None is
        returned in that case).
        """
        if self.stack:
            self.stack_effect -= 1
            return self.stack.pop()

        # Nothing local to pop: the value lives on a predecessor's stack,
        # so the entry offset of this block moves down by one.
        self.stack_offset -= 1
        if discard:
            return None
        return self.make_incoming()

    def peek(self, k):
        """
        Return the k'th element back from the top of the stack.
        peek(1) is the top of the stack.
        """
        popped = [self.pop() for _ in range(k)]
        wanted = popped[-1]
        # Restore the stack in its original order.
        for val in reversed(popped):
            self.push(val)
        return wanted

    def make_incoming(self):
        """
        Create an incoming variable (due to not enough values being
        available on our stack) and request its assignment from our
        incoming blocks' own stacks.
        """
        assert self.incoming_blocks
        phiname = self.make_temp('phi')
        for pred in self.incoming_blocks:
            stack_index = self.stack_offset + self.stack_effect
            pred.request_outgoing(self, phiname, stack_index)
        return phiname

    def request_outgoing(self, outgoing_block, phiname, stack_index):
        """
        Request the assignment of the next available stack variable
        for block *outgoing_block* with target name *phiname*.
        """
        if phiname in self.outgoing_phis:
            # If phiname was already requested, ignore this new request
            # (can happen with a diamond-shaped block flow structure).
            return

        if stack_index >= self.stack_offset:
            # The slot belongs to this block's own stack.
            local_index = stack_index - self.stack_offset
            self.outgoing_phis[phiname] = self.stack[local_index]
            return

        # The slot is below this block's entry offset: forward the request
        # to our own predecessors.
        assert self.incoming_blocks
        for pred in self.incoming_blocks:
            pred.request_outgoing(self, phiname, stack_index)

    @property
    def tos(self):
        """The top-of-stack value, obtained by a pop followed by a push."""
        top = self.pop()
        self.push(top)
        return top

    def append(self, inst, **kws):
        """Record *inst* as an ``(offset, keyword-dict)`` pair."""
        self.insts.append((inst.offset, kws))

    @property
    def terminator(self):
        # NOTE(review): this asserts that no terminator has been set, so
        # reading after the setter ran fails the assertion -- possibly
        # meant to be ``is not None``; confirm against callers.
        assert self._term is None
        return self._term

    @terminator.setter
    def terminator(self, inst):
        self._term = inst

    @property
    def active_try_block(self):
        """Try except not supported.

        See byteflow.py
        """
        return None
class DataModel(object):
    """
    DataModel describe how a FE type is represented in the LLVM IR at
    different contexts.

    Contexts are:

    - value: representation inside function body.  Maybe stored in stack.
      The representation here are flexible.

    - data: representation used when storing into containers (e.g. arrays).

    - argument: representation used for function argument.  All composite
      types are unflattened into multiple primitive types.

    - return: representation used for return argument.

    Throughout the compiler pipeline, a LLVM value is usually passed around
    in the "value" representation.  All "as_" prefix function converts from
    "value" representation.  All "from_" prefix function converts to the
    "value" representation.

    """
    def __init__(self, dmm, fe_type):
        self._dmm = dmm
        self._fe_type = fe_type

    @property
    def fe_type(self):
        """The frontend type this model was created for."""
        return self._fe_type

    # -- LLVM types per context; everything defaults to the value type ----

    def get_value_type(self):
        raise NotImplementedError(self)

    def get_data_type(self):
        return self.get_value_type()

    def get_argument_type(self):
        """Return a LLVM type or nested tuple of LLVM type
        """
        return self.get_value_type()

    def get_return_type(self):
        return self.get_value_type()

    # -- Conversions from the value representation --------------------------

    def as_data(self, builder, value):
        raise NotImplementedError(self)

    def as_argument(self, builder, value):
        """
        Takes one LLVM value
        Return a LLVM value or nested tuple of LLVM value
        """
        raise NotImplementedError(self)

    def as_return(self, builder, value):
        raise NotImplementedError(self)

    # -- Conversions to the value representation ----------------------------

    def from_data(self, builder, value):
        raise NotImplementedError(self)

    def from_argument(self, builder, value):
        """
        Takes a LLVM value or nested tuple of LLVM value
        Returns one LLVM value
        """
        raise NotImplementedError(self)

    def from_return(self, builder, value):
        raise NotImplementedError(self)

    def load_from_data_pointer(self, builder, ptr, align=None):
        """
        Load value from a pointer to data.
        This is the default implementation, sufficient for most purposes.
        """
        loaded = builder.load(ptr, align=align)
        return self.from_data(builder, loaded)

    def traverse(self, builder):
        """
        Traverse contained members.
        Returns a iterable of contained (types, getters).
        Each getter is a one-argument function accepting a LLVM value.
        """
        return []

    def traverse_models(self):
        """
        Recursively list all models involved in this model.
        """
        lookup = self._dmm
        return [lookup[fe_type] for fe_type in self.traverse_types()]

    def traverse_types(self):
        """
        Recursively list all frontend types involved in this model,
        breadth-first and without duplicates, starting with this model's
        own type.
        """
        seen_types = [self._fe_type]
        pending = deque([self])
        while pending:
            model = pending.popleft()
            for inner in model.inner_models():
                inner_type = inner._fe_type
                if inner_type in seen_types:
                    continue
                pending.append(inner)
                seen_types.append(inner_type)
        return seen_types

    def inner_models(self):
        """
        List all *inner* models.
        """
        return []

    def get_nrt_meminfo(self, builder, value):
        """
        Returns the MemInfo object or None if it is not tracked.
        It is only defined for types.meminfo_pointer
        """
        return None

    def has_nrt_meminfo(self):
        return False

    def contains_nrt_meminfo(self):
        """
        Recursively check all contained types for need for NRT meminfo.
        """
        for model in self.traverse_models():
            if model.has_nrt_meminfo():
                return True
        return False

    def _compared_fields(self):
        # Identity of a model: its concrete class plus its frontend type.
        return (type(self), self._fe_type)

    def __hash__(self):
        return hash(tuple(self._compared_fields()))

    def __eq__(self, other):
        if type(self) is not type(other):
            return False
        return self._compared_fields() == other._compared_fields()

    def __ne__(self, other):
        return not self.__eq__(other)
@register_default(types.Boolean)
@register_default(types.BooleanLiteral)
class BooleanModel(DataModel):
    """
    Data model for booleans.

    The "value" representation is a 1-bit integer; the "data", "argument"
    and "return" representations are all an 8-bit integer.
    """
    _bit_type = ir.IntType(1)
    _byte_type = ir.IntType(8)

    def get_value_type(self):
        # 1-bit integer inside function bodies.
        return self._bit_type

    def get_data_type(self):
        # 8-bit integer when stored.
        return self._byte_type

    def get_return_type(self):
        # Returned in the same form as the "data" representation.
        return self.get_data_type()

    def get_argument_type(self):
        # Passed in the same form as the "data" representation.
        return self.get_data_type()

    def as_data(self, builder, value):
        """Zero-extend the 1-bit *value* to the 8-bit data type."""
        return builder.zext(value, self.get_data_type())

    def as_argument(self, builder, value):
        return self.as_data(builder, value)

    def as_return(self, builder, value):
        return self.as_data(builder, value)

    def from_data(self, builder, value):
        """
        Convert the stored integer *value* back to a 1-bit value.

        Zero maps to 0 and any other value maps to 1.
        """
        ty = self.get_value_type()
        # Stack slot that receives the result from whichever branch runs.
        resalloca = cgutils.alloca_once(builder, ty)
        # True when the stored integer is exactly zero.
        cond = builder.icmp_unsigned('==', value, value.type(0))
        with builder.if_else(cond) as (then, otherwise):
            with then:
                builder.store(ty(0), resalloca)
            with otherwise:
                builder.store(ty(1), resalloca)
        return builder.load(resalloca)

    def from_argument(self, builder, value):
        return self.from_data(builder, value)

    def from_return(self, builder, value):
        return self.from_data(builder, value)
class ProxyModel(DataModel):
    """
    Base class for models that forward every type query and every
    conversion to another model.

    This class never assigns ``_proxied_model`` itself; it only reads it,
    so a subclass has to set that attribute before any method is used.
    """

    # -- LLVM type queries -------------------------------------------------

    def get_value_type(self):
        target = self._proxied_model
        return target.get_value_type()

    def get_data_type(self):
        target = self._proxied_model
        return target.get_data_type()

    def get_return_type(self):
        target = self._proxied_model
        return target.get_return_type()

    def get_argument_type(self):
        target = self._proxied_model
        return target.get_argument_type()

    # -- conversions out of the "value" representation ---------------------

    def as_data(self, builder, value):
        target = self._proxied_model
        return target.as_data(builder, value)

    def as_argument(self, builder, value):
        target = self._proxied_model
        return target.as_argument(builder, value)

    def as_return(self, builder, value):
        target = self._proxied_model
        return target.as_return(builder, value)

    # -- conversions into the "value" representation -----------------------

    def from_data(self, builder, value):
        target = self._proxied_model
        return target.from_data(builder, value)

    def from_argument(self, builder, value):
        target = self._proxied_model
        return target.from_argument(builder, value)

    def from_return(self, builder, value):
        target = self._proxied_model
        return target.from_return(builder, value)
@register_default(types.Opaque)
@register_default(types.PyObject)
@register_default(types.RawPointer)
@register_default(types.NoneType)
@register_default(types.StringLiteral)
@register_default(types.EllipsisType)
@register_default(types.Function)
@register_default(types.Type)
@register_default(types.Object)
@register_default(types.Module)
@register_default(types.Phantom)
@register_default(types.ContextManager)
@register_default(types.Dispatcher)
@register_default(types.ObjModeDispatcher)
@register_default(types.ExceptionClass)
@register_default(types.Dummy)
@register_default(types.ExceptionInstance)
@register_default(types.ExternalFunction)
@register_default(types.EnumClass)
@register_default(types.IntEnumClass)
@register_default(types.NumberClass)
@register_default(types.TypeRef)
@register_default(types.NamedTupleClass)
@register_default(types.DType)
@register_default(types.RecursiveCall)
@register_default(types.MakeFunctionLiteral)
@register_default(types.Poison)
class OpaqueModel(PrimitiveModel):
    """
    Primitive model whose backend type is a pointer to an 8-bit integer,
    i.e. values of the registered types are passed as opaque pointers.
    """
    # Shared backend type for every opaque value.
    _ptr_type = ir.IntType(8).as_pointer()

    def __init__(self, dmm, fe_type):
        # The backend type does not depend on *fe_type*.
        super().__init__(dmm, fe_type, self._ptr_type)
@register_default(types.ExternalFunctionPointer)
class ExternalFuncPointerModel(PrimitiveModel):
    """
    Primitive model whose backend type is a pointer to an LLVM function
    type built from the frontend type's signature.
    """
    def __init__(self, dmm, fe_type):
        signature = fe_type.sig
        # The function is non-Numba, so neither its arguments nor its
        # return value are adapted: the plain "value" types are used.
        return_ty = dmm.lookup(signature.return_type).get_value_type()
        arg_tys = []
        for arg in signature.args:
            arg_tys.append(dmm.lookup(arg).get_value_type())
        fn_ptr_ty = ir.PointerType(ir.FunctionType(return_ty, arg_tys))
        super().__init__(dmm, fe_type, fn_ptr_ty)
class StructModel(CompositeModel):
    """
    Model for a frontend type represented as an LLVM literal struct with
    one named field per member.

    *members* is a sequence of ``(field name, frontend type)`` pairs; the
    model of every member type is looked up through *dmm* at construction.
    """
    # Lazily built LLVM struct types, cached per instance on first use.
    _value_type = None
    _data_type = None

    def __init__(self, dmm, fe_type, members):
        super(StructModel, self).__init__(dmm, fe_type)
        if members:
            # Split the pairs into parallel tuples of names and types.
            self._fields, self._members = zip(*members)
        else:
            self._fields = self._members = ()
        self._models = tuple([self._dmm.lookup(t) for t in self._members])

    def get_member_fe_type(self, name):
        """
        StructModel-specific: get the Numba type of the field named *name*.
        """
        pos = self.get_field_position(name)
        return self._members[pos]

    def get_value_type(self):
        """Return (and cache) the struct of the members' "value" types."""
        if self._value_type is None:
            self._value_type = ir.LiteralStructType([t.get_value_type()
                                                    for t in self._models])
        return self._value_type

    def get_data_type(self):
        """Return (and cache) the struct of the members' "data" types."""
        if self._data_type is None:
            self._data_type = ir.LiteralStructType([t.get_data_type()
                                                   for t in self._models])
        return self._data_type

    def get_argument_type(self):
        """Return a tuple with each member's argument type."""
        return tuple([t.get_argument_type() for t in self._models])

    def get_return_type(self):
        """Structs are returned in their "data" representation."""
        return self.get_data_type()

    def _as(self, methname, builder, value):
        # Apply the member model's *methname* ("as_*") conversion to every
        # field extracted from the struct *value*.
        extracted = []
        for i, dm in enumerate(self._models):
            extracted.append(getattr(dm, methname)(builder,
                                                   self.get(builder, value, i)))
        return tuple(extracted)

    def _from(self, methname, builder, value):
        # Apply the member model's *methname* ("from_*") conversion to each
        # item of *value* and assemble the results into a "value" struct.
        struct = ir.Constant(self.get_value_type(), ir.Undefined)

        for i, (dm, val) in enumerate(zip(self._models, value)):
            v = getattr(dm, methname)(builder, val)
            struct = self.set(builder, struct, v, i)

        return struct

    def as_data(self, builder, value):
        """
        Converts the LLVM struct in `value` into a representation suited for
        storing into arrays.

        Note
        ----
        Current implementation rarely changes how types are represented for
        "value" and "data".  This is usually a pointless rebuild of the
        immutable LLVM struct value.  Luckily, LLVM optimization removes all
        redundancy.

        Sample usecase: Structures nested with pointers to other structures
        that can be serialized into a flat representation when storing into
        array.
        """
        elems = self._as("as_data", builder, value)
        struct = ir.Constant(self.get_data_type(), ir.Undefined)
        for i, el in enumerate(elems):
            struct = builder.insert_value(struct, el, [i])
        return struct

    def from_data(self, builder, value):
        """
        Convert from "data" representation back into "value" representation.
        Usually invoked when loading from array.

        See notes in `as_data()`
        """
        vals = [builder.extract_value(value, [i])
                for i in range(len(self._members))]
        return self._from("from_data", builder, vals)

    def load_from_data_pointer(self, builder, ptr, align=None):
        """
        Load every field through its own model's ``load_from_data_pointer``
        and assemble the results into a "value" struct.
        """
        values = []
        for i, model in enumerate(self._models):
            elem_ptr = cgutils.gep_inbounds(builder, ptr, 0, i)
            val = model.load_from_data_pointer(builder, elem_ptr, align)
            values.append(val)

        struct = ir.Constant(self.get_value_type(), ir.Undefined)
        for i, val in enumerate(values):
            struct = self.set(builder, struct, val, i)
        return struct

    def as_argument(self, builder, value):
        """Return a tuple with each field converted by ``as_argument``."""
        return self._as("as_argument", builder, value)

    def from_argument(self, builder, value):
        """Rebuild the "value" struct from per-field argument values."""
        return self._from("from_argument", builder, value)

    def as_return(self, builder, value):
        """Convert to the return representation (same as ``as_data``)."""
        elems = self._as("as_data", builder, value)
        struct = ir.Constant(self.get_data_type(), ir.Undefined)
        for i, el in enumerate(elems):
            struct = builder.insert_value(struct, el, [i])
        return struct

    def from_return(self, builder, value):
        """Convert from the return representation (same as ``from_data``)."""
        vals = [builder.extract_value(value, [i])
                for i in range(len(self._members))]
        return self._from("from_data", builder, vals)

    def get(self, builder, val, pos):
        """Get a field at the given position or the fieldname

        Args
        ----
        builder:
            LLVM IRBuilder
        val:
            LLVM struct value to extract from
        pos: int or str
            field index or field name

        Returns
        -------
        Extracted value
        """
        if isinstance(pos, str):
            pos = self.get_field_position(pos)
        return builder.extract_value(val, [pos],
                                     name="extracted." + self._fields[pos])

    def set(self, builder, stval, val, pos):
        """Set a field at the given position or the fieldname

        Args
        ----
        builder:
            LLVM IRBuilder
        stval:
            LLVM struct value
        val:
            value to be inserted
        pos: int or str
            field index or field name

        Returns
        -------
        A new LLVM struct with the value inserted
        """
        if isinstance(pos, str):
            pos = self.get_field_position(pos)
        return builder.insert_value(stval, val, [pos],
                                    name="inserted." + self._fields[pos])

    def get_field_position(self, field):
        """Return the index of the field named *field*.

        Raises
        ------
        KeyError
            if the struct has no field with that name
        """
        try:
            return self._fields.index(field)
        except ValueError:
            raise KeyError("%s does not have a field named %r"
                           % (self.__class__.__name__, field))

    @property
    def field_count(self):
        """Number of fields in the struct."""
        return len(self._fields)

    def get_type(self, pos):
        """Get the frontend type (numba type) of a field given the position
         or the fieldname

        Args
        ----
        pos: int or str
            field index or field name
        """
        if isinstance(pos, str):
            pos = self.get_field_position(pos)
        return self._members[pos]

    def get_model(self, pos):
        """
        Get the datamodel of a field given the position or the fieldname.

        Args
        ----
        pos: int or str
            field index or field name
        """
        # Resolve names exactly like ``get_type`` does; indexing the tuple
        # of models with a str would otherwise raise TypeError.
        if isinstance(pos, str):
            pos = self.get_field_position(pos)
        return self._models[pos]

    def traverse(self, builder):
        """
        Return one ``(frontend type, getter)`` pair per field.

        Each getter checks that its argument has this model's "value" type
        (raising TypeError otherwise) before extracting the field.
        """
        def getter(k, value):
            if value.type != self.get_value_type():
                args = self.get_value_type(), value.type
                raise TypeError("expecting {0} but got {1}".format(*args))
            return self.get(builder, value, k)

        return [(self.get_type(k), partial(getter, k)) for k in self._fields]

    def inner_models(self):
        """Return the models of all fields."""
        return self._models
@register_default(types.List)
class ListModel(StructModel):
    """Struct model for lists, with members ``meminfo`` and ``parent``."""
    def __init__(self, dmm, fe_type):
        # The meminfo data points to a ListPayload.
        meminfo_type = types.MemInfoPointer(types.ListPayload(fe_type))
        # 'parent' is only used for reflected lists.
        parent_type = types.pyobject
        fields = [('meminfo', meminfo_type), ('parent', parent_type)]
        super().__init__(dmm, fe_type, fields)
@register_default(types.Array)
@register_default(types.Buffer)
@register_default(types.ByteArray)
@register_default(types.Bytes)
@register_default(types.MemoryView)
@register_default(types.PyArray)
class ArrayModel(StructModel):
    """
    Struct model for array-like types, with members ``meminfo``,
    ``parent``, ``nitems``, ``itemsize``, ``data``, ``shape`` and
    ``strides``.
    """
    def __init__(self, dmm, fe_type):
        rank = fe_type.ndim
        elem_type = fe_type.dtype
        fields = []
        fields.append(('meminfo', types.MemInfoPointer(elem_type)))
        fields.append(('parent', types.pyobject))
        fields.append(('nitems', types.intp))
        fields.append(('itemsize', types.intp))
        fields.append(('data', types.CPointer(elem_type)))
        # One intp per dimension for both shape and strides.
        fields.append(('shape', types.UniTuple(types.intp, rank)))
        fields.append(('strides', types.UniTuple(types.intp, rank)))
        super().__init__(dmm, fe_type, fields)
@register_default(types.Optional)
class OptionalModel(StructModel):
    """
    Struct model for optional types, with members ``data`` (the wrapped
    type) and ``valid`` (a boolean).

    The return representation is delegated to the wrapped type's model;
    ``as_return`` is not implemented.
    """
    def __init__(self, dmm, fe_type):
        fields = [('data', fe_type.type), ('valid', types.boolean)]
        # Model of the wrapped type, used for the return representation.
        self._value_model = dmm.lookup(fe_type.type)
        super().__init__(dmm, fe_type, fields)

    def get_return_type(self):
        wrapped = self._value_model
        return wrapped.get_return_type()

    def as_return(self, builder, value):
        raise NotImplementedError

    def from_return(self, builder, value):
        wrapped = self._value_model
        return wrapped.from_return(builder, value)

    def traverse(self, builder):
        """
        Return ``(type, getter)`` pairs for ``data`` and ``valid``.

        The ``data`` getter selects the payload when ``valid`` is set and a
        ``None`` constant of the payload's type otherwise.
        """
        def get_valid(value):
            return self.get(builder, value, "valid")

        def get_data(value):
            # The valid flag is extracted before the payload.
            is_valid = get_valid(value)
            payload = self.get(builder, value, "data")
            fallback = ir.Constant(payload.type, None)
            return builder.select(is_valid, payload, fallback)

        data_entry = (self.get_type("data"), get_data)
        valid_entry = (self.get_type("valid"), get_valid)
        return [data_entry, valid_entry]
@register_default(types.UnicodeCharSeq)
class UnicodeCharSeq(DataModel):
    """
    Model representing the type as an LLVM array of ``fe_type.count``
    integers, each ``numpy_support.sizeof_unicode_char * 8`` bits wide.

    The same array type is used for the "value" and "data"
    representations, and every conversion returns its input unchanged.
    """
    def __init__(self, dmm, fe_type):
        super().__init__(dmm, fe_type)
        bits_per_char = numpy_support.sizeof_unicode_char * 8
        self._be_type = ir.ArrayType(ir.IntType(bits_per_char), fe_type.count)

    # -- LLVM types ---------------------------------------------------------

    def get_value_type(self):
        return self._be_type

    def get_data_type(self):
        return self._be_type

    # -- conversions (all identity) ------------------------------------------

    def as_data(self, builder, value):
        return value

    def from_data(self, builder, value):
        return value

    def as_return(self, builder, value):
        return value

    def from_return(self, builder, value):
        return value

    def as_argument(self, builder, value):
        return value

    def from_argument(self, builder, value):
        return value
class FlatIter(StructModel):
    """
    Struct model with members ``array``, ``pointers``, ``indices`` and
    ``exhausted``, built from ``fe_type.array_type``.
    """
    def __init__(self, dmm, fe_type):
        arr_ty = fe_type.array_type
        elem_ty = arr_ty.dtype
        rank = arr_ty.ndim
        # 'pointers' and 'indices' each hold ``rank`` entries.
        pointer_slots = types.EphemeralArray(types.CPointer(elem_ty), rank)
        index_slots = types.EphemeralArray(types.intp, rank)
        done_flag = types.EphemeralPointer(types.boolean)
        fields = [
            ('array', arr_ty),
            ('pointers', pointer_slots),
            ('indices', index_slots),
            ('exhausted', done_flag),
        ]
        super().__init__(dmm, fe_type, fields)
@register_default(types.RangeIteratorType)
class RangeIteratorType(StructModel):
    """
    Struct model for range iterators, with members ``iter``, ``stop``,
    ``step`` and ``count``, all based on the iterator's yield type.
    """
    def __init__(self, dmm, fe_type):
        yielded = fe_type.yield_type
        fields = []
        # 'iter' and 'count' are ephemeral pointers to the yield type;
        # 'stop' and 'step' are stored directly.
        fields.append(('iter', types.EphemeralPointer(yielded)))
        fields.append(('stop', yielded))
        fields.append(('step', yielded))
        fields.append(('count', types.EphemeralPointer(yielded)))
        super().__init__(dmm, fe_type, fields)
@register_default(types.NumpyFlatType)
def handle_numpy_flat_type(dmm, ty):
    """
    Pick the model for a flat-iterator type from its array layout:
    ``CContiguousFlatIter`` (without indices) for layout 'C', otherwise
    ``FlatIter``.
    """
    if ty.array_type.layout != 'C':
        return FlatIter(dmm, ty)
    return CContiguousFlatIter(dmm, ty, need_indices=False)
@register_default(types.NumpyNdIterType)
class NdIter(StructModel):
    """
    Struct model with the fixed members ``exhausted``, ``arrays``,
    ``shape`` and ``indices``, plus one ``index<i>`` member per 'flat'
    indexer and one ``scalar<i>`` member per entry of ``fe_type.arrays``
    that is not a ``types.Array``.
    """
    def __init__(self, dmm, fe_type):
        array_types = fe_type.arrays
        ndim = fe_type.ndim
        # Without shaped indexing a single shape/index slot is used.
        shape_len = ndim if fe_type.need_shaped_indexing else 1
        members = [('exhausted', types.EphemeralPointer(types.boolean)),
                   ('arrays', types.Tuple(array_types)),
                   # The iterator's main shape and indices
                   ('shape', types.UniTuple(types.intp, shape_len)),
                   ('indices', types.EphemeralArray(types.intp, shape_len)),
                   ]
        # Indexing state for the various sub-iterators
        # XXX use a tuple instead?
        for i, sub in enumerate(fe_type.indexers):
            # Only the kind is needed here; unpacking all four items keeps
            # the check that every indexer has exactly four entries.
            kind, _start_dim, _end_dim, _ = sub
            member_name = 'index%d' % i
            if kind == 'flat':
                # A single index into the flattened array
                members.append((member_name, types.EphemeralPointer(types.intp)))
            elif kind in ('scalar', 'indexed', '0d'):
                # Nothing required
                pass
            else:
                # Raised explicitly (instead of ``assert 0``) so an unknown
                # kind is still rejected when Python runs with -O.
                raise AssertionError("unknown indexer kind: %r" % (kind,))
        # Slots holding values of the scalar args
        # XXX use a tuple instead?
        for i, ty in enumerate(fe_type.arrays):
            if not isinstance(ty, types.Array):
                member_name = 'scalar%d' % i
                members.append((member_name, types.EphemeralPointer(ty)))

        super(NdIter, self).__init__(dmm, fe_type, members)
class DataPacker(object):
    """
    Helper that packs several typed arguments into one data structure.

    Positions whose frontend type is an instance of ``types.Omitted``
    are left out of the packed structure.
    """
    # XXX should DataPacker be a model for a dedicated type?

    def __init__(self, dmm, fe_types):
        self._dmm = dmm
        self._fe_types = fe_types
        self._models = [dmm.lookup(fe_type) for fe_type in fe_types]

        # _pack_map[k] is the formal argument position stored in packed
        # slot k; _be_types[k] is the data type of that slot.
        self._pack_map = [pos for pos, fe_type in enumerate(fe_types)
                          if not isinstance(fe_type, types.Omitted)]
        self._be_types = [self._models[pos].get_data_type()
                          for pos in self._pack_map]

    def as_data(self, builder, values):
        """
        Pack *values* (indexed by formal position) into an anonymous struct.
        """
        packed = []
        for pos in self._pack_map:
            model = self._models[pos]
            packed.append(model.as_data(builder, values[pos]))
        return cgutils.make_anonymous_struct(builder, packed)

    def _do_load(self, builder, ptr, formal_list=None):
        """
        Load every packed slot from *ptr*.

        When *formal_list* is None, return a list of (type, value) pairs.
        Otherwise store each value at its formal position in *formal_list*
        and return an empty list.
        """
        loaded = []
        for slot, pos in enumerate(self._pack_map):
            slot_ptr = cgutils.gep_inbounds(builder, ptr, 0, slot)
            value = self._models[pos].load_from_data_pointer(builder,
                                                             slot_ptr)
            if formal_list is not None:
                formal_list[pos] = value
            else:
                loaded.append((self._fe_types[pos], value))
        return loaded

    def load(self, builder, ptr):
        """
        Load the packed values and return a list of (type, value) tuples.
        """
        return self._do_load(builder, ptr)

    def load_into(self, builder, ptr, formal_list):
        """
        Load the packed values into a sequence indexed by formal
        argument number (skipping any Omitted position).
        """
        self._do_load(builder, ptr, formal_list)
def _flatten(iterable):
    """
    Lazily yield the leaves of an arbitrarily nested tuple/list structure.

    Only ``tuple`` and ``list`` instances are descended into; any other
    object (including strings) is yielded as-is. Empty nested sequences
    contribute nothing.
    """
    # Explicit stack of live iterators, innermost last.
    pending = [iter(iterable)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, (tuple, list)):
                # Descend; the outer iterator resumes where it stopped
                # once the nested one is exhausted.
                pending.append(iter(item))
                break
            yield item
        else:
            pending.pop()
def register(dmm, typecls):
    """Build a decorator that registers a data model on *dmm*.

    The decorator calls ``dmm.register(typecls, obj)`` for the decorated
    object and hands the object back unchanged, so decorators can be
    stacked.
    """
    def decorator(model):
        dmm.register(typecls, model)
        return model

    return decorator
+ """ + fe_type = NotImplemented + + def setUp(self): + self.module = ir.Module() + self.datamodel = datamodel.default_manager[self.fe_type] + + def test_as_arg(self): + """ + - Is as_arg() and from_arg() implemented? + - Are they the inverse of each other? + """ + fnty = ir.FunctionType(ir.VoidType(), []) + function = ir.Function(self.module, fnty, name="test_as_arg") + builder = ir.IRBuilder() + builder.position_at_end(function.append_basic_block()) + + undef_value = ir.Constant(self.datamodel.get_value_type(), None) + args = self.datamodel.as_argument(builder, undef_value) + self.assertIsNot(args, NotImplemented, "as_argument returned " + "NotImplementedError") + + if isinstance(args, (tuple, list)): + def recur_tuplize(args, func=None): + for arg in args: + if isinstance(arg, (tuple, list)): + yield tuple(recur_tuplize(arg, func=func)) + else: + if func is None: + yield arg + else: + yield func(arg) + + argtypes = tuple(recur_tuplize(args, func=lambda x: x.type)) + exptypes = tuple(recur_tuplize( + self.datamodel.get_argument_type())) + self.assertEqual(exptypes, argtypes) + else: + self.assertEqual(args.type, + self.datamodel.get_argument_type()) + + rev_value = self.datamodel.from_argument(builder, args) + self.assertEqual(rev_value.type, self.datamodel.get_value_type()) + + builder.ret_void() # end function + + # Ensure valid LLVM generation + materialized = ll.parse_assembly(str(self.module)) + str(materialized) + + def test_as_return(self): + """ + - Is as_return() and from_return() implemented? + - Are they the inverse of each other? 
class SupportAsDataMixin(object):
    """Mixin checking the as_data() / from_data() pair of a data model.
    """
    # XXX test load_from_data_pointer() as well

    def test_as_data(self):
        model = self.datamodel

        # Emit into a fresh `void test_as_data()` function.
        void_fn_type = ir.FunctionType(ir.VoidType(), [])
        function = ir.Function(self.module, void_fn_type,
                               name="test_as_data")
        builder = ir.IRBuilder()
        builder.position_at_end(function.append_basic_block())

        undef_value = ir.Constant(model.get_value_type(), None)

        # value -> data must yield the declared data type ...
        data = model.as_data(builder, undef_value)
        self.assertIsNot(data, NotImplemented,
                         "as_data returned NotImplemented")
        self.assertEqual(data.type, model.get_data_type())

        # ... and data -> value must get back to the value type.
        rev_value = model.from_data(builder, data)
        self.assertEqual(rev_value.type, model.get_value_type())

        builder.ret_void()  # end function

        # Ensure valid LLVM generation
        materialized = ll.parse_assembly(str(self.module))
        str(materialized)
def test_factory(support_as_data=True):
    """Return the TestCase class matching the model's as_data() support.

    A truthy *support_as_data* selects the tester that exercises
    as_data()/from_data(); otherwise the tester expecting
    NotImplementedError from them is returned.
    """
    return (DataModelTester_SupportAsDataMixin if support_as_data
            else DataModelTester_NotSupportAsDataMixin)
@contextmanager
def suspend_emission(builder):
    """Temporarily clear ``builder.debug_metadata``.

    The attribute is set to None while the managed block runs and is put
    back to its previous value on exit, including when the block raises.
    """
    saved_metadata = builder.debug_metadata
    builder.debug_metadata = None
    try:
        yield
    finally:
        # Always restore, even on error inside the block.
        builder.debug_metadata = saved_metadata
+ """ + pass + + +class DummyDIBuilder(AbstractDIBuilder): + + def __init__(self, module, filepath, cgctx): + pass + + def mark_variable(self, builder, allocavalue, name, lltype, size, line, + datamodel=None, argidx=None): + pass + + def mark_location(self, builder, line): + pass + + def mark_subprogram(self, function, qualname, argnames, argtypes, line): + pass + + def initialize(self): + pass + + def finalize(self): + pass + + +_BYTE_SIZE = 8 + + +class DIBuilder(AbstractDIBuilder): + DWARF_VERSION = 4 + DEBUG_INFO_VERSION = 3 + DBG_CU_NAME = 'llvm.dbg.cu' + _DEBUG = False + + def __init__(self, module, filepath, cgctx): + self.module = module + self.filepath = os.path.abspath(filepath) + self.difile = self._di_file() + self.subprograms = [] + self.cgctx = cgctx + self.initialize() + + def initialize(self): + # Create the compile unit now because it is referenced when + # constructing subprograms + self.dicompileunit = self._di_compile_unit() + + def _var_type(self, lltype, size, datamodel=None): + if self._DEBUG: + print("-->", lltype, size, datamodel, + getattr(datamodel, 'fe_type', 'NO FE TYPE')) + m = self.module + bitsize = _BYTE_SIZE * size + + int_type = ir.IntType, + real_type = ir.FloatType, ir.DoubleType + # For simple numeric types, choose the closest encoding. + # We treat all integers as unsigned when there's no known datamodel. 
+ if isinstance(lltype, int_type + real_type): + if datamodel is None: + # This is probably something like an `i8*` member of a struct + name = str(lltype) + if isinstance(lltype, int_type): + ditok = 'DW_ATE_unsigned' + else: + ditok = 'DW_ATE_float' + else: + # This is probably a known int/float scalar type + name = str(datamodel.fe_type) + if isinstance(datamodel.fe_type, types.Integer): + if datamodel.fe_type.signed: + ditok = 'DW_ATE_signed' + else: + ditok = 'DW_ATE_unsigned' + else: + ditok = 'DW_ATE_float' + mdtype = m.add_debug_info('DIBasicType', { + 'name': name, + 'size': bitsize, + 'encoding': ir.DIToken(ditok), + }) + elif isinstance(datamodel, ComplexModel): + # TODO: Is there a better way of determining "this is a complex + # number"? + # + # NOTE: Commented below is the way to generate the metadata for a + # C99 complex type that's directly supported by DWARF. Numba however + # generates a struct with real/imag cf. CPython to give a more + # pythonic feel to inspection. 
+ # + # mdtype = m.add_debug_info('DIBasicType', { + # 'name': f"{datamodel.fe_type} ({str(lltype)})", + # 'size': bitsize, + # 'encoding': ir.DIToken('DW_ATE_complex_float'), + #}) + meta = [] + offset = 0 + for ix, name in enumerate(('real', 'imag')): + component = lltype.elements[ix] + component_size = self.cgctx.get_abi_sizeof(component) + component_basetype = m.add_debug_info('DIBasicType', { + 'name': str(component), + 'size': _BYTE_SIZE * component_size, # bits + 'encoding': ir.DIToken('DW_ATE_float'), + }) + derived_type = m.add_debug_info('DIDerivedType', { + 'tag': ir.DIToken('DW_TAG_member'), + 'name': name, + 'baseType': component_basetype, + 'size': _BYTE_SIZE * component_size, # DW_TAG_member size is in bits + 'offset': offset, + }) + meta.append(derived_type) + offset += (_BYTE_SIZE * component_size) # offset is in bits + mdtype = m.add_debug_info('DICompositeType', { + 'tag': ir.DIToken('DW_TAG_structure_type'), + 'name': f"{datamodel.fe_type} ({str(lltype)})", + 'identifier': str(lltype), + 'elements': m.add_metadata(meta), + 'size': offset, + }, is_distinct=True) + elif isinstance(datamodel, UniTupleModel): + element = lltype.element + el_size = self.cgctx.get_abi_sizeof(element) + basetype = self._var_type(element, el_size) + name = f"{datamodel.fe_type} ({str(lltype)})" + count = size // el_size + mdrange = m.add_debug_info('DISubrange', { + 'count': count, + }) + mdtype = m.add_debug_info('DICompositeType', { + 'tag': ir.DIToken('DW_TAG_array_type'), + 'baseType': basetype, + 'name': name, + 'size': bitsize, + 'identifier': str(lltype), + 'elements': m.add_metadata([mdrange]), + }) + elif isinstance(lltype, ir.PointerType): + model = getattr(datamodel, '_pointee_model', None) + basetype = self._var_type(lltype.pointee, + self.cgctx.get_abi_sizeof(lltype.pointee), + model) + mdtype = m.add_debug_info('DIDerivedType', { + 'tag': ir.DIToken('DW_TAG_pointer_type'), + 'baseType': basetype, + 'size': _BYTE_SIZE * self.cgctx.get_abi_sizeof(lltype) + 
}) + elif isinstance(lltype, ir.LiteralStructType): + # Struct type + meta = [] + offset = 0 + if datamodel is None or not datamodel.inner_models(): + name = f"Anonymous struct ({str(lltype)})" + for field_id, element in enumerate(lltype.elements): + size = self.cgctx.get_abi_sizeof(element) + basetype = self._var_type(element, size) + derived_type = m.add_debug_info('DIDerivedType', { + 'tag': ir.DIToken('DW_TAG_member'), + 'name': f'', + 'baseType': basetype, + 'size': _BYTE_SIZE * size, # DW_TAG_member size is in bits + 'offset': offset, + }) + meta.append(derived_type) + offset += (_BYTE_SIZE * size) # offset is in bits + else: + name = f"{datamodel.fe_type} ({str(lltype)})" + for element, field, model in zip(lltype.elements, + datamodel._fields, + datamodel.inner_models()): + size = self.cgctx.get_abi_sizeof(element) + basetype = self._var_type(element, size, datamodel=model) + derived_type = m.add_debug_info('DIDerivedType', { + 'tag': ir.DIToken('DW_TAG_member'), + 'name': field, + 'baseType': basetype, + 'size': _BYTE_SIZE * size, # DW_TAG_member size is in bits + 'offset': offset, + }) + meta.append(derived_type) + offset += (_BYTE_SIZE * size) # offset is in bits + + mdtype = m.add_debug_info('DICompositeType', { + 'tag': ir.DIToken('DW_TAG_structure_type'), + 'name': name, + 'identifier': str(lltype), + 'elements': m.add_metadata(meta), + 'size': offset, + }, is_distinct=True) + elif isinstance(lltype, ir.ArrayType): + element = lltype.element + el_size = self.cgctx.get_abi_sizeof(element) + basetype = self._var_type(element, el_size) + count = size // el_size + mdrange = m.add_debug_info('DISubrange', { + 'count': count, + }) + mdtype = m.add_debug_info('DICompositeType', { + 'tag': ir.DIToken('DW_TAG_array_type'), + 'baseType': basetype, + 'name': str(lltype), + 'size': bitsize, + 'identifier': str(lltype), + 'elements': m.add_metadata([mdrange]), + }) + else: + # For all other types, describe it as sequence of bytes + count = size + mdrange = 
m.add_debug_info('DISubrange', { + 'count': count, + }) + mdbase = m.add_debug_info('DIBasicType', { + 'name': 'byte', + 'size': _BYTE_SIZE, + 'encoding': ir.DIToken('DW_ATE_unsigned_char'), + }) + mdtype = m.add_debug_info('DICompositeType', { + 'tag': ir.DIToken('DW_TAG_array_type'), + 'baseType': mdbase, + 'name': str(lltype), + 'size': bitsize, + 'identifier': str(lltype), + 'elements': m.add_metadata([mdrange]), + }) + + return mdtype + + def mark_variable(self, builder, allocavalue, name, lltype, size, line, + datamodel=None, argidx=None): + + arg_index = 0 if argidx is None else argidx + m = self.module + fnty = ir.FunctionType(ir.VoidType(), [ir.MetaDataType()] * 3) + decl = cgutils.get_or_insert_function(m, fnty, 'llvm.dbg.declare') + + mdtype = self._var_type(lltype, size, datamodel=datamodel) + name = name.replace('.', '$') # for gdb to work correctly + mdlocalvar = m.add_debug_info('DILocalVariable', { + 'name': name, + 'arg': arg_index, + 'scope': self.subprograms[-1], + 'file': self.difile, + 'line': line, + 'type': mdtype, + }) + mdexpr = m.add_debug_info('DIExpression', {}) + + return builder.call(decl, [allocavalue, mdlocalvar, mdexpr]) + + def mark_location(self, builder, line): + builder.debug_metadata = self._add_location(line) + + def mark_subprogram(self, function, qualname, argnames, argtypes, line): + name = qualname + argmap = dict(zip(argnames, argtypes)) + di_subp = self._add_subprogram(name=name, linkagename=function.name, + line=line, function=function, + argmap=argmap) + function.set_metadata("dbg", di_subp) + + # Don't marked alwaysinline functions as noinline. 
+ if 'alwaysinline' not in function.attributes: + # disable inlining for this function for easier debugging + function.attributes.add('noinline') + + def finalize(self): + dbgcu = cgutils.get_or_insert_named_metadata(self.module, self.DBG_CU_NAME) + dbgcu.add(self.dicompileunit) + self._set_module_flags() + + # + # Internal APIs + # + + def _set_module_flags(self): + """Set the module flags metadata + """ + module = self.module + mflags = cgutils.get_or_insert_named_metadata(module, 'llvm.module.flags') + # Set *require* behavior to warning + # See http://llvm.org/docs/LangRef.html#module-flags-metadata + require_warning_behavior = self._const_int(2) + if self.DWARF_VERSION is not None: + dwarf_version = module.add_metadata([ + require_warning_behavior, + "Dwarf Version", + self._const_int(self.DWARF_VERSION) + ]) + if dwarf_version not in mflags.operands: + mflags.add(dwarf_version) + debuginfo_version = module.add_metadata([ + require_warning_behavior, + "Debug Info Version", + self._const_int(self.DEBUG_INFO_VERSION) + ]) + if debuginfo_version not in mflags.operands: + mflags.add(debuginfo_version) + + def _add_subprogram(self, name, linkagename, line, function, argmap): + """Emit subprogram metadata + """ + subp = self._di_subprogram(name, linkagename, line, function, argmap) + self.subprograms.append(subp) + return subp + + def _add_location(self, line): + """Emit location metatdaa + """ + loc = self._di_location(line) + return loc + + @classmethod + def _const_int(cls, num, bits=32): + """Util to create constant int in metadata + """ + return ir.IntType(bits)(num) + + @classmethod + def _const_bool(cls, boolean): + """Util to create constant boolean in metadata + """ + return ir.IntType(1)(boolean) + + # + # Helpers to emit the metadata nodes + # + + def _di_file(self): + return self.module.add_debug_info('DIFile', { + 'directory': os.path.dirname(self.filepath), + 'filename': os.path.basename(self.filepath), + }) + + def _di_compile_unit(self): + return 
self.module.add_debug_info('DICompileUnit', { + 'language': ir.DIToken('DW_LANG_C_plus_plus'), + 'file': self.difile, + # Numba has to pretend to be clang to ensure the prologue is skipped + # correctly in gdb. See: + # https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=gdb/amd64-tdep.c;h=e563d369d8cb3eb3c2f732c2fa850ec70ba8d63b;hb=a4b0231e179607e47b1cdf1fe15c5dc25e482fad#l2521 + # Note the "producer_is_llvm" call to specialise the prologue + # handling, this is defined here: + # https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=gdb/producer.c;h=cdfd80d904c09394febd18749bb90359b2d128cc;hb=a4b0231e179607e47b1cdf1fe15c5dc25e482fad#l124 + # and to get a match for this condition the 'producer' must start + # with "clang ", hence the following... + 'producer': 'clang (Numba)', + 'runtimeVersion': 0, + 'isOptimized': config.OPT != 0, + 'emissionKind': 1, # 0-NoDebug, 1-FullDebug + }, is_distinct=True) + + def _di_subroutine_type(self, line, function, argmap): + # The function call conv needs encoding. 
+ llfunc = function + md = [] + + for idx, llarg in enumerate(llfunc.args): + if not llarg.name.startswith('arg.'): + name = llarg.name.replace('.', '$') # for gdb to work correctly + lltype = llarg.type + size = self.cgctx.get_abi_sizeof(lltype) + mdtype = self._var_type(lltype, size, datamodel=None) + md.append(mdtype) + + for idx, (name, nbtype) in enumerate(argmap.items()): + name = name.replace('.', '$') # for gdb to work correctly + datamodel = self.cgctx.data_model_manager[nbtype] + lltype = self.cgctx.get_value_type(nbtype) + size = self.cgctx.get_abi_sizeof(lltype) + mdtype = self._var_type(lltype, size, datamodel=datamodel) + md.append(mdtype) + + return self.module.add_debug_info('DISubroutineType', { + 'types': self.module.add_metadata(md), + }) + + def _di_subprogram(self, name, linkagename, line, function, argmap): + return self.module.add_debug_info('DISubprogram', { + 'name': name, + 'linkageName': linkagename, + 'scope': self.difile, + 'file': self.difile, + 'line': line, + 'type': self._di_subroutine_type(line, function, argmap), + 'isLocal': False, + 'isDefinition': True, + 'scopeLine': line, + 'isOptimized': config.OPT != 0, + 'unit': self.dicompileunit, + }, is_distinct=True) + + def _di_location(self, line): + return self.module.add_debug_info('DILocation', { + 'line': line, + 'column': 1, + 'scope': self.subprograms[-1], + }) + + +class NvvmDIBuilder(DIBuilder): + """ + Only implemented the minimal metadata to get line number information. 
+ See http://llvm.org/releases/3.4/docs/LangRef.html + """ + # These constants are copied from llvm3.4 + DW_LANG_Python = 0x0014 + DI_Compile_unit = 786449 + DI_Subroutine_type = 786453 + DI_Subprogram = 786478 + DI_File = 786473 + + DWARF_VERSION = None # don't emit DWARF version + DEBUG_INFO_VERSION = 1 # as required by NVVM IR Spec + # Rename DIComputeUnit MD to hide it from llvm.parse_assembly() + # which strips invalid/outdated debug metadata + DBG_CU_NAME = 'numba.llvm.dbg.cu' + + # Default member + # Used in mark_location to remember last lineno to avoid duplication + _last_lineno = None + + def mark_variable(self, builder, allocavalue, name, lltype, size, line, + datamodel=None, argidx=None): + # unsupported + pass + + def mark_location(self, builder, line): + # Avoid duplication + if self._last_lineno == line: + return + self._last_lineno = line + # Add call to an inline asm to mark line location + asmty = ir.FunctionType(ir.VoidType(), []) + asm = ir.InlineAsm(asmty, "// dbg {}".format(line), "", + side_effect=True) + call = builder.call(asm, []) + md = self._di_location(line) + call.set_metadata('numba.dbg', md) + + def mark_subprogram(self, function, qualname, argnames, argtypes, line): + argmap = dict(zip(argnames, argtypes)) + self._add_subprogram(name=qualname, linkagename=function.name, + line=line) + + def _add_subprogram(self, name, linkagename, line): + """Emit subprogram metadata + """ + subp = self._di_subprogram(name, linkagename, line) + self.subprograms.append(subp) + return subp + + # + # Helper methods to create the metadata nodes. 
+ # + + def _filepair(self): + return self.module.add_metadata([ + os.path.basename(self.filepath), + os.path.dirname(self.filepath), + ]) + + def _di_file(self): + return self.module.add_metadata([ + self._const_int(self.DI_File), + self._filepair(), + ]) + + def _di_compile_unit(self): + filepair = self._filepair() + empty = self.module.add_metadata([self._const_int(0)]) + sp_metadata = self.module.add_metadata(self.subprograms) + return self.module.add_metadata([ + self._const_int(self.DI_Compile_unit), # tag + filepair, # source directory and file pair + self._const_int(self.DW_LANG_Python), # language + 'Numba', # producer + self._const_bool(True), # optimized + "", # flags?? + self._const_int(0), # runtime version + empty, # enums types + empty, # retained types + self.module.add_metadata(self.subprograms), # subprograms + empty, # global variables + empty, # imported entities + "", # split debug filename + ]) + + def _di_subroutine_type(self): + types = self.module.add_metadata([None]) + return self.module.add_metadata([ + self._const_int(self.DI_Subroutine_type), # tag + self._const_int(0), + None, + "", + self._const_int(0), # line of definition + self._const_int(0, 64), # size in bits + self._const_int(0, 64), # offset in bits + self._const_int(0, 64), # align in bits + self._const_int(0), # flags + None, + types, + self._const_int(0), + None, + None, + None, + ]) + + def _di_subprogram(self, name, linkagename, line): + function_ptr = self.module.get_global(linkagename) + subroutine_type = self._di_subroutine_type() + funcvars = self.module.add_metadata([self._const_int(0)]) + context = self._di_file() + return self.module.add_metadata([ + self._const_int(self.DI_Subprogram), # tag + self._filepair(), # source dir & file + context, # context descriptor + name, # name + name, # display name + linkagename, # linkage name + self._const_int(line), # line + subroutine_type, # type descriptor + self._const_bool(False), # is local + self._const_bool(True), # is 
definition + self._const_int(0), # virtuality + self._const_int(0), # virtual function index + None, # vtable base type + self._const_int(0), # flags + self._const_bool(True), # is optimized + function_ptr, # pointer to function + None, # function template parameters + None, # function declaration descriptor + funcvars, # function variables + self._const_int(line) # scope line + ]) + + def _di_location(self, line): + return self.module.add_metadata([ + self._const_int(line), # line + self._const_int(0), # column + self.subprograms[-1], # scope + None, # original scope + ]) + + def initialize(self): + pass + + def finalize(self): + # We create the compile unit at this point because subprograms is + # populated and can be referred to by the compile unit. + self.dicompileunit = self._di_compile_unit() + super().finalize() diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/decorators.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/decorators.py new file mode 100644 index 0000000000000000000000000000000000000000..2dfc4633f8c4b7b36416f31a0fbce8507e3ab2a4 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/decorators.py @@ -0,0 +1,309 @@ +""" +Define @jit and related decorators. +""" + + +import sys +import warnings +import inspect +import logging + +from numba.core.errors import DeprecationError, NumbaDeprecationWarning +from numba.stencils.stencil import stencil +from numba.core import config, extending, sigutils, registry + +_logger = logging.getLogger(__name__) + + +# ----------------------------------------------------------------------------- +# Decorators + +_msg_deprecated_signature_arg = ("Deprecated keyword argument `{0}`. " + "Signatures should be passed as the first " + "positional argument.") + + +def jit(signature_or_function=None, locals={}, cache=False, + pipeline_class=None, boundscheck=None, **options): + """ + This decorator is used to compile a Python function into native code. 
+ + Args + ----- + signature_or_function: + The (optional) signature or list of signatures to be compiled. + If not passed, required signatures will be compiled when the + decorated function is called, depending on the argument values. + As a convenience, you can directly pass the function to be compiled + instead. + + locals: dict + Mapping of local variable names to Numba types. Used to override the + types deduced by Numba's type inference engine. + + pipeline_class: type numba.compiler.CompilerBase + The compiler pipeline type for customizing the compilation stages. + + options: + For a cpu target, valid options are: + nopython: bool + Set to True to disable the use of PyObjects and Python API + calls. The default behavior is to allow the use of PyObjects + and Python API. Default value is False. + + forceobj: bool + Set to True to force the use of PyObjects for every value. + Default value is False. + + looplift: bool + Set to True to enable jitting loops in nopython mode while + leaving surrounding code in object mode. This allows functions + to allocate NumPy arrays and use Python objects, while the + tight loops in the function can still be compiled in nopython + mode. Any arrays that the tight loop uses should be created + before the loop is entered. Default value is True. + + error_model: str + The error-model affects divide-by-zero behavior. + Valid values are 'python' and 'numpy'. The 'python' model + raises exception. The 'numpy' model sets the result to + *+/-inf* or *nan*. Default value is 'python'. + + inline: str or callable + The inline option will determine whether a function is inlined + at into its caller if called. String options are 'never' + (default) which will never inline, and 'always', which will + always inline. If a callable is provided it will be called with + the call expression node that is requesting inlining, the + caller's IR and callee's IR as arguments, it is expected to + return Truthy as to whether to inline. 
+ NOTE: This inlining is performed at the Numba IR level and is in + no way related to LLVM inlining. + + boundscheck: bool or None + Set to True to enable bounds checking for array indices. Out + of bounds accesses will raise IndexError. The default is to + not do bounds checking. If False, bounds checking is disabled, + out of bounds accesses can produce garbage results or segfaults. + However, enabling bounds checking will slow down typical + functions, so it is recommended to only use this flag for + debugging. You can also set the NUMBA_BOUNDSCHECK environment + variable to 0 or 1 to globally override this flag. The default + value is None, which under normal execution equates to False, + but if debug is set to True then bounds checking will be + enabled. + + Returns + -------- + A callable usable as a compiled function. Actual compiling will be + done lazily if no explicit signatures are passed. + + Examples + -------- + The function can be used in the following ways: + + 1) jit(signatures, **targetoptions) -> jit(function) + + Equivalent to: + + d = dispatcher(function, targetoptions) + for signature in signatures: + d.compile(signature) + + Create a dispatcher object for a python function. Then, compile + the function with the given signature(s). + + Example: + + @jit("int32(int32, int32)") + def foo(x, y): + return x + y + + @jit(["int32(int32, int32)", "float32(float32, float32)"]) + def bar(x, y): + return x + y + + 2) jit(function, **targetoptions) -> dispatcher + + Create a dispatcher function object that specializes at call site. 
+ + Examples: + + @jit + def foo(x, y): + return x + y + + @jit(nopython=True) + def bar(x, y): + return x + y + + """ + if 'argtypes' in options: + raise DeprecationError(_msg_deprecated_signature_arg.format('argtypes')) + if 'restype' in options: + raise DeprecationError(_msg_deprecated_signature_arg.format('restype')) + if options.get('nopython', False) and options.get('forceobj', False): + raise ValueError("Only one of 'nopython' or 'forceobj' can be True.") + + if "_target" in options: + # Set the "target_backend" option if "_target" is defined. + options['target_backend'] = options['_target'] + target = options.pop('_target', 'cpu') + + options['boundscheck'] = boundscheck + + # Handle signature + if signature_or_function is None: + # No signature, no function + pyfunc = None + sigs = None + elif isinstance(signature_or_function, list): + # A list of signatures is passed + pyfunc = None + sigs = signature_or_function + elif sigutils.is_signature(signature_or_function): + # A single signature is passed + pyfunc = None + sigs = [signature_or_function] + else: + # A function is passed + pyfunc = signature_or_function + sigs = None + + dispatcher_args = {} + if pipeline_class is not None: + dispatcher_args['pipeline_class'] = pipeline_class + wrapper = _jit(sigs, locals=locals, target=target, cache=cache, + targetoptions=options, **dispatcher_args) + if pyfunc is not None: + return wrapper(pyfunc) + else: + return wrapper + + +def _jit(sigs, locals, target, cache, targetoptions, **dispatcher_args): + + from numba.core.target_extension import resolve_dispatcher_from_str + dispatcher = resolve_dispatcher_from_str(target) + + def wrapper(func): + if extending.is_jitted(func): + raise TypeError( + "A jit decorator was called on an already jitted function " + f"{func}. If trying to access the original python " + f"function, use the {func}.py_func attribute." 
+ ) + + if not inspect.isfunction(func): + raise TypeError( + "The decorated object is not a function (got type " + f"{type(func)})." + ) + + if config.ENABLE_CUDASIM and target == 'cuda': + from numba import cuda + return cuda.jit(func) + if config.DISABLE_JIT and not target == 'npyufunc': + return func + disp = dispatcher(py_func=func, locals=locals, + targetoptions=targetoptions, + **dispatcher_args) + if cache: + disp.enable_caching() + if sigs is not None: + # Register the Dispatcher to the type inference mechanism, + # even though the decorator hasn't returned yet. + from numba.core import typeinfer + with typeinfer.register_dispatcher(disp): + for sig in sigs: + disp.compile(sig) + disp.disable_compile() + return disp + + return wrapper + + +def generated_jit(function=None, cache=False, + pipeline_class=None, **options): + """ + This decorator allows flexible type-based compilation + of a jitted function. It works as `@jit`, except that the decorated + function is called at compile-time with the *types* of the arguments + and should return an implementation function for those types. + """ + dispatcher_args = {} + if pipeline_class is not None: + dispatcher_args['pipeline_class'] = pipeline_class + wrapper = _jit(sigs=None, locals={}, target='cpu', cache=cache, + targetoptions=options, impl_kind='generated', + **dispatcher_args) + if function is not None: + return wrapper(function) + else: + return wrapper + + +def njit(*args, **kws): + """ + Equivalent to jit(nopython=True) + + See documentation for jit function/decorator for full description. 
+ """ + if 'nopython' in kws: + warnings.warn('nopython is set for njit and is ignored', RuntimeWarning) + if 'forceobj' in kws: + warnings.warn('forceobj is set for njit and is ignored', RuntimeWarning) + del kws['forceobj'] + kws.update({'nopython': True}) + return jit(*args, **kws) + + +def cfunc(sig, locals={}, cache=False, pipeline_class=None, **options): + """ + This decorator is used to compile a Python function into a C callback + usable with foreign C libraries. + + Usage:: + @cfunc("float64(float64, float64)", nopython=True, cache=True) + def add(a, b): + return a + b + + """ + sig = sigutils.normalize_signature(sig) + + def wrapper(func): + from numba.core.ccallback import CFunc + additional_args = {} + if pipeline_class is not None: + additional_args['pipeline_class'] = pipeline_class + res = CFunc(func, sig, locals=locals, options=options, **additional_args) + if cache: + res.enable_caching() + res.compile() + return res + + return wrapper + + +def jit_module(**kwargs): + """ Automatically ``jit``-wraps functions defined in a Python module + + Note that ``jit_module`` should only be called at the end of the module to + be jitted. In addition, only functions which are defined in the module + ``jit_module`` is called from are considered for automatic jit-wrapping. + See the Numba documentation for more information about what can/cannot be + jitted. + + :param kwargs: Keyword arguments to pass to ``jit`` such as ``nopython`` + or ``error_model``. 
+ + """ + # Get the module jit_module is being called from + frame = inspect.stack()[1] + module = inspect.getmodule(frame[0]) + # Replace functions in module with jit-wrapped versions + for name, obj in module.__dict__.items(): + if inspect.isfunction(obj) and inspect.getmodule(obj) == module: + _logger.debug("Auto decorating function {} from module {} with jit " + "and options: {}".format(obj, module.__name__, kwargs)) + module.__dict__[name] = jit(obj, **kwargs) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/descriptors.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/descriptors.py new file mode 100644 index 0000000000000000000000000000000000000000..9c0c367e9c99dc2799cfdb738e62cbd85d93f8eb --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/descriptors.py @@ -0,0 +1,21 @@ +""" +Target Descriptors +""" + +from abc import ABCMeta, abstractmethod + + +class TargetDescriptor(metaclass=ABCMeta): + + def __init__(self, target_name): + self._target_name = target_name + + @property + @abstractmethod + def typing_context(self): + ... + + @property + @abstractmethod + def target_context(self): + ... 
diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/dispatcher.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/dispatcher.py new file mode 100644 index 0000000000000000000000000000000000000000..69414a7c57daaa0178995a1dce686cf5b36006b5 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/dispatcher.py @@ -0,0 +1,1322 @@ +# -*- coding: utf-8 -*- + + +import collections +import functools +import sys +import types as pytypes +import uuid +import weakref +from contextlib import ExitStack + +from numba import _dispatcher +from numba.core import ( + utils, types, errors, typing, serialize, config, compiler, sigutils +) +from numba.core.compiler_lock import global_compiler_lock +from numba.core.typeconv.rules import default_type_manager +from numba.core.typing.templates import fold_arguments +from numba.core.typing.typeof import Purpose, typeof +from numba.core.bytecode import get_code_object +from numba.core.caching import NullCache, FunctionCache +from numba.core import entrypoints +from numba.core.retarget import BaseRetarget +import numba.core.event as ev + + +class _RetargetStack(utils.ThreadLocalStack, stack_name="retarget"): + def push(self, state): + super().push(state) + _dispatcher.set_use_tls_target_stack(len(self) > 0) + + def pop(self): + super().pop() + _dispatcher.set_use_tls_target_stack(len(self) > 0) + + +class TargetConfigurationStack: + """The target configuration stack. + + Uses the BORG pattern and stores states in threadlocal storage. + + WARNING: features associated with this class are experimental. The API + may change without notice. + """ + + def __init__(self): + self._stack = _RetargetStack() + + def get(self): + """Get the current target from the top of the stack. + + May raise IndexError if the stack is empty. Users should check the size + of the stack beforehand. 
+ """ + return self._stack.top() + + def __len__(self): + """Size of the stack + """ + return len(self._stack) + + @classmethod + def switch_target(cls, retarget: BaseRetarget): + """Returns a contextmanager that pushes a new retarget handler, + an instance of `numba.core.retarget.BaseRetarget`, onto the + target-config stack for the duration of the context-manager. + """ + return cls()._stack.enter(retarget) + + +class OmittedArg(object): + """ + A placeholder for omitted arguments with a default value. + """ + + def __init__(self, value): + self.value = value + + def __repr__(self): + return "omitted arg(%r)" % (self.value,) + + @property + def _numba_type_(self): + return types.Omitted(self.value) + + +class _FunctionCompiler(object): + def __init__(self, py_func, targetdescr, targetoptions, locals, + pipeline_class): + self.py_func = py_func + self.targetdescr = targetdescr + self.targetoptions = targetoptions + self.locals = locals + self.pysig = utils.pysignature(self.py_func) + self.pipeline_class = pipeline_class + # Remember key=(args, return_type) combinations that will fail + # compilation to avoid compilation attempt on them. The values are + # the exceptions. + self._failed_cache = {} + + def fold_argument_types(self, args, kws): + """ + Given positional and named argument types, fold keyword arguments + and resolve defaults by inserting types.Omitted() instances. + + A (pysig, argument types) tuple is returned. + """ + def normal_handler(index, param, value): + return value + + def default_handler(index, param, default): + return types.Omitted(default) + + def stararg_handler(index, param, values): + return types.StarArgTuple(values) + # For now, we take argument values from the @jit function, even + # in the case of generated jit. 
+ args = fold_arguments(self.pysig, args, kws, + normal_handler, + default_handler, + stararg_handler) + return self.pysig, args + + def compile(self, args, return_type): + status, retval = self._compile_cached(args, return_type) + if status: + return retval + else: + raise retval + + def _compile_cached(self, args, return_type): + key = tuple(args), return_type + try: + return False, self._failed_cache[key] + except KeyError: + pass + + try: + retval = self._compile_core(args, return_type) + except errors.TypingError as e: + self._failed_cache[key] = e + return False, e + else: + return True, retval + + def _compile_core(self, args, return_type): + flags = compiler.Flags() + self.targetdescr.options.parse_as_flags(flags, self.targetoptions) + flags = self._customize_flags(flags) + + impl = self._get_implementation(args, {}) + cres = compiler.compile_extra(self.targetdescr.typing_context, + self.targetdescr.target_context, + impl, + args=args, return_type=return_type, + flags=flags, locals=self.locals, + pipeline_class=self.pipeline_class) + # Check typing error if object mode is used + if cres.typing_error is not None and not flags.enable_pyobject: + raise cres.typing_error + return cres + + def get_globals_for_reduction(self): + return serialize._get_function_globals_for_reduction(self.py_func) + + def _get_implementation(self, args, kws): + return self.py_func + + def _customize_flags(self, flags): + return flags + + +class _GeneratedFunctionCompiler(_FunctionCompiler): + + def __init__(self, py_func, targetdescr, targetoptions, locals, + pipeline_class): + super(_GeneratedFunctionCompiler, self).__init__( + py_func, targetdescr, targetoptions, locals, pipeline_class) + self.impls = set() + + def get_globals_for_reduction(self): + # This will recursively get the globals used by any nested + # implementation function. 
+ return serialize._get_function_globals_for_reduction(self.py_func) + + def _get_implementation(self, args, kws): + impl = self.py_func(*args, **kws) + # Check the generating function and implementation signatures are + # compatible, otherwise compiling would fail later. + pysig = utils.pysignature(self.py_func) + implsig = utils.pysignature(impl) + ok = len(pysig.parameters) == len(implsig.parameters) + if ok: + for pyparam, implparam in zip(pysig.parameters.values(), + implsig.parameters.values()): + # We allow the implementation to omit default values, but + # if it mentions them, they should have the same value... + if (pyparam.name != implparam.name or + pyparam.kind != implparam.kind or + (implparam.default is not implparam.empty and + implparam.default != pyparam.default)): + ok = False + if not ok: + raise TypeError("generated implementation %s should be compatible " + "with signature '%s', but has signature '%s'" + % (impl, pysig, implsig)) + self.impls.add(impl) + return impl + + +_CompileStats = collections.namedtuple( + '_CompileStats', ('cache_path', 'cache_hits', 'cache_misses')) + + +class CompilingCounter(object): + """ + A simple counter that increment in __enter__ and decrement in __exit__. + """ + + def __init__(self): + self.counter = 0 + + def __enter__(self): + assert self.counter >= 0 + self.counter += 1 + + def __exit__(self, *args, **kwargs): + self.counter -= 1 + assert self.counter >= 0 + + def __bool__(self): + return self.counter > 0 + + __nonzero__ = __bool__ + + +class _DispatcherBase(_dispatcher.Dispatcher): + """ + Common base class for dispatcher Implementations. 
+ """ + + __numba__ = "py_func" + + def __init__(self, arg_count, py_func, pysig, can_fallback, + exact_match_required): + self._tm = default_type_manager + + # A mapping of signatures to compile results + self.overloads = collections.OrderedDict() + + self.py_func = py_func + # other parts of Numba assume the old Python 2 name for code object + self.func_code = get_code_object(py_func) + # but newer python uses a different name + self.__code__ = self.func_code + # a place to keep an active reference to the types of the active call + self._types_active_call = [] + # Default argument values match the py_func + self.__defaults__ = py_func.__defaults__ + + argnames = tuple(pysig.parameters) + default_values = self.py_func.__defaults__ or () + defargs = tuple(OmittedArg(val) for val in default_values) + try: + lastarg = list(pysig.parameters.values())[-1] + except IndexError: + has_stararg = False + else: + has_stararg = lastarg.kind == lastarg.VAR_POSITIONAL + _dispatcher.Dispatcher.__init__(self, self._tm.get_pointer(), + arg_count, self._fold_args, + argnames, defargs, + can_fallback, + has_stararg, + exact_match_required) + + self.doc = py_func.__doc__ + self._compiling_counter = CompilingCounter() + weakref.finalize(self, self._make_finalizer()) + + def _compilation_chain_init_hook(self): + """ + This will be called ahead of any part of compilation taking place (this + even includes being ahead of working out the types of the arguments). + This permits activities such as initialising extension entry points so + that the compiler knows about additional externally defined types etc + before it does anything. + """ + entrypoints.init_all() + + def _reset_overloads(self): + self._clear() + self.overloads.clear() + + def _make_finalizer(self): + """ + Return a finalizer function that will release references to + related compiled functions. 
+ """ + overloads = self.overloads + targetctx = self.targetctx + + # Early-bind utils.shutting_down() into the function's local namespace + # (see issue #689) + def finalizer(shutting_down=utils.shutting_down): + # The finalizer may crash at shutdown, skip it (resources + # will be cleared by the process exiting, anyway). + if shutting_down(): + return + # This function must *not* hold any reference to self: + # we take care to bind the necessary objects in the closure. + for cres in overloads.values(): + try: + targetctx.remove_user_function(cres.entry_point) + except KeyError: + pass + + return finalizer + + @property + def signatures(self): + """ + Returns a list of compiled function signatures. + """ + return list(self.overloads) + + @property + def nopython_signatures(self): + return [cres.signature for cres in self.overloads.values() + if not cres.objectmode] + + def disable_compile(self, val=True): + """Disable the compilation of new signatures at call time. + """ + # If disabling compilation then there must be at least one signature + assert (not val) or len(self.signatures) > 0 + self._can_compile = not val + + def add_overload(self, cres): + args = tuple(cres.signature.args) + sig = [a._code for a in args] + self._insert(sig, cres.entry_point, cres.objectmode) + self.overloads[args] = cres + + def fold_argument_types(self, args, kws): + return self._compiler.fold_argument_types(args, kws) + + def get_call_template(self, args, kws): + """ + Get a typing.ConcreteTemplate for this dispatcher and the given + *args* and *kws* types. This allows to resolve the return type. + + A (template, pysig, args, kws) tuple is returned. + """ + # XXX how about a dispatcher template class automating the + # following? 
+ + # Fold keyword arguments and resolve default values + pysig, args = self._compiler.fold_argument_types(args, kws) + kws = {} + # Ensure an overload is available + if self._can_compile: + self.compile(tuple(args)) + + # Create function type for typing + func_name = self.py_func.__name__ + name = "CallTemplate({0})".format(func_name) + # The `key` isn't really used except for diagnosis here, + # so avoid keeping a reference to `cfunc`. + call_template = typing.make_concrete_template( + name, key=func_name, signatures=self.nopython_signatures) + return call_template, pysig, args, kws + + def get_overload(self, sig): + """ + Return the compiled function for the given signature. + """ + args, return_type = sigutils.normalize_signature(sig) + return self.overloads[tuple(args)].entry_point + + @property + def is_compiling(self): + """ + Whether a specialization is currently being compiled. + """ + return self._compiling_counter + + def _compile_for_args(self, *args, **kws): + """ + For internal use. Compile a specialized version of the function + for the given *args* and *kws*, and return the resulting callable. + """ + assert not kws + # call any initialisation required for the compilation chain (e.g. + # extension point registration). + self._compilation_chain_init_hook() + + def error_rewrite(e, issue_type): + """ + Rewrite and raise Exception `e` with help supplied based on the + specified issue_type. 
+ """ + if config.SHOW_HELP: + help_msg = errors.error_extras[issue_type] + e.patch_message('\n'.join((str(e).rstrip(), help_msg))) + if config.FULL_TRACEBACKS: + raise e + else: + raise e.with_traceback(None) + + argtypes = [] + for a in args: + if isinstance(a, OmittedArg): + argtypes.append(types.Omitted(a.value)) + else: + argtypes.append(self.typeof_pyval(a)) + + return_val = None + try: + return_val = self.compile(tuple(argtypes)) + except errors.ForceLiteralArg as e: + # Received request for compiler re-entry with the list of arguments + # indicated by e.requested_args. + # First, check if any of these args are already Literal-ized + already_lit_pos = [i for i in e.requested_args + if isinstance(args[i], types.Literal)] + if already_lit_pos: + # Abort compilation if any argument is already a Literal. + # Letting this continue will cause infinite compilation loop. + m = ("Repeated literal typing request.\n" + "{}.\n" + "This is likely caused by an error in typing. " + "Please see nested and suppressed exceptions.") + info = ', '.join('Arg #{} is {}'.format(i, args[i]) + for i in sorted(already_lit_pos)) + raise errors.CompilerError(m.format(info)) + # Convert requested arguments into a Literal. 
+ args = [(types.literal + if i in e.requested_args + else lambda x: x)(args[i]) + for i, v in enumerate(args)] + # Re-enter compilation with the Literal-ized arguments + return_val = self._compile_for_args(*args) + + except errors.TypingError as e: + # Intercept typing error that may be due to an argument + # that failed inferencing as a Numba type + failed_args = [] + for i, arg in enumerate(args): + val = arg.value if isinstance(arg, OmittedArg) else arg + try: + tp = typeof(val, Purpose.argument) + except ValueError as typeof_exc: + failed_args.append((i, str(typeof_exc))) + else: + if tp is None: + failed_args.append( + (i, f"cannot determine Numba type of value {val}")) + if failed_args: + # Patch error message to ease debugging + args_str = "\n".join( + f"- argument {i}: {err}" for i, err in failed_args + ) + msg = (f"{str(e).rstrip()} \n\nThis error may have been caused " + f"by the following argument(s):\n{args_str}\n") + e.patch_message(msg) + + error_rewrite(e, 'typing') + except errors.UnsupportedError as e: + # Something unsupported is present in the user code, add help info + error_rewrite(e, 'unsupported_error') + except (errors.NotDefinedError, errors.RedefinedError, + errors.VerificationError) as e: + # These errors are probably from an issue with either the code + # supplied being syntactically or otherwise invalid + error_rewrite(e, 'interpreter') + except errors.ConstantInferenceError as e: + # this is from trying to infer something as constant when it isn't + # or isn't supported as a constant + error_rewrite(e, 'constant_inference') + except Exception as e: + if config.SHOW_HELP: + if hasattr(e, 'patch_message'): + help_msg = errors.error_extras['reportable'] + e.patch_message('\n'.join((str(e).rstrip(), help_msg))) + # ignore the FULL_TRACEBACKS config, this needs reporting! 
+ raise e + finally: + self._types_active_call = [] + return return_val + + def inspect_llvm(self, signature=None): + """Get the LLVM intermediate representation generated by compilation. + + Parameters + ---------- + signature : tuple of numba types, optional + Specify a signature for which to obtain the LLVM IR. If None, the + IR is returned for all available signatures. + + Returns + ------- + llvm : dict[signature, str] or str + Either the LLVM IR string for the specified signature, or, if no + signature was given, a dictionary mapping signatures to LLVM IR + strings. + """ + if signature is not None: + lib = self.overloads[signature].library + return lib.get_llvm_str() + + return dict((sig, self.inspect_llvm(sig)) for sig in self.signatures) + + def inspect_asm(self, signature=None): + """Get the generated assembly code. + + Parameters + ---------- + signature : tuple of numba types, optional + Specify a signature for which to obtain the assembly code. If + None, the assembly code is returned for all available signatures. + + Returns + ------- + asm : dict[signature, str] or str + Either the assembly code for the specified signature, or, if no + signature was given, a dictionary mapping signatures to assembly + code. + """ + if signature is not None: + lib = self.overloads[signature].library + return lib.get_asm_str() + + return dict((sig, self.inspect_asm(sig)) for sig in self.signatures) + + def inspect_types(self, file=None, signature=None, + pretty=False, style='default', **kwargs): + """Print/return Numba intermediate representation (IR)-annotated code. + + Parameters + ---------- + file : file-like object, optional + File to which to print. Defaults to sys.stdout if None. Must be + None if ``pretty=True``. + signature : tuple of numba types, optional + Print/return the intermediate representation for only the given + signature. If None, the IR is printed for all available signatures. 
def inspect_cfg(self, signature=None, show_wrapper=None, **kwargs):
    """
    For inspecting the CFG of the function.

    By default the CFG of the user function is shown. The *show_wrapper*
    option can be set to "python" or "cfunc" to show the python wrapper
    function or the *cfunc* wrapper function, respectively.

    If *signature* is None, the CFG is produced for every signature in
    ``self.signatures`` and a ``{signature: cfg}`` dict is returned. The
    keyword arguments below are applied to each of those CFGs.

    Parameters accepted in kwargs
    -----------------------------
    filename : string, optional
        the name of the output file, if given this will write the output to
        filename
    view : bool, optional
        whether to immediately view the optional output file
    highlight : bool, set, dict, optional
        what, if anything, to highlight, options are:
        { incref : bool, # highlight NRT_incref calls
          decref : bool, # highlight NRT_decref calls
          returns : bool, # highlight exits which are normal returns
          raises : bool, # highlight exits which are from raise
          meminfo : bool, # highlight calls to NRT*meminfo
          branches : bool, # highlight true/false branches
        }
        Default is True which sets all of the above to True. Supplying a set
        of strings is also accepted, these are interpreted as key:True with
        respect to the above dictionary. e.g. {'incref', 'decref'} would
        switch on highlighting on increfs and decrefs.
    interleave: bool, set, dict, optional
        what, if anything, to interleave in the LLVM IR, options are:
        { python: bool # interleave python source code with the LLVM IR
          lineinfo: bool # interleave line information markers with the LLVM
                         # IR
        }
        Default is True which sets all of the above to True. Supplying a set
        of strings is also accepted, these are interpreted as key:True with
        respect to the above dictionary. e.g. {'python',} would
        switch on interleaving of python source code in the LLVM IR.
    strip_ir : bool, optional
        Default is False. If set to True all LLVM IR that is superfluous to
        that requested in kwarg `highlight` will be removed.
    show_key : bool, optional
        Default is True. Create a "key" for the highlighting in the rendered
        CFG.
    fontsize : int, optional
        Default is 8. Set the fontsize in the output to this value.

    Returns
    -------
    The value of ``library.get_function_cfg(...)`` for the requested
    signature, or a dict of those values keyed by signature when
    *signature* is None.
    """
    if signature is None:
        # Forward the rendering options too; they used to be dropped here,
        # so e.g. ``inspect_cfg(fontsize=12)`` was silently ignored.
        return {
            sig: self.inspect_cfg(sig, show_wrapper=show_wrapper, **kwargs)
            for sig in self.signatures
        }

    cres = self.overloads[signature]
    fndesc = cres.fndesc
    if show_wrapper == 'python':
        fname = fndesc.llvm_cpython_wrapper_name
    elif show_wrapper == 'cfunc':
        fname = fndesc.llvm_cfunc_wrapper_name
    else:
        # Any other value (including None) selects the user function itself.
        fname = fndesc.mangled_name
    return cres.library.get_function_cfg(fname, py_func=self.py_func,
                                         **kwargs)
def _callback_add_timer(self, duration, cres, lock_name):
    """
    Record *duration* in ``cres.metadata["timers"]`` under *lock_name*.

    Nothing is recorded when ``cres.metadata`` is None. The ``"timers"``
    dict is created on first use.

    Raises
    ------
    AssertionError
        If a timer named *lock_name* has already been recorded for *cres*.
    """
    md = cres.metadata
    # md can be None when code is loaded from cache
    if md is None:
        return

    timers = md.setdefault("timers", {})
    if lock_name in timers:
        # Only write if the metadata does not exist
        msg = f"'{lock_name}' metadata is already defined."
        raise AssertionError(msg)
    timers[lock_name] = duration
def __get__(self, obj, objtype=None):
    """Descriptor hook so a JIT function can be bound as a method.

    Accessed through the class (``obj is None``) the dispatcher itself is
    returned; accessed through an instance, a bound method wrapping the
    dispatcher and that instance is returned.
    """
    return self if obj is None else pytypes.MethodType(self, obj)
+ + NOTE: part of ReduceMixin protocol + """ + if self._can_compile: + sigs = [] + else: + sigs = [cr.signature for cr in self.overloads.values()] + + return dict( + uuid=str(self._uuid), + py_func=self.py_func, + locals=self.locals, + targetoptions=self.targetoptions, + impl_kind=self._impl_kind, + can_compile=self._can_compile, + sigs=sigs, + ) + + @classmethod + def _rebuild(cls, uuid, py_func, locals, targetoptions, impl_kind, + can_compile, sigs): + """ + Rebuild an Dispatcher instance after it was __reduce__'d. + + NOTE: part of ReduceMixin protocol + """ + try: + return cls._memo[uuid] + except KeyError: + pass + self = cls(py_func, locals, targetoptions, impl_kind) + # Make sure this deserialization will be merged with subsequent ones + self._set_uuid(uuid) + for sig in sigs: + self.compile(sig) + self._can_compile = can_compile + return self + + def compile(self, sig): + disp = self._get_dispatcher_for_current_target() + if disp is not self: + return disp.compile(sig) + + with ExitStack() as scope: + cres = None + + def cb_compiler(dur): + if cres is not None: + self._callback_add_compiler_timer(dur, cres) + + def cb_llvm(dur): + if cres is not None: + self._callback_add_llvm_timer(dur, cres) + + scope.enter_context(ev.install_timer("numba:compiler_lock", + cb_compiler)) + scope.enter_context(ev.install_timer("numba:llvm_lock", cb_llvm)) + scope.enter_context(global_compiler_lock) + + if not self._can_compile: + raise RuntimeError("compilation disabled") + # Use counter to track recursion compilation depth + with self._compiling_counter: + args, return_type = sigutils.normalize_signature(sig) + # Don't recompile if signature already exists + existing = self.overloads.get(tuple(args)) + if existing is not None: + return existing.entry_point + # Try to load from disk cache + cres = self._cache.load_overload(sig, self.targetctx) + if cres is not None: + self._cache_hits[sig] += 1 + # XXX fold this in add_overload()? 
def parallel_diagnostics(self, signature=None, level=1):
    """
    Print parallel diagnostic information for the given signature. If no
    signature is present it is printed for all known signatures. level is
    used to adjust the verbosity, level=1 (default) is minimal verbosity,
    and 2, 3, and 4 provide increasing levels of verbosity.

    Raises
    ------
    ValueError
        If an inspected overload has no ``'parfor_diagnostics'`` entry in
        its metadata.
    """
    def dump(sig):
        ol = self.overloads[sig]
        pfdiag = ol.metadata.get('parfor_diagnostics', None)
        if pfdiag is None:
            msg = "No parfors diagnostic available, is 'parallel=True' set?"
            raise ValueError(msg)
        pfdiag.dump(level)

    if signature is not None:
        dump(signature)
        return

    # Plain loop: dump() is called for its side effect only, so there is no
    # reason to collect its None results into a list.
    for sig in self.signatures:
        dump(sig)
+ """ + disp = self._get_retarget_dispatcher() + # Call the new dispatcher + return disp(*args, **kwargs) + + +class LiftedCode(serialize.ReduceMixin, _MemoMixin, _DispatcherBase): + """ + Implementation of the hidden dispatcher objects used for lifted code + (a lifted loop is really compiled as a separate function). + """ + _fold_args = False + can_cache = False + + def __init__(self, func_ir, typingctx, targetctx, flags, locals): + self.func_ir = func_ir + self.lifted_from = None + + self.typingctx = typingctx + self.targetctx = targetctx + self.flags = flags + self.locals = locals + + _DispatcherBase.__init__(self, self.func_ir.arg_count, + self.func_ir.func_id.func, + self.func_ir.func_id.pysig, + can_fallback=True, + exact_match_required=False) + + def _reduce_states(self): + """ + Reduce the instance for pickling. This will serialize + the original function as well the compilation options and + compiled signatures, but not the compiled code itself. + + NOTE: part of ReduceMixin protocol + """ + return dict( + uuid=self._uuid, func_ir=self.func_ir, flags=self.flags, + locals=self.locals, extras=self._reduce_extras(), + ) + + def _reduce_extras(self): + """ + NOTE: sub-class can override to add extra states + """ + return {} + + @classmethod + def _rebuild(cls, uuid, func_ir, flags, locals, extras): + """ + Rebuild an Dispatcher instance after it was __reduce__'d. + + NOTE: part of ReduceMixin protocol + """ + try: + return cls._memo[uuid] + except KeyError: + pass + + # NOTE: We are assuming that this is must be cpu_target, which is true + # for now. + # TODO: refactor this to not assume on `cpu_target` + + from numba.core import registry + typingctx = registry.cpu_target.typing_context + targetctx = registry.cpu_target.target_context + + self = cls(func_ir, typingctx, targetctx, flags, locals, **extras) + self._set_uuid(uuid) + return self + + def get_source_location(self): + """Return the starting line number of the loop. 
+ """ + return self.func_ir.loc.line + + def _pre_compile(self, args, return_type, flags): + """Pre-compile actions + """ + pass + + def compile(self, sig): + with ExitStack() as scope: + cres = None + + def cb_compiler(dur): + if cres is not None: + self._callback_add_compiler_timer(dur, cres) + + def cb_llvm(dur): + if cres is not None: + self._callback_add_llvm_timer(dur, cres) + + scope.enter_context(ev.install_timer("numba:compiler_lock", + cb_compiler)) + scope.enter_context(ev.install_timer("numba:llvm_lock", cb_llvm)) + scope.enter_context(global_compiler_lock) + + # Use counter to track recursion compilation depth + with self._compiling_counter: + # XXX this is mostly duplicated from Dispatcher. + flags = self.flags + args, return_type = sigutils.normalize_signature(sig) + + # Don't recompile if signature already exists + # (e.g. if another thread compiled it before we got the lock) + existing = self.overloads.get(tuple(args)) + if existing is not None: + return existing.entry_point + + self._pre_compile(args, return_type, flags) + + # Clone IR to avoid (some of the) mutation in the rewrite pass + cloned_func_ir = self.func_ir.copy() + + ev_details = dict( + dispatcher=self, + args=args, + return_type=return_type, + ) + with ev.trigger_event("numba:compile", data=ev_details): + cres = compiler.compile_ir(typingctx=self.typingctx, + targetctx=self.targetctx, + func_ir=cloned_func_ir, + args=args, + return_type=return_type, + flags=flags, locals=self.locals, + lifted=(), + lifted_from=self.lifted_from, + is_lifted_loop=True,) + + # Check typing error if object mode is used + if (cres.typing_error is not None and + not flags.enable_pyobject): + raise cres.typing_error + self.add_overload(cres) + return cres.entry_point + + def _get_dispatcher_for_current_target(self): + # Lifted code does not honor the target switch currently. + # No work has been done to check if this can be allowed. 
+ return self + + +class LiftedLoop(LiftedCode): + def _pre_compile(self, args, return_type, flags): + assert not flags.enable_looplift, "Enable looplift flags is on" + + +class LiftedWith(LiftedCode): + + can_cache = True + + def _reduce_extras(self): + return dict(output_types=self.output_types) + + @property + def _numba_type_(self): + return types.Dispatcher(self) + + def get_call_template(self, args, kws): + """ + Get a typing.ConcreteTemplate for this dispatcher and the given + *args* and *kws* types. This enables the resolving of the return type. + + A (template, pysig, args, kws) tuple is returned. + """ + # Ensure an overload is available + if self._can_compile: + self.compile(tuple(args)) + + pysig = None + # Create function type for typing + func_name = self.py_func.__name__ + name = "CallTemplate({0})".format(func_name) + # The `key` isn't really used except for diagnosis here, + # so avoid keeping a reference to `cfunc`. + call_template = typing.make_concrete_template( + name, key=func_name, signatures=self.nopython_signatures) + return call_template, pysig, args, kws + + +class ObjModeLiftedWith(LiftedWith): + def __init__(self, *args, **kwargs): + self.output_types = kwargs.pop('output_types', None) + super(LiftedWith, self).__init__(*args, **kwargs) + if not self.flags.force_pyobject: + raise ValueError("expecting `flags.force_pyobject`") + if self.output_types is None: + raise TypeError('`output_types` must be provided') + # switch off rewrites, they have no effect + self.flags.no_rewrites = True + + @property + def _numba_type_(self): + return types.ObjModeDispatcher(self) + + def get_call_template(self, args, kws): + """ + Get a typing.ConcreteTemplate for this dispatcher and the given + *args* and *kws* types. This enables the resolving of the return type. + + A (template, pysig, args, kws) tuple is returned. 
+ """ + assert not kws + self._legalize_arg_types(args) + # Coerce to object mode + args = [types.ffi_forced_object] * len(args) + + if self._can_compile: + self.compile(tuple(args)) + + signatures = [typing.signature(self.output_types, *args)] + pysig = None + func_name = self.py_func.__name__ + name = "CallTemplate({0})".format(func_name) + call_template = typing.make_concrete_template( + name, key=func_name, signatures=signatures) + + return call_template, pysig, args, kws + + def _legalize_arg_types(self, args): + for i, a in enumerate(args, start=1): + if isinstance(a, types.List): + msg = ( + 'Does not support list type inputs into ' + 'with-context for arg {}' + ) + raise errors.TypingError(msg.format(i)) + elif isinstance(a, types.Dispatcher): + msg = ( + 'Does not support function type inputs into ' + 'with-context for arg {}' + ) + raise errors.TypingError(msg.format(i)) + + @global_compiler_lock + def compile(self, sig): + args, _ = sigutils.normalize_signature(sig) + sig = (types.ffi_forced_object,) * len(args) + return super().compile(sig) + + +# Initialize typeof machinery +_dispatcher.typeof_init( + OmittedArg, + dict((str(t), t._code) for t in types.number_domain)) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/entrypoints.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/entrypoints.py new file mode 100644 index 0000000000000000000000000000000000000000..ad42782df8c6a29e3d651db29bb30fb7f047ea10 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/entrypoints.py @@ -0,0 +1,58 @@ +import logging +import warnings + +from numba.core.config import PYVERSION + +if PYVERSION < (3, 9): + try: + import importlib_metadata + except ImportError as ex: + raise ImportError( + "importlib_metadata backport is required for Python version < 3.9, " + "try:\n" + "$ conda/pip install importlib_metadata" + ) from ex +else: + from importlib import metadata as importlib_metadata + + +_already_initialized = False +logger = 
def init_all():
    """Run every ``init`` entry point registered in ``numba_extensions``.

    The call is a no-op once initialization has been started; the guard flag
    is set before any extension is loaded.
    """
    global _already_initialized
    if _already_initialized:
        return

    # Must put this here to avoid extensions re-triggering initialization
    _already_initialized = True

    def load_ep(entry_point):
        """Load and call one entry point; warn and log if that fails."""
        logger.debug('Loading extension: %s', entry_point)
        try:
            initializer = entry_point.load()
            initializer()
        except Exception as e:
            msg = (f"Numba extension module '{entry_point.module}' "
                   f"failed to load due to '{type(e).__name__}({str(e)})'.")
            warnings.warn(msg, stacklevel=3)
            logger.debug('Extension loading failed for: %s', entry_point)

    eps = importlib_metadata.entry_points()
    # Split, Python 3.10+ and importlib_metadata 3.6+ have the "selectable"
    # interface, versions prior to that do not. See "compatibility note" in:
    # https://docs.python.org/3.10/library/importlib.metadata.html#entry-points
    if hasattr(eps, 'select'):
        selected = eps.select(group="numba_extensions", name="init")
    else:
        selected = (ep for ep in eps.get("numba_extensions", ())
                    if ep.name == "init")

    for entry_point in selected:
        load_ep(entry_point)
def __repr__(self):
    """Return a debugging representation naming this environment.

    The representation embeds ``repr(self.env_name)`` so that distinct
    environments can be told apart in logs and interactive sessions.
    """
    # An empty f-string here made repr() of every environment the empty
    # string, which is useless for debugging.
    return f"<Environment {self.env_name!r} >"
class NumbaWarning(Warning):
    """
    Base category for all Numba compiler warnings.

    The original message and location are kept on ``msg`` and ``loc``. The
    text passed to ``Warning`` is the message, followed by
    ``loc.strformat()`` when a location is given, optionally run through
    the ``errmsg`` highlighter of the active terminal colour scheme.
    """

    def __init__(self, msg, loc=None, highlighting=True, ):
        self.msg = msg
        self.loc = loc

        # The colour scheme is only looked up when highlighting is wanted.
        render = termcolor().errmsg if highlighting else (lambda text: text)

        if loc:
            text = "%s\n%s\n" % (msg, loc.strformat())
        else:
            text = "%s" % (msg,)
        super(NumbaWarning, self).__init__(render(text))
+ """ + def __init__(self, msg, **kwargs): + super().__init__(f"{msg}\n{pedantic_warning_info}") + + +class NumbaIRAssumptionWarning(NumbaPedanticWarning): + """ + Warning category for reporting an IR assumption violation. + """ + + +class NumbaDebugInfoWarning(NumbaWarning): + """ + Warning category for an issue with the emission of debug information. + """ + +# These are needed in the color formatting of errors setup + + +class _ColorScheme(metaclass=abc.ABCMeta): + + @abstractmethod + def code(self, msg): + pass + + @abstractmethod + def errmsg(self, msg): + pass + + @abstractmethod + def filename(self, msg): + pass + + @abstractmethod + def indicate(self, msg): + pass + + @abstractmethod + def highlight(self, msg): + pass + + @abstractmethod + def reset(self, msg): + pass + + +class _DummyColorScheme(_ColorScheme): + + def __init__(self, theme=None): + pass + + def code(self, msg): + pass + + def errmsg(self, msg): + pass + + def filename(self, msg): + pass + + def indicate(self, msg): + pass + + def highlight(self, msg): + pass + + def reset(self, msg): + pass + + +# holds reference to the instance of the terminal color scheme in use +_termcolor_inst = None + +try: + import colorama + + # If the colorama version is < 0.3.9 it can break stdout/stderr in some + # situations, as a result if this condition is met colorama is disabled and + # the user is warned. Note that early versions did not have a __version__. + colorama_version = getattr(colorama, '__version__', '0.0.0') + + if tuple([int(x) for x in colorama_version.split('.')]) < (0, 3, 9): + msg = ("Insufficiently recent colorama version found. " + "Numba requires colorama >= 0.3.9") + # warn the user + warnings.warn(msg) + # trip the exception to disable color errors + raise ImportError + + # If Numba is running in testsuite mode then do not use error message + # coloring so CI system output is consistently readable without having + # to read between shell escape characters. 
+ if os.environ.get('NUMBA_DISABLE_ERROR_MESSAGE_HIGHLIGHTING', None): + raise ImportError # just to trigger the exception handler below + +except ImportError: + + class NOPColorScheme(_DummyColorScheme): + def __init__(self, theme=None): + if theme is not None: + raise ValueError("specifying a theme has no effect") + _DummyColorScheme.__init__(self, theme=theme) + + def code(self, msg): + return msg + + def errmsg(self, msg): + return msg + + def filename(self, msg): + return msg + + def indicate(self, msg): + return msg + + def highlight(self, msg): + return msg + + def reset(self, msg): + return msg + + def termcolor(): + global _termcolor_inst + if _termcolor_inst is None: + _termcolor_inst = NOPColorScheme() + return _termcolor_inst + +else: + + from colorama import init, reinit, deinit, Fore, Style + + class ColorShell(object): + _has_initialized = False + + def __init__(self): + init() + self._has_initialized = True + + def __enter__(self): + if self._has_initialized: + reinit() + + def __exit__(self, *exc_detail): + Style.RESET_ALL + deinit() + + class reset_terminal(object): + def __init__(self): + self._buf = bytearray(b'') + + def __enter__(self): + return self._buf + + def __exit__(self, *exc_detail): + self._buf += bytearray(Style.RESET_ALL.encode('utf-8')) + + # define some default themes, if more are added, update the envvars docs! 
+ themes = {} + + # No color added, just bold weighting + themes['no_color'] = {'code': None, + 'errmsg': None, + 'filename': None, + 'indicate': None, + 'highlight': None, + 'reset': None, } + + # suitable for terminals with a dark background + themes['dark_bg'] = {'code': Fore.BLUE, + 'errmsg': Fore.YELLOW, + 'filename': Fore.WHITE, + 'indicate': Fore.GREEN, + 'highlight': Fore.RED, + 'reset': Style.RESET_ALL, } + + # suitable for terminals with a light background + themes['light_bg'] = {'code': Fore.BLUE, + 'errmsg': Fore.BLACK, + 'filename': Fore.MAGENTA, + 'indicate': Fore.BLACK, + 'highlight': Fore.RED, + 'reset': Style.RESET_ALL, } + + # suitable for terminals with a blue background + themes['blue_bg'] = {'code': Fore.WHITE, + 'errmsg': Fore.YELLOW, + 'filename': Fore.MAGENTA, + 'indicate': Fore.CYAN, + 'highlight': Fore.RED, + 'reset': Style.RESET_ALL, } + + # suitable for use in jupyter notebooks + themes['jupyter_nb'] = {'code': Fore.BLACK, + 'errmsg': Fore.BLACK, + 'filename': Fore.GREEN, + 'indicate': Fore.CYAN, + 'highlight': Fore.RED, + 'reset': Style.RESET_ALL, } + + default_theme = themes['no_color'] + + class HighlightColorScheme(_DummyColorScheme): + def __init__(self, theme=default_theme): + self._code = theme['code'] + self._errmsg = theme['errmsg'] + self._filename = theme['filename'] + self._indicate = theme['indicate'] + self._highlight = theme['highlight'] + self._reset = theme['reset'] + _DummyColorScheme.__init__(self, theme=theme) + + def _markup(self, msg, color=None, style=Style.BRIGHT): + features = '' + if color: + features += color + if style: + features += style + with ColorShell(): + with reset_terminal() as mu: + mu += features.encode('utf-8') + mu += (msg).encode('utf-8') + return mu.decode('utf-8') + + def code(self, msg): + return self._markup(msg, self._code) + + def errmsg(self, msg): + return self._markup(msg, self._errmsg) + + def filename(self, msg): + return self._markup(msg, self._filename) + + def indicate(self, msg): + 
return self._markup(msg, self._indicate) + + def highlight(self, msg): + return self._markup(msg, self._highlight) + + def reset(self, msg): + return self._markup(msg, self._reset) + + def termcolor(): + global _termcolor_inst + if _termcolor_inst is None: + scheme = themes[numba.core.config.COLOR_SCHEME] + _termcolor_inst = HighlightColorScheme(scheme) + return _termcolor_inst + + +pedantic_warning_info = """ +This warning came from an internal pedantic check. Please report the warning +message and traceback, along with a minimal reproducer at: +https://github.com/numba/numba/issues/new?template=bug_report.md +""" + +feedback_details = """ +Please report the error message and traceback, along with a minimal reproducer +at: https://github.com/numba/numba/issues/new?template=bug_report.md + +If more help is needed please feel free to speak to the Numba core developers +directly at: https://gitter.im/numba/numba + +Thanks in advance for your help in improving Numba! +""" + +unsupported_error_info = """ +Unsupported functionality was found in the code Numba was trying to compile. + +If this functionality is important to you please file a feature request at: +https://github.com/numba/numba/issues/new?template=feature_request.md +""" + +interpreter_error_info = """ +Unsupported Python functionality was found in the code Numba was trying to +compile. This error could be due to invalid code, does the code work +without Numba? (To temporarily disable Numba JIT, set the `NUMBA_DISABLE_JIT` +environment variable to non-zero, and then rerun the code). 
+ +If the code is valid and the unsupported functionality is important to you +please file a feature request at: +https://github.com/numba/numba/issues/new?template=feature_request.md + +To see Python/NumPy features supported by the latest release of Numba visit: +https://numba.readthedocs.io/en/stable/reference/pysupported.html +and +https://numba.readthedocs.io/en/stable/reference/numpysupported.html +""" + +constant_inference_info = """ +Numba could not make a constant out of something that it decided should be +a constant. This could well be a current limitation in Numba's internals, +however please first check that your code is valid for compilation, +particularly with respect to string interpolation (not supported!) and +the requirement of compile time constants as arguments to exceptions: +https://numba.readthedocs.io/en/stable/reference/pysupported.html?highlight=exceptions#constructs + +If the code is valid and the unsupported functionality is important to you +please file a feature request at: +https://github.com/numba/numba/issues/new?template=feature_request.md + +If you think your code should work with Numba. %s +""" % feedback_details + +typing_error_info = """ +This is not usually a problem with Numba itself but instead often caused by +the use of unsupported features or an issue in resolving types. 
class WarningsFixer(object):
    """
    An object "fixing" warnings of a given category caught during
    certain phases.  The warnings can have their filename and lineno fixed,
    and they are deduplicated as well.

    Parameters
    ----------
    category : type
        Warning class to intercept; subclasses are intercepted too.
    """

    def __init__(self, category):
        self._category = category
        # {(filename, lineno, category) -> messages}
        self._warnings = defaultdict(set)

    @contextlib.contextmanager
    def catch_warnings(self, filename=None, lineno=None):
        """
        Store warnings and optionally fix their filename and lineno.

        Warnings of the configured category raised inside the context are
        stored (deduplicated) until ``flush()`` is called.  Warnings of any
        other category are re-emitted once the context exits.

        Parameters
        ----------
        filename : str; optional
            If given, overrides the filename of every stored warning.
        lineno : int; optional
            If given, overrides the line number of every stored warning.
        """
        with warnings.catch_warnings(record=True) as wlist:
            warnings.simplefilter('always', self._category)
            yield

        for w in wlist:
            msg = str(w.message)
            if issubclass(w.category, self._category):
                # Store warnings of this category for deduplication.
                # Use per-warning locals: rebinding the ``filename`` and
                # ``lineno`` parameters here would make the first warning's
                # location stick to every subsequent warning.
                fixed_filename = filename or w.filename
                fixed_lineno = lineno or w.lineno
                self._warnings[fixed_filename, fixed_lineno,
                               w.category].add(msg)
            else:
                # Simply emit other warnings again
                warnings.warn_explicit(msg, w.category,
                                       w.filename, w.lineno)

    def flush(self):
        """
        Emit all stored warnings, then forget them.
        """
        def key(arg):
            # It is possible through codegen to create entirely identical
            # warnings, this leads to comparing types when sorting which breaks
            # on Python 3. Key as str() and if the worse happens then `id`
            # creates some uniqueness
            return str(arg) + str(id(arg))

        for (filename, lineno, category), messages in sorted(
                self._warnings.items(), key=key):
            for msg in sorted(messages):
                warnings.warn_explicit(msg, category, filename, lineno)
        self._warnings.clear()
class NotDefinedError(IRError):
    """
    Raised when IR interpretation meets a variable that has no definition.

    The offending variable name is kept on the ``name`` attribute.
    """

    def __init__(self, name, loc=None):
        self.name = name
        prefix = "The compiler failed to analyze the bytecode. "
        detail = "Variable '%s' is not defined." % name
        super().__init__(prefix + detail, loc=loc)
class UntypedAttributeError(TypingError):
    """
    Typing failure raised when attribute *attr* cannot be resolved on *value*.

    When *value* exposes a ``pymod`` attribute equal to the NumPy module the
    message points at an unsupported NumPy function; otherwise it reports an
    unknown attribute of the given type.
    """

    def __init__(self, value, attr, loc=None):
        pymod = getattr(value, 'pymod', None)
        refers_to_numpy = pymod is not None and pymod == np
        if not refers_to_numpy:
            template = "Unknown attribute '{attr}' of type {type}"
            text = template.format(type=value, attr=attr)
        else:
            # unsupported numpy feature.
            text = ("Use of unsupported NumPy function 'numpy.%s' "
                    "or unsupported use of the function.") % attr
        super().__init__(text, loc=loc)
class ForceLiteralArg(NumbaError):
    """A Pseudo-exception to signal the dispatcher to type an argument literally

    Attributes
    ----------
    requested_args : frozenset[int]
        requested positions of the arguments.
    fold_arguments : callable or None
        the argument-folding function given at construction, if any.
    """
    def __init__(self, arg_indices, fold_arguments=None, loc=None):
        """
        Parameters
        ----------
        arg_indices : Sequence[int]
            requested positions of the arguments.
        fold_arguments: callable
            A function ``(tuple, dict) -> tuple`` that binds and flattens
            the ``args`` and ``kwargs``.
        loc : numba.ir.Loc or None
        """
        description = ("Pseudo-exception to force literal arguments "
                       "in the dispatcher")
        super().__init__(description, loc=loc)
        self.requested_args = frozenset(arg_indices)
        self.fold_arguments = fold_arguments

    def bind_fold_arguments(self, fold_arguments):
        """Return a copy carrying *fold_arguments*, chained to this instance.
        """
        rebound = ForceLiteralArg(self.requested_args, fold_arguments,
                                  loc=self.loc)
        return chain_exception(rebound, self)

    def combine(self, other):
        """Return a new instance requesting the union of both argument sets.

        Raises ``TypeError`` when *other* is not a ``ForceLiteralArg``.
        """
        if isinstance(other, ForceLiteralArg):
            merged = self.requested_args | other.requested_args
            return ForceLiteralArg(merged)
        template = '*other* must be a {} but got a {} instead'
        raise TypeError(template.format(ForceLiteralArg, type(other)))

    def __or__(self, other):
        """Operator form of ``self.combine(other)``.
        """
        return self.combine(other)
@contextlib.contextmanager
def new_error_context(fmt_, *args, **kwargs):
    """
    A contextmanager that prepends contextual information to any exception
    raised within.  If the exception type is not an instance of NumbaError,
    it will be wrapped into a InternalError.   The exception class can be
    changed by providing a "errcls_" keyword argument with the exception
    constructor.

    The first argument is a message that describes the context.  It can be a
    format string.  If there are additional arguments, it will be used as
    ``fmt_.format(*args, **kwargs)`` to produce the final message string.

    Behaviour by exception kind:

    * ``NumbaError``: the context message is appended via ``add_context`` and
      the same exception is re-raised.
    * ``AssertionError``: re-raised untouched.
    * any other ``Exception``: handled according to the captured-errors
      style reported by ``use_old_style_errors()`` /
      ``use_new_style_errors()`` (see comments in the body).

    Side effect: when a ``loc`` keyword is supplied and its ``filename`` does
    not start with the numba package directory, the module-level ``loc_info``
    dict is updated with all keyword arguments.
    """
    # 'errcls_' is consumed here so it never reaches the format call.
    errcls = kwargs.pop('errcls_', InternalError)

    loc = kwargs.get('loc', None)
    # Only record locations that lie outside numba's own source tree.
    if loc is not None and not loc.filename.startswith(_numba_path):
        loc_info.update(kwargs)

    try:
        yield
    except NumbaError as e:
        e.add_context(_format_msg(fmt_, args, kwargs))
        raise
    except AssertionError:
        # Let assertion error pass through for shorter traceback in debugging
        raise
    except Exception as e:
        if use_old_style_errors():
            # Old style: wrap the foreign exception in ``errcls`` and attach
            # the context message.
            newerr = errcls(e).add_context(_format_msg(fmt_, args, kwargs))
            # The original traceback is kept only when full tracebacks are
            # enabled in the configuration.
            if numba.core.config.FULL_TRACEBACKS:
                tb = sys.exc_info()[2]
            else:
                tb = None
            raise newerr.with_traceback(tb)
        elif use_new_style_errors():
            # New style: propagate the original exception unwrapped.
            raise e
        else:
            # Neither style matched the configured value.
            msg = ("Unknown CAPTURED_ERRORS style: "
                   f"'{numba.core.config.CAPTURED_ERRORS}'.")
            assert 0, msg
class Event:
    """An event.

    Parameters
    ----------
    kind : str
    status : EventStatus
    data : any; optional
        Additional data for the event.
    exc_details : 3-tuple; optional
        Same 3-tuple for ``__exit__``.
    """
    def __init__(self, kind, status, data=None, exc_details=None):
        self._kind = _guard_kind(kind)
        self._status = status
        self._data = data
        # A tuple whose first item is None (what ``__exit__`` receives on a
        # clean exit) is normalised to None so that "no exception" has a
        # single representation.
        self._exc_details = (None
                             if exc_details is None or exc_details[0] is None
                             else exc_details)

    @property
    def kind(self):
        """Event kind

        Returns
        -------
        res : str
        """
        return self._kind

    @property
    def status(self):
        """Event status

        Returns
        -------
        res : EventStatus
        """
        return self._status

    @property
    def data(self):
        """Event data

        Returns
        -------
        res : object
        """
        return self._data

    @property
    def is_start(self):
        """Is it a *START* event?

        Returns
        -------
        res : bool
        """
        return self._status == EventStatus.START

    @property
    def is_end(self):
        """Is it an *END* event?

        Returns
        -------
        res : bool
        """
        return self._status == EventStatus.END

    @property
    def is_failed(self):
        """Is the event carrying an exception?

        This is used for *END* event. This method will never return ``True``
        in a *START* event.

        Returns
        -------
        res : bool
        """
        # The previous ``is None`` test was inverted: it reported failure
        # exactly when no exception details were attached.
        return self._exc_details is not None

    def __str__(self):
        data = (f"{type(self.data).__qualname__}"
                if self.data is not None else "None")
        return f"Event({self._kind}, {self._status}, data: {data})"

    __repr__ = __str__
class Listener(abc.ABC):
    """Abstract base for event listeners.

    Subclasses implement ``on_start`` and ``on_end``; ``notify`` routes an
    event to one of them.
    """
    @abc.abstractmethod
    def on_start(self, event):
        """Handle a *START* event.

        Parameters
        ----------
        event : Event
        """

    @abc.abstractmethod
    def on_end(self, event):
        """Handle an *END* event.

        Parameters
        ----------
        event : Event
        """

    def notify(self, event):
        """Dispatch *event* to ``on_start`` or ``on_end``.

        Raises ``AssertionError`` if the event is neither a start nor an
        end event.

        Parameters
        ----------
        event : Event
        """
        if event.is_start:
            self.on_start(event)
            return
        if event.is_end:
            self.on_end(event)
            return
        raise AssertionError("unreachable")
@contextmanager
def install_timer(kind, callback):
    """Temporarily install a ``TimingListener`` for events of *kind*.

    The listener is yielded to the ``with`` body.  When the body finishes
    without raising and the listener has completed a measurement,
    *callback* is invoked with the measured duration (a float, in seconds).

    Returns
    -------
    res : TimingListener

    Examples
    --------

    The listener installation part is equivalent to:

    >>> with install_listener(kind, TimingListener()) as res:
    >>>    ...
    """
    timing = TimingListener()
    with install_listener(kind, timing):
        yield timing

    # Nothing to report if no matching START/END pair was observed.
    if not timing.done:
        return
    callback(timing.duration)
def _prepare_chrome_trace_data(listener: RecordingListener):
    """Prepare events in `listener` for serializing as chrome trace data.

    Parameters
    ----------
    listener : RecordingListener
        Its ``buffer`` is read as ``(timestamp, event)`` pairs, where the
        timestamp is in seconds as recorded by ``time.time()``.

    Returns
    -------
    res : list[dict]
        One trace-event dict per recorded event, in buffer order.  Each
        event's ``data`` must be a mapping with a ``"name"`` key.
    """
    # The spec for the trace event format can be found at:
    # https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit   # noqa
    # This code only uses the JSON Array Format for simplicity.
    pid = os.getpid()
    tid = _get_native_ident()
    evs = []
    for ts, rec in listener.buffer:
        data = rec.data
        cat = str(rec.kind)
        # The trace format expects microsecond timestamps, whereas the
        # recorded values are in seconds.
        ts_scaled = ts * 1_000_000
        # 'B' / 'E' mark the begin / end of a duration event.
        ph = 'B' if rec.is_start else 'E'
        name = data['name']
        args = data
        ev = dict(
            cat=cat, pid=pid, tid=tid, ts=ts_scaled, ph=ph, name=name,
            args=args,
        )
        evs.append(ev)
    return evs
def type_callable(func):
    """
    Decorator factory registering a typing function for the callable *func*.

    *func* is either a callable object (typically a global) or a string that
    names a built-in operation (such as 'getitem' or '__array_wrap__').
    Anything else raises ``TypeError``.  The decorated typing function is
    called with the typing context and is returned unchanged.
    """
    from numba.core.typing.templates import (CallableTemplate, infer,
                                             infer_global)
    func_is_callable = callable(func)
    if not (func_is_callable or isinstance(func, str)):
        raise TypeError("`func` should be a function or string")
    try:
        label = func.__name__
    except AttributeError:
        # e.g. a plain string key has no __name__
        label = str(func)

    def decorate(typing_func):
        def generic(self):
            return typing_func(self.context)

        # Build a CallableTemplate subclass keyed on *func* at runtime.
        template = type(
            "%s_CallableTemplate" % (label,),
            (CallableTemplate,),
            dict(key=func, generic=generic),
        )
        infer(template)
        if func_is_callable:
            infer_global(func, types.Function(template))
        return typing_func

    return decorate
+ + Overloading strictness (that the typing and implementing signatures match) + is enforced by the **strict** keyword argument, it is recommended that this + is set to True (default). + + To handle a function that accepts imprecise types, an overload + definition can return 2-tuple of ``(signature, impl_function)``, where + the ``signature`` is a ``typing.Signature`` specifying the precise + signature to be used; and ``impl_function`` is the same implementation + function as in the simple case. + + If the kwarg inline determines whether the overload is inlined in the + calling function and can be one of three values: + * 'never' (default) - the overload is never inlined. + * 'always' - the overload is always inlined. + * a function that takes two arguments, both of which are instances of a + namedtuple with fields: + * func_ir + * typemap + * calltypes + * signature + The first argument holds the information from the caller, the second + holds the information from the callee. The function should return Truthy + to determine whether to inline, this essentially permitting custom + inlining rules (typical use might be cost models). + + The *prefer_literal* option allows users to control if literal types should + be tried first or last. The default (`False`) is to use non-literal types. + Implementations that can specialize based on literal values should set the + option to `True`. Note, this option maybe expanded in the near future to + allow for more control (e.g. disabling non-literal types). + + **kwargs prescribes additional arguments passed through to the overload + template. The only accepted key at present is 'target' which is a string + corresponding to the target that this overload should be bound against. 
def register_jitable(*args, **kwargs):
    """
    Register a regular python function that can be executed by the python
    interpreter and can be compiled into a nopython function when referenced
    by other jit'ed functions.  Can be used as::

        @register_jitable
        def foo(x, y):
            return x + y

    Or, with compiler options::

        @register_jitable(_nrt=False) # disable runtime allocation
        def foo(x, y):
            return x + y

    The function itself is always returned unmodified.
    """
    def wrap(fn):
        # It is just a wrapper for @overload.  'inline' is an @overload
        # argument, so it is taken out of the jit options first.
        inline_policy = kwargs.pop('inline', 'never')

        def ov_wrap(*args, **kwargs):
            return fn

        register = overload(fn, jit_options=kwargs, inline=inline_policy,
                            strict=False)
        register(ov_wrap)
        return fn

    # With options this is a decorator factory; without, decorate directly.
    return wrap if kwargs else wrap(*args)
def _overload_method_common(typ, attr, **kwargs):
    """Shared implementation behind overload_method and overload_classmethod.

    Returns a decorator that builds and registers a method template for
    *attr* on *typ*, also passes the decorated function through
    ``overload`` with the original keyword arguments, and returns the
    decorated function unchanged.
    """
    from numba.core.typing.templates import make_overload_method_template

    def decorate(overload_func):
        # Work on a copy so the enclosing kwargs stay intact for the
        # ``overload`` call below.
        template_kwargs = kwargs.copy()
        inline_policy = template_kwargs.pop('inline', 'never')
        literal_first = template_kwargs.pop('prefer_literal', False)
        template = make_overload_method_template(
            typ, attr, overload_func,
            inline=inline_policy,
            prefer_literal=literal_first,
            **template_kwargs,
        )
        infer_getattr(template)
        register = overload(overload_func, **kwargs)
        register(overload_func)
        return overload_func

    return decorate
+ + Here is an example implementing .take() for array types:: + + @overload_method(types.Array, 'take') + def array_take(arr, indices): + if isinstance(indices, types.Array): + def take_impl(arr, indices): + n = indices.shape[0] + res = np.empty(n, arr.dtype) + for i in range(n): + res[i] = arr[indices[i]] + return res + return take_impl + """ + return _overload_method_common(typ, attr, **kwargs) + + +def overload_classmethod(typ, attr, **kwargs): + """ + A decorator marking the decorated function as typing and implementing + classmethod *attr* for the given Numba type in nopython mode. + + + Similar to ``overload_method``. + + + Here is an example implementing a classmethod on the Array type to call + ``np.arange()``:: + + @overload_classmethod(types.Array, "make") + def ov_make(cls, nitems): + def impl(cls, nitems): + return np.arange(nitems) + return impl + + The above code will allow the following to work in jit-compiled code:: + + @njit + def foo(n): + return types.Array.make(n) + """ + return _overload_method_common(types.TypeRef(typ), attr, **kwargs) + + +def make_attribute_wrapper(typeclass, struct_attr, python_attr): + """ + Make an automatic attribute wrapper exposing member named *struct_attr* + as a read-only attribute named *python_attr*. + The given *typeclass*'s model must be a StructModel subclass. + """ + from numba.core.typing.templates import AttributeTemplate + from numba.core.datamodel import default_manager + from numba.core.datamodel.models import StructModel + from numba.core.imputils import impl_ret_borrowed + from numba.core import cgutils + + if not isinstance(typeclass, type) or not issubclass(typeclass, types.Type): + raise TypeError("typeclass should be a Type subclass, got %s" + % (typeclass,)) + + def get_attr_fe_type(typ): + """ + Get the Numba type of member *struct_attr* in *typ*. 
+ """ + model = default_manager.lookup(typ) + if not isinstance(model, StructModel): + raise TypeError("make_struct_attribute_wrapper() needs a type " + "with a StructModel, but got %s" % (model,)) + return model.get_member_fe_type(struct_attr) + + @infer_getattr + class StructAttribute(AttributeTemplate): + key = typeclass + + def generic_resolve(self, typ, attr): + if attr == python_attr: + return get_attr_fe_type(typ) + + @lower_getattr(typeclass, python_attr) + def struct_getattr_impl(context, builder, typ, val): + val = cgutils.create_struct_proxy(typ)(context, builder, value=val) + attrty = get_attr_fe_type(typ) + attrval = getattr(val, struct_attr) + return impl_ret_borrowed(context, builder, attrty, attrval) + + +class _Intrinsic(ReduceMixin): + """ + Dummy callable for intrinsic + """ + _memo = weakref.WeakValueDictionary() + # hold refs to last N functions deserialized, retaining them in _memo + # regardless of whether there is another reference + _recent = collections.deque(maxlen=config.FUNCTION_CACHE_SIZE) + + __uuid = None + + def __init__(self, name, defn, **kwargs): + self._ctor_kwargs = kwargs + self._name = name + self._defn = defn + functools.update_wrapper(self, defn) + + @property + def _uuid(self): + """ + An instance-specific UUID, to avoid multiple deserializations of + a given instance. + + Note this is lazily-generated, for performance reasons. 
+ """ + u = self.__uuid + if u is None: + u = str(uuid.uuid1()) + self._set_uuid(u) + return u + + def _set_uuid(self, u): + assert self.__uuid is None + self.__uuid = u + self._memo[u] = self + self._recent.append(self) + + def _register(self): + # _ctor_kwargs + from numba.core.typing.templates import (make_intrinsic_template, + infer_global) + + template = make_intrinsic_template(self, self._defn, self._name, + self._ctor_kwargs) + infer(template) + infer_global(self, types.Function(template)) + + def __call__(self, *args, **kwargs): + """ + This is only defined to pretend to be a callable from CPython. + """ + msg = '{0} is not usable in pure-python'.format(self) + raise NotImplementedError(msg) + + def __repr__(self): + return "".format(self._name) + + def __deepcopy__(self, memo): + # NOTE: Intrinsic are immutable and we don't need to copy. + # This is triggered from deepcopy of statements. + return self + + def _reduce_states(self): + """ + NOTE: part of ReduceMixin protocol + """ + return dict(uuid=self._uuid, name=self._name, defn=self._defn) + + @classmethod + def _rebuild(cls, uuid, name, defn): + """ + NOTE: part of ReduceMixin protocol + """ + try: + return cls._memo[uuid] + except KeyError: + llc = cls(name=name, defn=defn) + llc._register() + llc._set_uuid(uuid) + return llc + + +def intrinsic(*args, **kwargs): + """ + A decorator marking the decorated function as typing and implementing + *func* in nopython mode using the llvmlite IRBuilder API. This is an escape + hatch for expert users to build custom LLVM IR that will be inlined to + the caller. + + The first argument to *func* is the typing context. The rest of the + arguments corresponds to the type of arguments of the decorated function. + These arguments are also used as the formal argument of the decorated + function. If *func* has the signature ``foo(typing_context, arg0, arg1)``, + the decorated function will have the signature ``foo(arg0, arg1)``. 
+ + The return values of *func* should be a 2-tuple of expected type signature, + and a code-generation function that will passed to ``lower_builtin``. + For unsupported operation, return None. + + Here is an example implementing a ``cast_int_to_byte_ptr`` that cast + any integer to a byte pointer:: + + @intrinsic + def cast_int_to_byte_ptr(typingctx, src): + # check for accepted types + if isinstance(src, types.Integer): + # create the expected type signature + result_type = types.CPointer(types.uint8) + sig = result_type(types.uintp) + # defines the custom code generation + def codegen(context, builder, signature, args): + # llvm IRBuilder code here + [src] = args + rtype = signature.return_type + llrtype = context.get_value_type(rtype) + return builder.inttoptr(src, llrtype) + return sig, codegen + """ + # Make inner function for the actual work + def _intrinsic(func): + name = getattr(func, '__name__', str(func)) + llc = _Intrinsic(name, func, **kwargs) + llc._register() + return llc + + if not kwargs: + # No option is given + return _intrinsic(*args) + else: + # options are given, create a new callable to recv the + # definition function + def wrapper(func): + return _intrinsic(func) + return wrapper + + +def get_cython_function_address(module_name, function_name): + """ + Get the address of a Cython function. + + Args + ---- + module_name: + Name of the Cython module + function_name: + Name of the Cython function + + Returns + ------- + A Python int containing the address of the function + + """ + return _import_cython_function(module_name, function_name) + + +def include_path(): + """Returns the C include directory path. 
+ """ + include_dir = os.path.dirname(os.path.dirname(numba.__file__)) + path = os.path.abspath(include_dir) + return path + + +def sentry_literal_args(pysig, literal_args, args, kwargs): + """Ensures that the given argument types (in *args* and *kwargs*) are + literally typed for a function with the python signature *pysig* and the + list of literal argument names in *literal_args*. + + Alternatively, this is the same as:: + + SentryLiteralArgs(literal_args).for_pysig(pysig).bind(*args, **kwargs) + """ + boundargs = pysig.bind(*args, **kwargs) + + # Find literal argument positions and whether it is satisfied. + request_pos = set() + missing = False + for i, (k, v) in enumerate(boundargs.arguments.items()): + if k in literal_args: + request_pos.add(i) + if not isinstance(v, types.Literal): + missing = True + if missing: + # Yes, there are missing required literal arguments + e = errors.ForceLiteralArg(request_pos) + + # A helper function to fold arguments + def folded(args, kwargs): + out = pysig.bind(*args, **kwargs).arguments.values() + return tuple(out) + + raise e.bind_fold_arguments(folded) + + +class SentryLiteralArgs(collections.namedtuple( + '_SentryLiteralArgs', ['literal_args'])): + """ + Parameters + ---------- + literal_args : Sequence[str] + A sequence of names for literal arguments + + Examples + -------- + + The following line: + + >>> SentryLiteralArgs(literal_args).for_pysig(pysig).bind(*args, **kwargs) + + is equivalent to: + + >>> sentry_literal_args(pysig, literal_args, args, kwargs) + """ + def for_function(self, func): + """Bind the sentry to the signature of *func*. + + Parameters + ---------- + func : Function + A python function. + + Returns + ------- + obj : BoundLiteralArgs + """ + return self.for_pysig(utils.pysignature(func)) + + def for_pysig(self, pysig): + """Bind the sentry to the given signature *pysig*. 
+ + Parameters + ---------- + pysig : inspect.Signature + + + Returns + ------- + obj : BoundLiteralArgs + """ + return BoundLiteralArgs( + pysig=pysig, + literal_args=self.literal_args, + ) + + +class BoundLiteralArgs(collections.namedtuple( + 'BoundLiteralArgs', ['pysig', 'literal_args'])): + """ + This class is usually created by SentryLiteralArgs. + """ + def bind(self, *args, **kwargs): + """Bind to argument types. + """ + return sentry_literal_args( + self.pysig, + self.literal_args, + args, + kwargs, + ) + + +def is_jitted(function): + """Returns True if a function is wrapped by one of the Numba @jit + decorators, for example: numba.jit, numba.njit + + The purpose of this function is to provide a means to check if a function is + already JIT decorated. + """ + + # don't want to export this so import locally + from numba.core.dispatcher import Dispatcher + return isinstance(function, Dispatcher) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/externals.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/externals.py new file mode 100644 index 0000000000000000000000000000000000000000..e181b5f43d9d8dda06a84ba93a176058cc5b1543 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/externals.py @@ -0,0 +1,155 @@ +""" +Register external C functions necessary for Numba code generation. +""" + +import sys + +from llvmlite import ir +import llvmlite.binding as ll + +from numba.core import utils, intrinsics +from numba import _helperlib + + +def _add_missing_symbol(symbol, addr): + """Add missing symbol into LLVM internal symtab + """ + if not ll.address_of_symbol(symbol): + ll.add_symbol(symbol, addr) + + +def _get_msvcrt_symbol(symbol): + """ + Under Windows, look up a symbol inside the C runtime + and return the raw pointer value as an integer. 
+ """ + from ctypes import cdll, cast, c_void_p + f = getattr(cdll.msvcrt, symbol) + return cast(f, c_void_p).value + + +def compile_multi3(context): + """ + Compile the multi3() helper function used by LLVM + for 128-bit multiplication on 32-bit platforms. + """ + codegen = context.codegen() + library = codegen.create_library("multi3") + + ir_mod = library.create_ir_module("multi3") + + i64 = ir.IntType(64) + i128 = ir.IntType(128) + lower_mask = ir.Constant(i64, 0xffffffff) + _32 = ir.Constant(i64, 32) + _64 = ir.Constant(i128, 64) + + fn_type = ir.FunctionType(i128, [i128, i128]) + fn = ir.Function(ir_mod, fn_type, name="multi3") + + a, b = fn.args + bb = fn.append_basic_block() + builder = ir.IRBuilder(bb) + + # This implementation mimics compiler-rt's. + al = builder.trunc(a, i64) + bl = builder.trunc(b, i64) + ah = builder.trunc(builder.ashr(a, _64), i64) + bh = builder.trunc(builder.ashr(b, _64), i64) + + # Compute {rh, rl} = al * bl (unsigned 64-bit multiplication) + # rl = (al & 0xffffffff) * (bl & 0xffffffff) + rl = builder.mul(builder.and_(al, lower_mask), builder.and_(bl, lower_mask)) + # t = rl >> 32 + t = builder.lshr(rl, _32) + # rl &= 0xffffffff + rl = builder.and_(rl, lower_mask) + # t += (al >> 32) * (bl & 0xffffffff) + t = builder.add(t, builder.mul(builder.lshr(al, _32), + builder.and_(bl, lower_mask))) + # rl += t << 32 + rl = builder.add(rl, builder.shl(t, _32)) + # rh = t >> 32 + rh = builder.lshr(t, _32) + # t = rl >> 32 + t = builder.lshr(rl, _32) + # rl &= 0xffffffff + rl = builder.and_(rl, lower_mask) + # t += (bl >> 32) * (al & 0xffffffff) + t = builder.add(t, builder.mul(builder.lshr(bl, _32), + builder.and_(al, lower_mask))) + # rl += t << 32 + rl = builder.add(rl, builder.shl(t, _32)) + # rh += t >> 32 + rh = builder.add(rh, builder.lshr(t, _32)) + # rh += (al >> 32) * (bl >> 32) + rh = builder.add(rh, builder.mul(builder.lshr(al, _32), + builder.lshr(bl, _32))) + + # rh += (bh * al) + (bl * ah) + rh = builder.add(rh, builder.mul(bh, 
al)) + rh = builder.add(rh, builder.mul(bl, ah)) + + # r = rl + (rh << 64) + r = builder.zext(rl, i128) + r = builder.add(r, builder.shl(builder.zext(rh, i128), _64)) + builder.ret(r) + + library.add_ir_module(ir_mod) + library.finalize() + + return library + + +class _Installer(object): + + _installed = False + + def install(self, context): + """ + Install the functions into LLVM. This only needs to be done once, + as the mappings are persistent during the process lifetime. + """ + if not self._installed: + self._do_install(context) + self._installed = True + + +class _ExternalMathFunctions(_Installer): + """ + Map the math functions from the C runtime library into the LLVM + execution environment. + """ + + def _do_install(self, context): + is32bit = utils.MACHINE_BITS == 32 + c_helpers = _helperlib.c_helpers + + if sys.platform.startswith('win32') and is32bit: + # For Windows XP _ftol2 is not defined, we will just use + # _ftol as a replacement. + # On Windows 7, this is not necessary but will work anyway. + ftol = _get_msvcrt_symbol("_ftol") + _add_missing_symbol("_ftol2", ftol) + + elif sys.platform.startswith('linux') and is32bit: + _add_missing_symbol("__fixunsdfdi", c_helpers["fptoui"]) + _add_missing_symbol("__fixunssfdi", c_helpers["fptouif"]) + + if is32bit: + # Make the library immortal + self._multi3_lib = compile_multi3(context) + ptr = self._multi3_lib.get_pointer_to_function("multi3") + assert ptr + _add_missing_symbol("__multi3", ptr) + + # List available C-math + for fname in intrinsics.INTR_MATH: + # Force binding from CPython's C runtime library. 
+ # (under Windows, different versions of the C runtime can + # be loaded at the same time, for example msvcrt100 by + # CPython and msvcrt120 by LLVM) + ll.add_symbol(fname, c_helpers[fname]) + + +c_math_functions = _ExternalMathFunctions() diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/fastmathpass.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/fastmathpass.py new file mode 100644 index 0000000000000000000000000000000000000000..d6dd1b89c20cb78cd9b819663aa21ce270ce4bbe --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/fastmathpass.py @@ -0,0 +1,44 @@ +from llvmlite import ir +from llvmlite.ir.transforms import Visitor, CallVisitor + + +class FastFloatBinOpVisitor(Visitor): + """ + A pass to add fastmath flag to float-binop instruction if they don't have + any flags. + """ + float_binops = frozenset(['fadd', 'fsub', 'fmul', 'fdiv', 'frem', 'fcmp']) + + def __init__(self, flags): + self.flags = flags + + def visit_Instruction(self, instr): + if instr.opname in self.float_binops: + if not instr.flags: + for flag in self.flags: + instr.flags.append(flag) + + +class FastFloatCallVisitor(CallVisitor): + """ + A pass to change all float function calls to use fastmath. + """ + + def __init__(self, flags): + self.flags = flags + + def visit_Call(self, instr): + # Add to any call that has float/double return type + if instr.type in (ir.FloatType(), ir.DoubleType()): + for flag in self.flags: + instr.fastmath.add(flag) + + +def rewrite_module(mod, options): + """ + Rewrite the given LLVM module to use fastmath everywhere. 
+ """ + flags = options.flags + FastFloatBinOpVisitor(flags).visit(mod) + FastFloatCallVisitor(flags).visit(mod) + diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/funcdesc.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/funcdesc.py new file mode 100644 index 0000000000000000000000000000000000000000..2a1a4fe17ef761538cf1defba575632a604a8de0 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/funcdesc.py @@ -0,0 +1,230 @@ +""" +Function descriptors. +""" + +from collections import defaultdict +import importlib + +from numba.core import types, itanium_mangler +from numba.core.utils import _dynamic_modname, _dynamic_module + + +def default_mangler(name, argtypes, *, abi_tags=(), uid=None): + return itanium_mangler.mangle(name, argtypes, abi_tags=abi_tags, uid=uid) + + +def qualifying_prefix(modname, qualname): + """ + Returns a new string that is used for the first half of the mangled name. + """ + # XXX choose a different convention for object mode + return '{}.{}'.format(modname, qualname) if modname else qualname + + +class FunctionDescriptor(object): + """ + Base class for function descriptors: an object used to carry + useful metadata about a natively callable function. + + Note that while `FunctionIdentity` denotes a Python function + which is being concretely compiled by Numba, `FunctionDescriptor` + may be more "abstract": e.g. a function decorated with `@generated_jit`. 
+ """ + __slots__ = ('native', 'modname', 'qualname', 'doc', 'typemap', + 'calltypes', 'args', 'kws', 'restype', 'argtypes', + 'mangled_name', 'unique_name', 'env_name', 'global_dict', + 'inline', 'noalias', 'abi_tags', 'uid') + + def __init__(self, native, modname, qualname, unique_name, doc, + typemap, restype, calltypes, args, kws, mangler=None, + argtypes=None, inline=False, noalias=False, env_name=None, + global_dict=None, abi_tags=(), uid=None): + self.native = native + self.modname = modname + self.global_dict = global_dict + self.qualname = qualname + self.unique_name = unique_name + self.doc = doc + # XXX typemap and calltypes should be on the compile result, + # not the FunctionDescriptor + self.typemap = typemap + self.calltypes = calltypes + self.args = args + self.kws = kws + self.restype = restype + # Argument types + if argtypes is not None: + assert isinstance(argtypes, tuple), argtypes + self.argtypes = argtypes + else: + # Get argument types from the type inference result + # (note the "arg.FOO" convention as used in typeinfer + self.argtypes = tuple(self.typemap['arg.' + a] for a in args) + mangler = default_mangler if mangler is None else mangler + # The mangled name *must* be unique, else the wrong function can + # be chosen at link time. + qualprefix = qualifying_prefix(self.modname, self.qualname) + self.uid = uid + self.mangled_name = mangler( + qualprefix, self.argtypes, abi_tags=abi_tags, uid=uid, + ) + if env_name is None: + env_name = mangler(".NumbaEnv.{}".format(qualprefix), + self.argtypes, abi_tags=abi_tags, uid=uid) + self.env_name = env_name + self.inline = inline + self.noalias = noalias + self.abi_tags = abi_tags + + def lookup_globals(self): + """ + Return the global dictionary of the function. + It may not match the Module's globals if the function is created + dynamically (i.e. 
exec) + """ + return self.global_dict or self.lookup_module().__dict__ + + def lookup_module(self): + """ + Return the module in which this function is supposed to exist. + This may be a dummy module if the function was dynamically + generated or the module can't be found. + """ + if self.modname == _dynamic_modname: + return _dynamic_module + else: + try: + # ensure module exist + return importlib.import_module(self.modname) + except ImportError: + return _dynamic_module + + def lookup_function(self): + """ + Return the original function object described by this object. + """ + return getattr(self.lookup_module(), self.qualname) + + @property + def llvm_func_name(self): + """ + The LLVM-registered name for the raw function. + """ + return self.mangled_name + + # XXX refactor this + + @property + def llvm_cpython_wrapper_name(self): + """ + The LLVM-registered name for a CPython-compatible wrapper of the + raw function (i.e. a PyCFunctionWithKeywords). + """ + return itanium_mangler.prepend_namespace(self.mangled_name, + ns='cpython') + + @property + def llvm_cfunc_wrapper_name(self): + """ + The LLVM-registered name for a C-compatible wrapper of the + raw function. + """ + return 'cfunc.' + self.mangled_name + + def __repr__(self): + return "" % (self.unique_name) + + @classmethod + def _get_function_info(cls, func_ir): + """ + Returns + ------- + qualname, unique_name, modname, doc, args, kws, globals + + ``unique_name`` must be a unique name. + """ + func = func_ir.func_id.func + qualname = func_ir.func_id.func_qualname + # XXX to func_id + modname = func.__module__ + doc = func.__doc__ or '' + args = tuple(func_ir.arg_names) + kws = () # TODO + global_dict = None + + if modname is None: + # Dynamically generated function. + modname = _dynamic_modname + # Retain a reference to the dictionary of the function. + # This disables caching, serialization and pickling. 
+ global_dict = func_ir.func_id.func.__globals__ + + unique_name = func_ir.func_id.unique_name + + return qualname, unique_name, modname, doc, args, kws, global_dict + + @classmethod + def _from_python_function(cls, func_ir, typemap, restype, + calltypes, native, mangler=None, + inline=False, noalias=False, abi_tags=()): + (qualname, unique_name, modname, doc, args, kws, global_dict, + ) = cls._get_function_info(func_ir) + + self = cls(native, modname, qualname, unique_name, doc, + typemap, restype, calltypes, + args, kws, mangler=mangler, inline=inline, noalias=noalias, + global_dict=global_dict, abi_tags=abi_tags, + uid=func_ir.func_id.unique_id) + return self + + +class PythonFunctionDescriptor(FunctionDescriptor): + """ + A FunctionDescriptor subclass for Numba-compiled functions. + """ + __slots__ = () + + @classmethod + def from_specialized_function(cls, func_ir, typemap, restype, calltypes, + mangler, inline, noalias, abi_tags): + """ + Build a FunctionDescriptor for a given specialization of a Python + function (in nopython mode). + """ + return cls._from_python_function(func_ir, typemap, restype, calltypes, + native=True, mangler=mangler, + inline=inline, noalias=noalias, + abi_tags=abi_tags) + + @classmethod + def from_object_mode_function(cls, func_ir): + """ + Build a FunctionDescriptor for an object mode variant of a Python + function. + """ + typemap = defaultdict(lambda: types.pyobject) + calltypes = typemap.copy() + restype = types.pyobject + return cls._from_python_function(func_ir, typemap, restype, calltypes, + native=False) + + +class ExternalFunctionDescriptor(FunctionDescriptor): + """ + A FunctionDescriptor subclass for opaque external functions + (e.g. raw C functions). 
+ """ + __slots__ = () + + def __init__(self, name, restype, argtypes): + args = ["arg%d" % i for i in range(len(argtypes))] + + def mangler(a, x, abi_tags, uid=None): + return a + super(ExternalFunctionDescriptor, self + ).__init__(native=True, modname=None, qualname=name, + unique_name=name, doc='', typemap=None, + restype=restype, calltypes=None, args=args, + kws=None, + mangler=mangler, + argtypes=argtypes) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/generators.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/generators.py new file mode 100644 index 0000000000000000000000000000000000000000..6060d160c3bf2a2a7a01ec480c86d0c949cf0b6f --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/generators.py @@ -0,0 +1,356 @@ +""" +Support for lowering generators. +""" + +import llvmlite.ir +from llvmlite.ir import Constant, IRBuilder + +from numba.core import types, config, cgutils +from numba.core.funcdesc import FunctionDescriptor + + +class GeneratorDescriptor(FunctionDescriptor): + """ + The descriptor for a generator's next function. + """ + __slots__ = () + + @classmethod + def from_generator_fndesc(cls, func_ir, fndesc, gentype, mangler): + """ + Build a GeneratorDescriptor for the generator returned by the + function described by *fndesc*, with type *gentype*. + + The generator inherits the env_name from the *fndesc*. + All emitted functions for the generator shares the same Env. 
+ """ + assert isinstance(gentype, types.Generator) + restype = gentype.yield_type + args = ['gen'] + argtypes = (gentype,) + qualname = fndesc.qualname + '.next' + unique_name = fndesc.unique_name + '.next' + self = cls(fndesc.native, fndesc.modname, qualname, unique_name, + fndesc.doc, fndesc.typemap, restype, fndesc.calltypes, + args, fndesc.kws, argtypes=argtypes, mangler=mangler, + inline=False, env_name=fndesc.env_name) + return self + + @property + def llvm_finalizer_name(self): + """ + The LLVM name of the generator's finalizer function + (if .has_finalizer is true). + """ + return 'finalize_' + self.mangled_name + + +class BaseGeneratorLower(object): + """ + Base support class for lowering generators. + """ + + def __init__(self, lower): + self.context = lower.context + self.fndesc = lower.fndesc + self.library = lower.library + self.call_conv = lower.call_conv + self.func_ir = lower.func_ir + + self.geninfo = lower.generator_info + self.gentype = self.get_generator_type() + self.gendesc = GeneratorDescriptor.from_generator_fndesc( + lower.func_ir, self.fndesc, self.gentype, self.context.mangler) + # Helps packing non-omitted arguments into a structure + self.arg_packer = self.context.get_data_packer(self.fndesc.argtypes) + + self.resume_blocks = {} + + def get_args_ptr(self, builder, genptr): + return cgutils.gep_inbounds(builder, genptr, 0, 1) + + def get_resume_index_ptr(self, builder, genptr): + return cgutils.gep_inbounds(builder, genptr, 0, 0, + name='gen.resume_index') + + def get_state_ptr(self, builder, genptr): + return cgutils.gep_inbounds(builder, genptr, 0, 2, + name='gen.state') + + def lower_init_func(self, lower): + """ + Lower the generator's initialization function (which will fill up + the passed-by-reference generator structure). + """ + lower.setup_function(self.fndesc) + + builder = lower.builder + + # Insert the generator into the target context in order to allow + # calling from other Numba-compiled functions. 
+ lower.context.insert_generator(self.gentype, self.gendesc, + [self.library]) + + # Init argument values + lower.extract_function_arguments() + + lower.pre_lower() + + # Initialize the return structure (i.e. the generator structure). + retty = self.context.get_return_type(self.gentype) + # Structure index #0: the initial resume index (0 == start of generator) + resume_index = self.context.get_constant(types.int32, 0) + # Structure index #1: the function arguments + argsty = retty.elements[1] + statesty = retty.elements[2] + + lower.debug_print("# low_init_func incref") + # Incref all NRT arguments before storing into generator states + if self.context.enable_nrt: + for argty, argval in zip(self.fndesc.argtypes, lower.fnargs): + self.context.nrt.incref(builder, argty, argval) + + # Filter out omitted arguments + argsval = self.arg_packer.as_data(builder, lower.fnargs) + + # Zero initialize states + statesval = Constant(statesty, None) + gen_struct = cgutils.make_anonymous_struct(builder, + [resume_index, argsval, + statesval], + retty) + + retval = self.box_generator_struct(lower, gen_struct) + + lower.debug_print("# low_init_func before return") + self.call_conv.return_value(builder, retval) + lower.post_lower() + + def lower_next_func(self, lower): + """ + Lower the generator's next() function (which takes the + passed-by-reference generator structure and returns the next + yielded value). 
+ """ + lower.setup_function(self.gendesc) + lower.debug_print("# lower_next_func: {0}".format(self.gendesc.unique_name)) + assert self.gendesc.argtypes[0] == self.gentype + builder = lower.builder + function = lower.function + + # Extract argument values and other information from generator struct + genptr, = self.call_conv.get_arguments(function) + self.arg_packer.load_into(builder, + self.get_args_ptr(builder, genptr), + lower.fnargs) + + self.resume_index_ptr = self.get_resume_index_ptr(builder, genptr) + self.gen_state_ptr = self.get_state_ptr(builder, genptr) + + prologue = function.append_basic_block("generator_prologue") + + # Lower the generator's Python code + entry_block_tail = lower.lower_function_body() + + # Add block for StopIteration on entry + stop_block = function.append_basic_block("stop_iteration") + builder.position_at_end(stop_block) + self.call_conv.return_stop_iteration(builder) + + # Add prologue switch to resume blocks + builder.position_at_end(prologue) + # First Python block is also the resume point on first next() call + first_block = self.resume_blocks[0] = lower.blkmap[lower.firstblk] + + # Create front switch to resume points + switch = builder.switch(builder.load(self.resume_index_ptr), + stop_block) + for index, block in self.resume_blocks.items(): + switch.add_case(index, block) + + # Close tail of entry block + builder.position_at_end(entry_block_tail) + builder.branch(prologue) + + def lower_finalize_func(self, lower): + """ + Lower the generator's finalizer. 
+ """ + fnty = llvmlite.ir.FunctionType(llvmlite.ir.VoidType(), + [self.context.get_value_type(self.gentype)]) + function = cgutils.get_or_insert_function( + lower.module, fnty, self.gendesc.llvm_finalizer_name) + entry_block = function.append_basic_block('entry') + builder = IRBuilder(entry_block) + + genptrty = self.context.get_value_type(self.gentype) + genptr = builder.bitcast(function.args[0], genptrty) + self.lower_finalize_func_body(builder, genptr) + + def return_from_generator(self, lower): + """ + Emit a StopIteration at generator end and mark the generator exhausted. + """ + indexval = Constant(self.resume_index_ptr.type.pointee, -1) + lower.builder.store(indexval, self.resume_index_ptr) + self.call_conv.return_stop_iteration(lower.builder) + + def create_resumption_block(self, lower, index): + block_name = "generator_resume%d" % (index,) + block = lower.function.append_basic_block(block_name) + lower.builder.position_at_end(block) + self.resume_blocks[index] = block + + def debug_print(self, builder, msg): + if config.DEBUG_JIT: + self.context.debug_print(builder, "DEBUGJIT: {0}".format(msg)) + +class GeneratorLower(BaseGeneratorLower): + """ + Support class for lowering nopython generators. + """ + + def get_generator_type(self): + return self.fndesc.restype + + def box_generator_struct(self, lower, gen_struct): + return gen_struct + + def lower_finalize_func_body(self, builder, genptr): + """ + Lower the body of the generator's finalizer: decref all live + state variables. 
class PyGeneratorLower(BaseGeneratorLower):
    """
    Generator lowering support for object mode, where every argument,
    state variable and yielded value is typed as ``pyobject``.
    """

    def get_generator_type(self):
        """
        Build the concrete generator type.

        The generator function's declared return type is only "pyobject",
        so the full type is assembled here from the function IR and the
        number of generator state variables.
        """
        pyobj = types.pyobject
        return types.Generator(
            gen_func=self.func_ir.func_id.func,
            yield_type=pyobj,
            arg_types=(pyobj,) * self.func_ir.arg_count,
            state_types=(pyobj,) * len(self.geninfo.state_vars),
            has_finalizer=True,
        )

    def box_generator_struct(self, lower, gen_struct):
        """
        Wrap the raw *gen_struct* value into a Python object and return it.
        """
        # The boxing helper takes a pointer, so spill the struct first.
        struct_ptr = cgutils.alloca_once_value(lower.builder, gen_struct)
        return lower.pyapi.from_native_generator(
            struct_ptr, self.gentype, lower.envarg)

    def init_generator_state(self, lower):
        """
        Store NULL in every generator state variable so that cleanup
        does not decref garbage.
        """
        state_ptr = self.gen_state_ptr
        null_state = Constant(state_ptr.type.pointee, None)
        lower.builder.store(null_state, state_ptr)

    def lower_finalize_func_body(self, builder, genptr):
        """
        Emit the finalizer body: decref every state variable when the
        generator was suspended at a yield point, then return void.
        """
        pyapi = self.context.get_python_api(builder)
        index_ptr = self.get_resume_index_ptr(builder, genptr)
        resume_index = builder.load(index_ptr)
        # resume_index == 0: next() was never called.
        # resume_index == -1: the generator terminated cleanly.
        # Function arguments live in the state variables too, so they
        # need no separate cleanup step.
        zero = Constant(resume_index.type, 0)
        suspended = builder.icmp_signed('>', resume_index, zero)

        with cgutils.if_unlikely(builder, suspended):
            # Decref all live vars (some may be NULL)
            state_ptr = self.get_state_ptr(builder, genptr)
            for slot_index, slot_type in enumerate(self.gentype.state_types):
                slot = cgutils.gep_inbounds(builder, state_ptr, 0, slot_index)
                value = self.context.unpack_value(builder, slot_type, slot)
                pyapi.decref(value)

        builder.ret_void()
class Registry(object):
    """
    A named collection of lowering implementations: functions, attribute
    getters and setters, casts and constants.
    """

    def __init__(self, name='unspecified'):
        self.name = name
        # One independent list per kind of registered implementation.
        for kind in ('functions', 'getattrs', 'setattrs', 'casts',
                     'constants'):
            setattr(self, kind, [])

    def lower(self, func, *argtys):
        """
        Return a decorator registering an implementation of *func* for
        the argument types *argtys*.  *func* may be an actual global
        function object, or any pseudo-function supported by Numba, such
        as "getitem".

        The decorated implementation has the signature
        (context, builder, sig, args).
        """
        def register(impl):
            entry = (impl, func, argtys)
            self.functions.append(entry)
            return impl
        return register

    def _decorate_attr(self, impl, ty, attr, impl_list, decorator):
        # Wrap *impl* with *decorator*, record the wrapper in *impl_list*
        # and hand the undecorated implementation back to the caller.
        wrapped = decorator(impl, ty, attr)
        impl_list.append((wrapped, attr, wrapped.signature))
        return impl

    def lower_getattr(self, ty, attr):
        """
        Return a decorator registering an implementation of __getattr__
        for type *ty* and attribute *attr*.

        The decorated implementation will have the signature
        (context, builder, typ, val).
        """
        def register(impl):
            return self._decorate_attr(impl, ty, attr, self.getattrs,
                                       _decorate_getattr)
        return register

    def lower_getattr_generic(self, ty):
        """
        Return a decorator registering the fallback __getattr__
        implementation for type *ty*.

        The decorated implementation will have the signature
        (context, builder, typ, val, attr).  It is called for attributes
        which haven't been explicitly registered with lower_getattr().
        """
        # A fallback is stored as a regular getattr whose attribute is None.
        return self.lower_getattr(ty, None)

    def lower_setattr(self, ty, attr):
        """
        Return a decorator registering an implementation of __setattr__
        for type *ty* and attribute *attr*.

        The decorated implementation will have the signature
        (context, builder, sig, args).
        """
        def register(impl):
            return self._decorate_attr(impl, ty, attr, self.setattrs,
                                       _decorate_setattr)
        return register

    def lower_setattr_generic(self, ty):
        """
        Return a decorator registering the fallback __setattr__
        implementation for type *ty*.

        The decorated implementation will have the signature
        (context, builder, sig, args, attr).  It is called for attributes
        which haven't been explicitly registered with lower_setattr().
        """
        # A fallback is stored as a regular setattr whose attribute is None.
        return self.lower_setattr(ty, None)

    def lower_cast(self, fromty, toty):
        """
        Return a decorator registering the implicit conversion from
        *fromty* to *toty*.

        The decorated implementation will have the signature
        (context, builder, fromty, toty, val).
        """
        def register(impl):
            entry = (impl, (fromty, toty))
            self.casts.append(entry)
            return impl
        return register

    def lower_constant(self, ty):
        """
        Return a decorator registering the implementation that creates a
        constant of type *ty*.

        The decorated implementation will have the signature
        (context, builder, ty, pyval).
        """
        def register(impl):
            entry = (impl, (ty,))
            self.constants.append(entry)
            return impl
        return register

    def __repr__(self):
        return "Lowering Registry<{}>".format(self.name)
+ """ + registry_items = ('functions', 'getattrs', 'setattrs', 'casts', 'constants') + + +# Global registry for implementations of builtin operations +# (functions, attributes, type casts) +builtin_registry = Registry('builtin_registry') + +lower_builtin = builtin_registry.lower +lower_getattr = builtin_registry.lower_getattr +lower_getattr_generic = builtin_registry.lower_getattr_generic +lower_setattr = builtin_registry.lower_setattr +lower_setattr_generic = builtin_registry.lower_setattr_generic +lower_cast = builtin_registry.lower_cast +lower_constant = builtin_registry.lower_constant + + +def _decorate_getattr(impl, ty, attr): + real_impl = impl + + if attr is not None: + def res(context, builder, typ, value, attr): + return real_impl(context, builder, typ, value) + else: + def res(context, builder, typ, value, attr): + return real_impl(context, builder, typ, value, attr) + + res.signature = (ty,) + res.attr = attr + return res + +def _decorate_setattr(impl, ty, attr): + real_impl = impl + + if attr is not None: + def res(context, builder, sig, args, attr): + return real_impl(context, builder, sig, args) + else: + def res(context, builder, sig, args, attr): + return real_impl(context, builder, sig, args, attr) + + res.signature = (ty, types.Any) + res.attr = attr + return res + + +def fix_returning_optional(context, builder, sig, status, retval): + # Reconstruct optional return type + if isinstance(sig.return_type, types.Optional): + value_type = sig.return_type.type + optional_none = context.make_optional_none(builder, value_type) + retvalptr = cgutils.alloca_once_value(builder, optional_none) + with builder.if_then(builder.not_(status.is_none)): + optional_value = context.make_optional_value( + builder, value_type, retval, + ) + builder.store(optional_value, retvalptr) + retval = builder.load(retvalptr) + return retval + +def user_function(fndesc, libs): + """ + A wrapper inserting code calling Numba-compiled *fndesc*. 
def user_generator(gendesc, libs):
    """
    Build a lowering wrapper that emits a call to the Numba-compiled
    generator described by *gendesc*.

    The returned ``imp(context, builder, sig, args)`` declares the
    function in the builder's module, calls it through the context's
    calling convention and returns the raw ``(status, retval)`` pair.
    *libs* is stored on the wrapper as the tuple ``imp.libs``.
    """
    linked_libs = tuple(libs)

    def imp(context, builder, sig, args):
        callee = context.declare_function(builder.module, gendesc)
        # env=None assumes this is a nopython function
        status, retval = context.call_conv.call_function(
            builder, callee, gendesc.restype, gendesc.argtypes, args)
        # The status is returned unprocessed so the caller can handle
        # StopIteration itself.
        return status, retval

    imp.libs = linked_libs
    return imp
+ """ + + def wrapper(cls): + # These are unbound methods + iternext = cls.iternext + + @iternext_impl(RefType.BORROWED) + def iternext_wrapper(context, builder, sig, args, result): + (value,) = args + iterobj = cls(context, builder, value) + return iternext(iterobj, context, builder, result) + + lower_builtin('iternext', iterator_type)(iternext_wrapper) + return cls + + return wrapper + + +class _IternextResult(object): + """ + A result wrapper for iteration, passed by iternext_impl() into the + wrapped function. + """ + __slots__ = ('_context', '_builder', '_pairobj') + + def __init__(self, context, builder, pairobj): + self._context = context + self._builder = builder + self._pairobj = pairobj + + def set_exhausted(self): + """ + Mark the iterator as exhausted. + """ + self._pairobj.second = self._context.get_constant(types.boolean, False) + + def set_valid(self, is_valid=True): + """ + Mark the iterator as valid according to *is_valid* (which must + be either a Python boolean or a LLVM inst). + """ + if is_valid in (False, True): + is_valid = self._context.get_constant(types.boolean, is_valid) + self._pairobj.second = is_valid + + def yield_(self, value): + """ + Mark the iterator as yielding the given *value* (a LLVM inst). + """ + self._pairobj.first = value + + def is_valid(self): + """ + Return whether the iterator is marked valid. + """ + return self._context.get_argument_value(self._builder, + types.boolean, + self._pairobj.second) + + def yielded_value(self): + """ + Return the iterator's yielded value, if any. + """ + return self._pairobj.first + +class RefType(Enum): + """ + Enumerate the reference type + """ + """ + A new reference + """ + NEW = 1 + """ + A borrowed reference + """ + BORROWED = 2 + """ + An untracked reference + """ + UNTRACKED = 3 + +def iternext_impl(ref_type=None): + """ + Wrap the given iternext() implementation so that it gets passed + an _IternextResult() object easing the returning of the iternext() + result pair. 
def call_getiter(context, builder, iterable_type, val):
    """
    Emit a call to the `getiter()` implementation registered for
    *iterable_type* on the value *val* and return the resulting LLVM inst.
    """
    # getiter maps the iterable type to its declared iterator type.
    sig = typing.signature(iterable_type.iterator_type, iterable_type)
    impl = context.get_function('getiter', sig)
    return impl(builder, (val,))
@contextlib.contextmanager
def for_iter(context, builder, iterable_type, val):
    """
    Simulate a for loop on the given iterable.  Yields a namedtuple with
    the given members:
    - `value` is the value being yielded
    - `do_break` is a callable to early out of the loop

    This is a code-generation helper: whatever the caller emits inside
    the ``with`` body is placed in the loop's body block, after the
    check that the iterator is still valid.  When the ``with`` body
    finishes, a branch back to the start of the body block is emitted and
    the builder is left positioned at the loop's end block.
    """
    iterator_type = iterable_type.iterator_type
    # Obtain the iterator once, before entering the loop.
    iterval = call_getiter(context, builder, iterable_type, val)

    bb_body = builder.append_basic_block('for_iter.body')
    bb_end = builder.append_basic_block('for_iter.end')

    def do_break():
        # Early exit: jump straight to the end block.
        builder.branch(bb_end)

    # Fall into the loop from the current block.
    builder.branch(bb_body)

    with builder.goto_block(bb_body):
        res = call_iternext(context, builder, iterator_type, iterval)
        # Leave the loop as soon as the iterator reports it is not valid.
        with builder.if_then(builder.not_(res.is_valid()), likely=False):
            builder.branch(bb_end)
        # The caller's loop body is emitted here.
        yield _ForIterLoop(res.yielded_value(), do_break)
        # Back edge: run the next iteration.
        builder.branch(bb_body)

    builder.position_at_end(bb_end)
    # Release the iterator obtained above once the loop is over.
    if context.enable_nrt:
        context.nrt.decref(builder, iterator_type, iterval)
@contextlib.contextmanager
def force_error_model(context, model_name='numpy'):
    """
    Context manager that swaps *context*'s error model for the one named
    *model_name* and restores the previous model on exit, including when
    the body raises.
    """
    from numba.core import callconv

    saved_model = context.error_model
    temporary_model = callconv.create_error_model(model_name, context)
    context.error_model = temporary_model
    try:
        yield
    finally:
        context.error_model = saved_model
compute_live_variables) +from numba.core import postproc +from numba.np.unsafe.ndarray import empty_inferred as unsafe_empty_inferred +import numpy as np +import operator +import numba.misc.special + +""" +Variable enable_inline_arraycall is only used for testing purpose. +""" +enable_inline_arraycall = True + + +def callee_ir_validator(func_ir): + """Checks the IR of a callee is supported for inlining + """ + for blk in func_ir.blocks.values(): + for stmt in blk.find_insts(ir.Assign): + if isinstance(stmt.value, ir.Yield): + msg = "The use of yield in a closure is unsupported." + raise errors.UnsupportedError(msg, loc=stmt.loc) + + +def _created_inlined_var_name(function_name, var_name): + """Creates a name for an inlined variable based on the function name and the + variable name. It does this "safely" to avoid the use of characters that are + illegal in python variable names as there are occasions when function + generation needs valid python name tokens.""" + inlined_name = f'{function_name}.{var_name}' + # Replace angle brackets, e.g. "" is replaced with "_locals_" + new_name = inlined_name.replace('<', '_').replace('>', '_') + # The version "version" of the closure function e.g. foo$2 (id 2) is + # rewritten as "foo_v2". Further "." is also replaced with "_". + new_name = new_name.replace('.', '_').replace('$', '_v') + return new_name + + +class InlineClosureCallPass(object): + """InlineClosureCallPass class looks for direct calls to locally defined + closures, and inlines the body of the closure function to the call site. + """ + + def __init__(self, func_ir, parallel_options, swapped={}, typed=False): + self.func_ir = func_ir + self.parallel_options = parallel_options + self.swapped = swapped + self._processed_stencils = [] + self.typed = typed + + def run(self): + """Run inline closure call pass. 
+ """ + # Analysis relies on ir.Del presence, strip out later + pp = postproc.PostProcessor(self.func_ir) + pp.run(True) + + modified = False + work_list = list(self.func_ir.blocks.items()) + debug_print = _make_debug_print("InlineClosureCallPass") + debug_print("START") + while work_list: + label, block = work_list.pop() + for i, instr in enumerate(block.body): + if isinstance(instr, ir.Assign): + lhs = instr.target + expr = instr.value + if isinstance(expr, ir.Expr) and expr.op == 'call': + call_name = guard(find_callname, self.func_ir, expr) + func_def = guard(get_definition, self.func_ir, expr.func) + + if guard(self._inline_reduction, + work_list, block, i, expr, call_name): + modified = True + break # because block structure changed + + if guard(self._inline_closure, + work_list, block, i, func_def): + modified = True + break # because block structure changed + + if guard(self._inline_stencil, + instr, call_name, func_def): + modified = True + + if enable_inline_arraycall: + # Identify loop structure + if modified: + # Need to do some cleanups if closure inlining kicked in + merge_adjacent_blocks(self.func_ir.blocks) + cfg = compute_cfg_from_blocks(self.func_ir.blocks) + debug_print("start inline arraycall") + _debug_dump(cfg) + loops = cfg.loops() + sized_loops = [(k, len(loops[k].body)) for k in loops.keys()] + visited = [] + # We go over all loops, bigger loops first (outer first) + for k, s in sorted(sized_loops, key=lambda tup: tup[1], reverse=True): + visited.append(k) + if guard(_inline_arraycall, self.func_ir, cfg, visited, loops[k], + self.swapped, self.parallel_options.comprehension, + self.typed): + modified = True + if modified: + _fix_nested_array(self.func_ir) + + if modified: + # clean up now dead/unreachable blocks, e.g. 
def _inline_reduction(self, work_list, block, i, expr, call_name):
    """
    Inline a call to ``reduce`` found at statement index *i* of *block*.

    *expr* is the call expression and *call_name* the resolved
    (name, module) pair of the callee.  Returns True once the call has
    been replaced by the inlined body of the local ``reduce_func``.

    Raises TypeError when ``reduce`` is called with a number of
    positional arguments other than 2 or 3.

    NOTE(review): the ``require`` checks presumably abandon the attempt
    when their condition is false (run() invokes this method through
    ``guard``) -- confirm against ir_utils.
    """
    # only inline reduction in sequential execution, parallel handling
    # is done in ParforPass.
    require(not self.parallel_options.reduction)
    require(call_name == ('reduce', 'builtins') or
            call_name == ('reduce', '_functools'))
    if len(expr.args) not in (2, 3):
        raise TypeError("invalid reduce call, "
                        "two arguments are required (optional initial "
                        "value can also be specified)")
    # Validate the function handed to reduce before inlining anything.
    check_reduce_func(self.func_ir, expr.args[0])
    # Pure-Python reduce that is inlined in place of the call.  It is
    # passed to inline_closure_call as the callee, so its body must be
    # kept exactly as written.
    def reduce_func(f, A, v=None):
        it = iter(A)
        if v is not None:
            s = v
        else:
            s = next(it)
        for a in it:
            s = f(s, a)
        return s
    inline_closure_call(self.func_ir,
                        self.func_ir.func_id.func.__globals__,
                        block, i, reduce_func, work_list=work_list,
                        callee_validator=callee_ir_validator)
    return True
+ if (isinstance(func_def, ir.Global) and + func_def.name == 'stencil' and + isinstance(func_def.value, StencilFunc)): + if expr.kws: + expr.kws += func_def.value.kws + else: + expr.kws = func_def.value.kws + return True + # Otherwise we proceed to check if it is a call to numba.stencil + require(call_name == ('stencil', 'numba.stencils.stencil') or + call_name == ('stencil', 'numba')) + require(expr not in self._processed_stencils) + self._processed_stencils.append(expr) + if not len(expr.args) == 1: + raise ValueError("As a minimum Stencil requires" + " a kernel as an argument") + stencil_def = guard(get_definition, self.func_ir, expr.args[0]) + require(isinstance(stencil_def, ir.Expr) and + stencil_def.op == "make_function") + kernel_ir = get_ir_of_code(self.func_ir.func_id.func.__globals__, + stencil_def.code) + options = dict(expr.kws) + if 'neighborhood' in options: + fixed = guard(self._fix_stencil_neighborhood, options) + if not fixed: + raise ValueError("stencil neighborhood option should be a tuple" + " with constant structure such as ((-w, w),)") + if 'index_offsets' in options: + fixed = guard(self._fix_stencil_index_offsets, options) + if not fixed: + raise ValueError("stencil index_offsets option should be a tuple" + " with constant structure such as (offset, )") + sf = StencilFunc(kernel_ir, 'constant', options) + sf.kws = expr.kws # hack to keep variables live + sf_global = ir.Global('stencil', sf, expr.loc) + self.func_ir._definitions[lhs.name] = [sf_global] + instr.value = sf_global + return True + + def _fix_stencil_neighborhood(self, options): + """ + Extract the two-level tuple representing the stencil neighborhood + from the program IR to provide a tuple to StencilFunc. 
def check_reduce_func(func_ir, func_var):
    """Check that the function held by *func_var* in *func_ir* is amenable
    to inlining and return it.

    Parameters
    ----------
    func_ir
        The IR in which *func_var* is defined.
    func_var
        The IR variable holding the reduction function.

    Returns
    -------
    The reduction function.  When the definition is a free variable or a
    global wrapping a CPUDispatcher, the dispatcher's underlying Python
    function is returned; otherwise the definition itself is returned.

    Raises
    ------
    ValueError
        If no definition can be found for *func_var*, or the definition
        is neither a CPUDispatcher nor an object exposing ``code`` or
        ``__code__``.
    TypeError
        If the function does not take exactly 2 arguments.
    """
    reduce_func = guard(get_definition, func_ir, func_var)
    if reduce_func is None:
        # Adjacent literals are used instead of a backslash continuation
        # inside the string, which would embed the next source line's
        # indentation in the message.
        raise ValueError("Reduce function cannot be found for njit "
                         "analysis")
    if isinstance(reduce_func, (ir.FreeVar, ir.Global)):
        if not isinstance(reduce_func.value,
                          numba.core.registry.CPUDispatcher):
            raise ValueError("Invalid reduction function")
        # pull out the python function for inlining
        reduce_func = reduce_func.value.py_func
    elif not (hasattr(reduce_func, 'code')
              or hasattr(reduce_func, '__code__')):
        raise ValueError("Invalid reduction function")
    # An object may expose its code object as `code` or as `__code__`;
    # `code` wins when both are present.
    f_code = (reduce_func.code if hasattr(reduce_func, 'code')
              else reduce_func.__code__)
    if f_code.co_argcount != 2:
        raise TypeError("Reduction function should take 2 arguments")
    return reduce_func
+ """ + def check(arg, name): + if arg is None: + raise TypeError("{} must not be None".format(name)) + + from numba.core.compiler import DefaultPassBuilder + + # check the stuff needed to run the more advanced compilation pipeline + # is valid if any of it is provided + compiler_args = (targetctx, locals, pipeline, flags) + compiler_group = [x is not None for x in compiler_args] + if any(compiler_group) and not all(compiler_group): + check(targetctx, 'targetctx') + check(locals, 'locals') + check(pipeline, 'pipeline') + check(flags, 'flags') + elif all(compiler_group): + check(typingctx, 'typingctx') + + self._compiler_pipeline = DefaultPassBuilder.define_untyped_pipeline + + self.typingctx = typingctx + self.targetctx = targetctx + self.locals = locals + self.pipeline = pipeline + self.flags = flags + self.validator = validator + self.debug_print = _make_debug_print("InlineWorker") + + # check whether this inliner can also support typemap and calltypes + # update and if what's provided is valid + pair = (typemap, calltypes) + pair_is_none = [x is None for x in pair] + if any(pair_is_none) and not all(pair_is_none): + msg = ("typemap and calltypes must both be either None or have a " + "value, got: %s, %s") + raise TypeError(msg % pair) + self._permit_update_type_and_call_maps = not all(pair_is_none) + self.typemap = typemap + self.calltypes = calltypes + + + def inline_ir(self, caller_ir, block, i, callee_ir, callee_freevars, + arg_typs=None): + """ Inlines the callee_ir in the caller_ir at statement index i of block + `block`, callee_freevars are the free variables for the callee_ir. If + the callee_ir is derived from a function `func` then this is + `func.__code__.co_freevars`. If `arg_typs` is given and the InlineWorker + instance was initialized with a typemap and calltypes then they will be + appropriately updated based on the arg_typs. 
+ """ + + # Always copy the callee IR, it gets mutated + def copy_ir(the_ir): + kernel_copy = the_ir.copy() + kernel_copy.blocks = {} + for block_label, block in the_ir.blocks.items(): + new_block = copy.deepcopy(the_ir.blocks[block_label]) + new_block.body = [] + for stmt in the_ir.blocks[block_label].body: + scopy = copy.deepcopy(stmt) + new_block.body.append(scopy) + kernel_copy.blocks[block_label] = new_block + return kernel_copy + + callee_ir = copy_ir(callee_ir) + + # check that the contents of the callee IR is something that can be + # inlined if a validator is present + if self.validator is not None: + self.validator(callee_ir) + + # save an unmutated copy of the callee_ir to return + callee_ir_original = copy_ir(callee_ir) + scope = block.scope + instr = block.body[i] + call_expr = instr.value + callee_blocks = callee_ir.blocks + + # 1. relabel callee_ir by adding an offset + max_label = max(ir_utils._the_max_label.next(), max(caller_ir.blocks.keys())) + callee_blocks = add_offset_to_labels(callee_blocks, max_label + 1) + callee_blocks = simplify_CFG(callee_blocks) + callee_ir.blocks = callee_blocks + min_label = min(callee_blocks.keys()) + max_label = max(callee_blocks.keys()) + # reset globals in ir_utils before we use it + ir_utils._the_max_label.update(max_label) + self.debug_print("After relabel") + _debug_dump(callee_ir) + + # 2. 
rename all local variables in callee_ir with new locals created in + # caller_ir + callee_scopes = _get_all_scopes(callee_blocks) + self.debug_print("callee_scopes = ", callee_scopes) + # one function should only have one local scope + assert(len(callee_scopes) == 1) + callee_scope = callee_scopes[0] + var_dict = {} + for var in tuple(callee_scope.localvars._con.values()): + if not (var.name in callee_freevars): + inlined_name = _created_inlined_var_name( + callee_ir.func_id.unique_name, var.name) + # Update the caller scope with the new names + new_var = scope.redefine(inlined_name, loc=var.loc) + # Also update the callee scope with the new names. Should the + # type and call maps need updating (which requires SSA form) the + # transformation to SSA is valid as the IR object is internally + # consistent. + callee_scope.redefine(inlined_name, loc=var.loc) + var_dict[var.name] = new_var + self.debug_print("var_dict = ", var_dict) + replace_vars(callee_blocks, var_dict) + self.debug_print("After local var rename") + _debug_dump(callee_ir) + + # 3. replace formal parameters with actual arguments + callee_func = callee_ir.func_id.func + args = _get_callee_args(call_expr, callee_func, block.body[i].loc, + caller_ir) + + # 4. Update typemap + if self._permit_update_type_and_call_maps: + if arg_typs is None: + raise TypeError('arg_typs should have a value not None') + self.update_type_and_call_maps(callee_ir, arg_typs) + # update_type_and_call_maps replaces blocks + callee_blocks = callee_ir.blocks + + self.debug_print("After arguments rename: ") + _debug_dump(callee_ir) + + _replace_args_with(callee_blocks, args) + # 5. split caller blocks into two + new_blocks = [] + new_block = ir.Block(scope, block.loc) + new_block.body = block.body[i + 1:] + new_label = next_label() + caller_ir.blocks[new_label] = new_block + new_blocks.append((new_label, new_block)) + block.body = block.body[:i] + block.body.append(ir.Jump(min_label, instr.loc)) + + # 6. 
def inline_function(self, caller_ir, block, i, function, arg_typs=None):
    """Inline the Python function `function` at statement `i` of `block`.

    The callee's IR is produced with `self.run_untyped_passes(function)` and
    the actual splicing is delegated to `self.inline_ir`, whose result is
    returned unchanged.

    Parameters
    ----------
    caller_ir : IR of the calling function; forwarded to `inline_ir`.
    block : the caller block that contains the call site.
    i : index of the call statement inside `block.body`.
    function : Python function object to inline; its `__code__.co_freevars`
        are passed along as the callee's free-variable names.
    arg_typs : optional argument types at the call site. If given and the
        InlineWorker instance was initialized with a typemap and calltypes,
        they will be updated based on these types.
    """
    callee_ir = self.run_untyped_passes(function)
    freevars = function.__code__.co_freevars
    return self.inline_ir(caller_ir, block, i, callee_ir, freevars,
                          arg_typs=arg_typs)

def run_untyped_passes(self, func, enable_ssa=False):
    """
    Run the compiler frontend's untyped passes over the given Python
    function, and return the function's canonical Numba IR.

    Disable SSA transformation by default, since the call site won't be in SSA
    form and self.inline_ir depends on this being the case.

    A fresh `StateDict` is populated from this worker's contexts, locals,
    pipeline and flags, the bytecode is extracted, and the pipeline returned
    by `self._compiler_pipeline(state)` is finalized and run. The resulting
    `state.func_ir` is returned.
    """
    # NOTE(review): WithLifting is imported here but not referenced in this
    # method body.
    from numba.core.compiler import StateDict, _CompileStatus
    from numba.core.untyped_passes import ExtractByteCode, WithLifting
    from numba.core import bytecode
    from numba.parfors.parfor import ParforDiagnostics
    state = StateDict()
    state.func_ir = None
    state.typingctx = self.typingctx
    state.targetctx = self.targetctx
    state.locals = self.locals
    state.pipeline = self.pipeline
    # state.flags is bound to the very object held in self.flags, so the
    # enable_ssa assignment below is also visible through self.flags.
    state.flags = self.flags
    state.flags.enable_ssa = enable_ssa

    state.func_id = bytecode.FunctionIdentity.from_function(func)

    # No typing information exists yet: only untyped passes are run.
    state.typemap = None
    state.calltypes = None
    state.type_annotation = None
    state.status = _CompileStatus(False)
    state.return_type = None
    state.parfor_diagnostics = ParforDiagnostics()
    state.metadata = {}

    ExtractByteCode().run_pass(state)
    # This is a lie, just need *some* args for the case where an obj mode
    # with lift is needed
    state.args = len(state.bc.func_id.pysig.parameters) * (types.pyobject,)

    pm = self._compiler_pipeline(state)

    pm.finalize()
    pm.run(state)
    return state.func_ir

def update_type_and_call_maps(self, callee_ir, arg_typs):
    """Update the worker's type and call maps for a call to `callee_ir`.

    The callee IR is branch-pruned for `arg_typs`, converted to SSA, type
    inferred, stripped of phi nodes again and passed through
    `canonicalize_array_math`. The inferred typemap (minus the `arg.*`
    entries) and calltypes are then merged into `self.typemap` and
    `self.calltypes`.

    Raises
    ------
    ValueError
        If `self._permit_update_type_and_call_maps` is false.
    """
    from numba.core.ssa import reconstruct_ssa
    from numba.core.typed_passes import PreLowerStripPhis

    if not self._permit_update_type_and_call_maps:
        msg = ("InlineWorker instance not configured correctly, typemap or "
               "calltypes missing in initialization.")
        raise ValueError(msg)
    from numba.core import typed_passes
    # call branch pruning to simplify IR and avoid inference errors
    callee_ir._definitions = ir_utils.build_definitions(callee_ir.blocks)
    numba.core.analysis.dead_branch_prune(callee_ir, arg_typs)
    # callee's typing may require SSA
    callee_ir = reconstruct_ssa(callee_ir)
    # definitions are rebuilt after every step that rewrites the blocks
    callee_ir._definitions = ir_utils.build_definitions(callee_ir.blocks)
    f_typemap, f_return_type, f_calltypes, _ = typed_passes.type_inference_stage(
        self.typingctx, self.targetctx, callee_ir, arg_typs, None)
    callee_ir = PreLowerStripPhis()._strip_phi_nodes(callee_ir)
    callee_ir._definitions = ir_utils.build_definitions(callee_ir.blocks)
    canonicalize_array_math(callee_ir, f_typemap,
                            f_calltypes, self.typingctx)
    # remove argument entries like arg.a from typemap
    arg_names = [vname for vname in f_typemap if vname.startswith("arg.")]
    for a in arg_names:
        f_typemap.pop(a)
    self.typemap.update(f_typemap)
    self.calltypes.update(f_calltypes)
+ """ + scope = block.scope + instr = block.body[i] + call_expr = instr.value + debug_print = _make_debug_print("inline_closure_call") + debug_print("Found closure call: ", instr, " with callee = ", callee) + # support both function object and make_function Expr + callee_code = callee.code if hasattr(callee, 'code') else callee.__code__ + callee_closure = callee.closure if hasattr(callee, 'closure') else callee.__closure__ + # first, get the IR of the callee + if isinstance(callee, pytypes.FunctionType): + from numba.core import compiler + callee_ir = compiler.run_frontend(callee, inline_closures=True) + else: + callee_ir = get_ir_of_code(glbls, callee_code) + + # check that the contents of the callee IR is something that can be inlined + # if a validator is supplied + if callee_validator is not None: + callee_validator(callee_ir) + + callee_blocks = callee_ir.blocks + + # 1. relabel callee_ir by adding an offset + max_label = max(ir_utils._the_max_label.next(), max(func_ir.blocks.keys())) + callee_blocks = add_offset_to_labels(callee_blocks, max_label + 1) + callee_blocks = simplify_CFG(callee_blocks) + callee_ir.blocks = callee_blocks + min_label = min(callee_blocks.keys()) + max_label = max(callee_blocks.keys()) + # reset globals in ir_utils before we use it + ir_utils._the_max_label.update(max_label) + debug_print("After relabel") + _debug_dump(callee_ir) + + # 2. 
rename all local variables in callee_ir with new locals created in func_ir + callee_scopes = _get_all_scopes(callee_blocks) + debug_print("callee_scopes = ", callee_scopes) + # one function should only have one local scope + assert(len(callee_scopes) == 1) + callee_scope = callee_scopes[0] + var_dict = {} + for var in callee_scope.localvars._con.values(): + if not (var.name in callee_code.co_freevars): + inlined_name = _created_inlined_var_name( + callee_ir.func_id.unique_name, var.name) + new_var = scope.redefine(inlined_name, loc=var.loc) + var_dict[var.name] = new_var + debug_print("var_dict = ", var_dict) + replace_vars(callee_blocks, var_dict) + debug_print("After local var rename") + _debug_dump(callee_ir) + + # 3. replace formal parameters with actual arguments + args = _get_callee_args(call_expr, callee, block.body[i].loc, func_ir) + + debug_print("After arguments rename: ") + _debug_dump(callee_ir) + + # 4. replace freevar with actual closure var + if callee_closure and replace_freevars: + closure = func_ir.get_definition(callee_closure) + debug_print("callee's closure = ", closure) + if isinstance(closure, tuple): + cellget = ctypes.pythonapi.PyCell_Get + cellget.restype = ctypes.py_object + cellget.argtypes = (ctypes.py_object,) + items = tuple(cellget(x) for x in closure) + else: + assert(isinstance(closure, ir.Expr) + and closure.op == 'build_tuple') + items = closure.items + assert(len(callee_code.co_freevars) == len(items)) + _replace_freevars(callee_blocks, items) + debug_print("After closure rename") + _debug_dump(callee_ir) + + if typingctx: + from numba.core import typed_passes + # call branch pruning to simplify IR and avoid inference errors + callee_ir._definitions = ir_utils.build_definitions(callee_ir.blocks) + numba.core.analysis.dead_branch_prune(callee_ir, arg_typs) + try: + f_typemap, f_return_type, f_calltypes, _ = typed_passes.type_inference_stage( + typingctx, targetctx, callee_ir, arg_typs, None) + except Exception as e: + f_typemap, 
f_return_type, f_calltypes, _ = typed_passes.type_inference_stage( + typingctx, targetctx, callee_ir, arg_typs, None) + pass + canonicalize_array_math(callee_ir, f_typemap, + f_calltypes, typingctx) + # remove argument entries like arg.a from typemap + arg_names = [vname for vname in f_typemap if vname.startswith("arg.")] + for a in arg_names: + f_typemap.pop(a) + typemap.update(f_typemap) + calltypes.update(f_calltypes) + + _replace_args_with(callee_blocks, args) + # 5. split caller blocks into two + new_blocks = [] + new_block = ir.Block(scope, block.loc) + new_block.body = block.body[i + 1:] + new_label = next_label() + func_ir.blocks[new_label] = new_block + new_blocks.append((new_label, new_block)) + block.body = block.body[:i] + block.body.append(ir.Jump(min_label, instr.loc)) + + # 6. replace Return with assignment to LHS + topo_order = find_topo_order(callee_blocks) + _replace_returns(callee_blocks, instr.target, new_label) + + # remove the old definition of instr.target too + if (instr.target.name in func_ir._definitions + and call_expr in func_ir._definitions[instr.target.name]): + # NOTE: target can have multiple definitions due to control flow + func_ir._definitions[instr.target.name].remove(call_expr) + + # 7. insert all new blocks, and add back definitions + for label in topo_order: + # block scope must point to parent's + block = callee_blocks[label] + block.scope = scope + _add_definitions(func_ir, block) + func_ir.blocks[label] = block + new_blocks.append((label, block)) + debug_print("After merge in") + _debug_dump(func_ir) + + if work_list is not None: + for block in new_blocks: + work_list.append(block) + return callee_blocks, var_dict + + +def _get_callee_args(call_expr, callee, loc, func_ir): + """Get arguments for calling 'callee', including the default arguments. + keyword arguments are currently only handled when 'callee' is a function. 
+ """ + if call_expr.op == 'call': + args = list(call_expr.args) + if call_expr.vararg: + msg = "Calling a closure with *args is unsupported." + raise errors.UnsupportedError(msg, call_expr.loc) + elif call_expr.op == 'getattr': + args = [call_expr.value] + elif ir_utils.is_operator_or_getitem(call_expr): + args = call_expr.list_vars() + else: + raise TypeError("Unsupported ir.Expr.{}".format(call_expr.op)) + + debug_print = _make_debug_print("inline_closure_call default handling") + + # handle defaults and kw arguments using pysignature if callee is function + if isinstance(callee, pytypes.FunctionType): + pysig = numba.core.utils.pysignature(callee) + normal_handler = lambda index, param, default: default + default_handler = lambda index, param, default: ir.Const(default, loc) + # Throw error for stararg + # TODO: handle stararg + def stararg_handler(index, param, default): + raise NotImplementedError( + "Stararg not supported in inliner for arg {} {}".format( + index, param)) + if call_expr.op == 'call': + kws = dict(call_expr.kws) + else: + kws = {} + return numba.core.typing.fold_arguments( + pysig, args, kws, normal_handler, default_handler, + stararg_handler) + else: + # TODO: handle arguments for make_function case similar to function + # case above + callee_defaults = (callee.defaults if hasattr(callee, 'defaults') + else callee.__defaults__) + if callee_defaults: + debug_print("defaults = ", callee_defaults) + if isinstance(callee_defaults, tuple): # Python 3.5 + defaults_list = [] + for x in callee_defaults: + if isinstance(x, ir.Var): + defaults_list.append(x) + else: + # this branch is predominantly for kwargs from + # inlinable functions + defaults_list.append(ir.Const(value=x, loc=loc)) + args = args + defaults_list + elif (isinstance(callee_defaults, ir.Var) + or isinstance(callee_defaults, str)): + default_tuple = func_ir.get_definition(callee_defaults) + assert(isinstance(default_tuple, ir.Expr)) + assert(default_tuple.op == "build_tuple") + 
const_vals = [func_ir.get_definition(x) for + x in default_tuple.items] + args = args + const_vals + else: + raise NotImplementedError( + "Unsupported defaults to make_function: {}".format( + defaults)) + return args + + +def _make_debug_print(prefix): + def debug_print(*args): + if config.DEBUG_INLINE_CLOSURE: + print(prefix + ": " + "".join(str(x) for x in args)) + return debug_print + + +def _debug_dump(func_ir): + if config.DEBUG_INLINE_CLOSURE: + func_ir.dump() + + +def _get_all_scopes(blocks): + """Get all block-local scopes from an IR. + """ + all_scopes = [] + for label, block in blocks.items(): + if not (block.scope in all_scopes): + all_scopes.append(block.scope) + return all_scopes + + +def _replace_args_with(blocks, args): + """ + Replace ir.Arg(...) with real arguments from call site + """ + for label, block in blocks.items(): + assigns = block.find_insts(ir.Assign) + for stmt in assigns: + if isinstance(stmt.value, ir.Arg): + idx = stmt.value.index + assert(idx < len(args)) + stmt.value = args[idx] + + +def _replace_freevars(blocks, args): + """ + Replace ir.FreeVar(...) with real variables from parent function + """ + for label, block in blocks.items(): + assigns = block.find_insts(ir.Assign) + for stmt in assigns: + if isinstance(stmt.value, ir.FreeVar): + idx = stmt.value.index + assert(idx < len(args)) + if isinstance(args[idx], ir.Var): + stmt.value = args[idx] + else: + stmt.value = ir.Const(args[idx], stmt.loc) + + +def _replace_returns(blocks, target, return_label): + """ + Return return statement by assigning directly to target, and a jump. 
+ """ + for label, block in blocks.items(): + casts = [] + for i in range(len(block.body)): + stmt = block.body[i] + if isinstance(stmt, ir.Return): + assert(i + 1 == len(block.body)) + block.body[i] = ir.Assign(stmt.value, target, stmt.loc) + block.body.append(ir.Jump(return_label, stmt.loc)) + # remove cast of the returned value + for cast in casts: + if cast.target.name == stmt.value.name: + cast.value = cast.value.value + elif isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Expr) and stmt.value.op == 'cast': + casts.append(stmt) + +def _add_definitions(func_ir, block): + """ + Add variable definitions found in a block to parent func_ir. + """ + definitions = func_ir._definitions + assigns = block.find_insts(ir.Assign) + for stmt in assigns: + definitions[stmt.target.name].append(stmt.value) + +def _find_arraycall(func_ir, block): + """Look for statement like "x = numpy.array(y)" or "x[..] = y" + immediately after the closure call that creates list y (the i-th + statement in block). Return the statement index if found, or + raise GuardException. + """ + array_var = None + array_call_index = None + list_var_dead_after_array_call = False + list_var = None + + i = 0 + while i < len(block.body): + instr = block.body[i] + if isinstance(instr, ir.Del): + # Stop the process if list_var becomes dead + if list_var and array_var and instr.value == list_var.name: + list_var_dead_after_array_call = True + break + pass + elif isinstance(instr, ir.Assign): + # Found array_var = array(list_var) + lhs = instr.target + expr = instr.value + if (guard(find_callname, func_ir, expr) == ('array', 'numpy') and + isinstance(expr.args[0], ir.Var)): + list_var = expr.args[0] + array_var = lhs + array_stmt_index = i + array_kws = dict(expr.kws) + elif (isinstance(instr, ir.SetItem) and + isinstance(instr.value, ir.Var) and + not list_var): + list_var = instr.value + # Found array_var[..] 
= list_var, the case for nested array + array_var = instr.target + array_def = get_definition(func_ir, array_var) + require(guard(_find_unsafe_empty_inferred, func_ir, array_def)) + array_stmt_index = i + array_kws = {} + else: + # Bail out otherwise + break + i = i + 1 + # require array_var is found, and list_var is dead after array_call. + require(array_var and list_var_dead_after_array_call) + _make_debug_print("find_array_call")(block.body[array_stmt_index]) + return list_var, array_stmt_index, array_kws + + +def _find_iter_range(func_ir, range_iter_var, swapped): + """Find the iterator's actual range if it is either range(n), or range(m, n), + otherwise return raise GuardException. + """ + debug_print = _make_debug_print("find_iter_range") + range_iter_def = get_definition(func_ir, range_iter_var) + debug_print("range_iter_var = ", range_iter_var, " def = ", range_iter_def) + require(isinstance(range_iter_def, ir.Expr) and range_iter_def.op == 'getiter') + range_var = range_iter_def.value + range_def = get_definition(func_ir, range_var) + debug_print("range_var = ", range_var, " range_def = ", range_def) + require(isinstance(range_def, ir.Expr) and range_def.op == 'call') + func_var = range_def.func + func_def = get_definition(func_ir, func_var) + debug_print("func_var = ", func_var, " func_def = ", func_def) + require(isinstance(func_def, ir.Global) and + (func_def.value == range or func_def.value == numba.misc.special.prange)) + nargs = len(range_def.args) + swapping = [('"array comprehension"', 'closure of'), range_def.func.loc] + if nargs == 1: + swapped[range_def.func.name] = swapping + stop = get_definition(func_ir, range_def.args[0], lhs_only=True) + return (0, range_def.args[0], func_def) + elif nargs == 2: + swapped[range_def.func.name] = swapping + start = get_definition(func_ir, range_def.args[0], lhs_only=True) + stop = get_definition(func_ir, range_def.args[1], lhs_only=True) + return (start, stop, func_def) + else: + raise GuardException + +def 
_inline_arraycall(func_ir, cfg, visited, loop, swapped, enable_prange=False, + typed=False): + """Look for array(list) call in the exit block of a given loop, and turn list operations into + array operations in the loop if the following conditions are met: + 1. The exit block contains an array call on the list; + 2. The list variable is no longer live after array call; + 3. The list is created in the loop entry block; + 4. The loop is created from an range iterator whose length is known prior to the loop; + 5. There is only one list_append operation on the list variable in the loop body; + 6. The block that contains list_append dominates the loop head, which ensures list + length is the same as loop length; + If any condition check fails, no modification will be made to the incoming IR. + """ + debug_print = _make_debug_print("inline_arraycall") + # There should only be one loop exit + require(len(loop.exits) == 1) + exit_block = next(iter(loop.exits)) + list_var, array_call_index, array_kws = _find_arraycall(func_ir, func_ir.blocks[exit_block]) + + # check if dtype is present in array call + dtype_def = None + dtype_mod_def = None + if 'dtype' in array_kws: + require(isinstance(array_kws['dtype'], ir.Var)) + # We require that dtype argument to be a constant of getattr Expr, and we'll + # remember its definition for later use. 
+ dtype_def = get_definition(func_ir, array_kws['dtype']) + require(isinstance(dtype_def, ir.Expr) and dtype_def.op == 'getattr') + dtype_mod_def = get_definition(func_ir, dtype_def.value) + + list_var_def = get_definition(func_ir, list_var) + debug_print("list_var = ", list_var, " def = ", list_var_def) + if isinstance(list_var_def, ir.Expr) and list_var_def.op == 'cast': + list_var_def = get_definition(func_ir, list_var_def.value) + # Check if the definition is a build_list + require(isinstance(list_var_def, ir.Expr) and list_var_def.op == 'build_list') + # The build_list must be empty + require(len(list_var_def.items) == 0) + + # Look for list_append in "last" block in loop body, which should be a block that is + # a post-dominator of the loop header. + list_append_stmts = [] + for label in loop.body: + # We have to consider blocks of this loop, but not sub-loops. + # To achieve this, we require the set of "in_loops" of "label" to be visited loops. + in_visited_loops = [l.header in visited for l in cfg.in_loops(label)] + if not all(in_visited_loops): + continue + block = func_ir.blocks[label] + debug_print("check loop body block ", label) + for stmt in block.find_insts(ir.Assign): + lhs = stmt.target + expr = stmt.value + if isinstance(expr, ir.Expr) and expr.op == 'call': + func_def = get_definition(func_ir, expr.func) + if isinstance(func_def, ir.Expr) and func_def.op == 'getattr' \ + and func_def.attr == 'append': + list_def = get_definition(func_ir, func_def.value) + debug_print("list_def = ", list_def, list_def is list_var_def) + if list_def is list_var_def: + # found matching append call + list_append_stmts.append((label, block, stmt)) + + # Require only one list_append, otherwise we won't know the indices + require(len(list_append_stmts) == 1) + append_block_label, append_block, append_stmt = list_append_stmts[0] + + # Check if append_block (besides loop entry) dominates loop header. 
+ # Since CFG doesn't give us this info without loop entry, we approximate + # by checking if the predecessor set of the header block is the same + # as loop_entries plus append_block, which is certainly more restrictive + # than necessary, and can be relaxed if needed. + preds = set(l for l, b in cfg.predecessors(loop.header)) + debug_print("preds = ", preds, (loop.entries | set([append_block_label]))) + require(preds == (loop.entries | set([append_block_label]))) + + # Find iterator in loop header + iter_vars = [] + iter_first_vars = [] + loop_header = func_ir.blocks[loop.header] + for stmt in loop_header.find_insts(ir.Assign): + expr = stmt.value + if isinstance(expr, ir.Expr): + if expr.op == 'iternext': + iter_def = get_definition(func_ir, expr.value) + debug_print("iter_def = ", iter_def) + iter_vars.append(expr.value) + elif expr.op == 'pair_first': + iter_first_vars.append(stmt.target) + + # Require only one iterator in loop header + require(len(iter_vars) == 1 and len(iter_first_vars) == 1) + iter_var = iter_vars[0] # variable that holds the iterator object + iter_first_var = iter_first_vars[0] # variable that holds the value out of iterator + + # Final requirement: only one loop entry, and we're going to modify it by: + # 1. replacing the list definition with an array definition; + # 2. adding a counter for the array iteration. 
+ require(len(loop.entries) == 1) + loop_entry = func_ir.blocks[next(iter(loop.entries))] + terminator = loop_entry.terminator + scope = loop_entry.scope + loc = loop_entry.loc + stmts = [] + removed = [] + def is_removed(val, removed): + if isinstance(val, ir.Var): + for x in removed: + if x.name == val.name: + return True + return False + # Skip list construction and skip terminator, add the rest to stmts + for i in range(len(loop_entry.body) - 1): + stmt = loop_entry.body[i] + if isinstance(stmt, ir.Assign) and (stmt.value is list_def or is_removed(stmt.value, removed)): + removed.append(stmt.target) + else: + stmts.append(stmt) + debug_print("removed variables: ", removed) + + # Define an index_var to index the array. + # If the range happens to be single step ranges like range(n), or range(m, n), + # then the index_var correlates to iterator index; otherwise we'll have to + # define a new counter. + range_def = guard(_find_iter_range, func_ir, iter_var, swapped) + index_var = ir.Var(scope, mk_unique_var("index"), loc) + if range_def and range_def[0] == 0: + # iterator starts with 0, index_var can just be iter_first_var + index_var = iter_first_var + else: + # index_var = -1 # starting the index with -1 since it will incremented in loop header + stmts.append(_new_definition(func_ir, index_var, ir.Const(value=-1, loc=loc), loc)) + + # Insert statement to get the size of the loop iterator + size_var = ir.Var(scope, mk_unique_var("size"), loc) + if range_def: + start, stop, range_func_def = range_def + if start == 0: + size_val = stop + else: + size_val = ir.Expr.binop(fn=operator.sub, lhs=stop, rhs=start, loc=loc) + # we can parallelize this loop if enable_prange = True, by changing + # range function from range, to prange. 
+ if enable_prange and isinstance(range_func_def, ir.Global): + range_func_def.name = 'internal_prange' + range_func_def.value = internal_prange + + else: + # this doesn't work in objmode as it's effectively untyped + if typed: + len_func_var = ir.Var(scope, mk_unique_var("len_func"), loc) + from numba.cpython.rangeobj import length_of_iterator + stmts.append(_new_definition(func_ir, len_func_var, + ir.Global('length_of_iterator', + length_of_iterator, + loc=loc), + loc)) + size_val = ir.Expr.call(len_func_var, (iter_var,), (), loc=loc) + else: + raise GuardException + + + stmts.append(_new_definition(func_ir, size_var, size_val, loc)) + + size_tuple_var = ir.Var(scope, mk_unique_var("size_tuple"), loc) + stmts.append(_new_definition(func_ir, size_tuple_var, + ir.Expr.build_tuple(items=[size_var], loc=loc), loc)) + + # Insert array allocation + array_var = ir.Var(scope, mk_unique_var("array"), loc) + empty_func = ir.Var(scope, mk_unique_var("empty_func"), loc) + if dtype_def and dtype_mod_def: + # when dtype is present, we'll call empty with dtype + dtype_mod_var = ir.Var(scope, mk_unique_var("dtype_mod"), loc) + dtype_var = ir.Var(scope, mk_unique_var("dtype"), loc) + stmts.append(_new_definition(func_ir, dtype_mod_var, dtype_mod_def, loc)) + stmts.append(_new_definition(func_ir, dtype_var, + ir.Expr.getattr(dtype_mod_var, dtype_def.attr, loc), loc)) + stmts.append(_new_definition(func_ir, empty_func, + ir.Global('empty', np.empty, loc=loc), loc)) + array_kws = [('dtype', dtype_var)] + else: + # this doesn't work in objmode as it's effectively untyped + if typed: + # otherwise we'll call unsafe_empty_inferred + stmts.append(_new_definition(func_ir, empty_func, + ir.Global('unsafe_empty_inferred', + unsafe_empty_inferred, loc=loc), loc)) + array_kws = [] + else: + raise GuardException + + # array_var = empty_func(size_tuple_var) + stmts.append(_new_definition(func_ir, array_var, + ir.Expr.call(empty_func, (size_tuple_var,), list(array_kws), loc=loc), loc)) + + # 
Add back removed just in case they are used by something else + for var in removed: + stmts.append(_new_definition(func_ir, var, array_var, loc)) + + # Add back terminator + stmts.append(terminator) + # Modify loop_entry + loop_entry.body = stmts + + if range_def: + if range_def[0] != 0: + # when range doesn't start from 0, index_var becomes loop index + # (iter_first_var) minus an offset (range_def[0]) + terminator = loop_header.terminator + assert(isinstance(terminator, ir.Branch)) + # find the block in the loop body that header jumps to + block_id = terminator.truebr + blk = func_ir.blocks[block_id] + loc = blk.loc + blk.body.insert(0, _new_definition(func_ir, index_var, + ir.Expr.binop(fn=operator.sub, lhs=iter_first_var, + rhs=range_def[0], loc=loc), + loc)) + else: + # Insert index_var increment to the end of loop header + loc = loop_header.loc + terminator = loop_header.terminator + stmts = loop_header.body[0:-1] + next_index_var = ir.Var(scope, mk_unique_var("next_index"), loc) + one = ir.Var(scope, mk_unique_var("one"), loc) + # one = 1 + stmts.append(_new_definition(func_ir, one, + ir.Const(value=1,loc=loc), loc)) + # next_index_var = index_var + 1 + stmts.append(_new_definition(func_ir, next_index_var, + ir.Expr.binop(fn=operator.add, lhs=index_var, rhs=one, loc=loc), loc)) + # index_var = next_index_var + stmts.append(_new_definition(func_ir, index_var, next_index_var, loc)) + stmts.append(terminator) + loop_header.body = stmts + + # In append_block, change list_append into array assign + for i in range(len(append_block.body)): + if append_block.body[i] is append_stmt: + debug_print("Replace append with SetItem") + append_block.body[i] = ir.SetItem(target=array_var, index=index_var, + value=append_stmt.value.args[0], loc=append_stmt.loc) + + # replace array call, by changing "a = array(b)" to "a = b" + stmt = func_ir.blocks[exit_block].body[array_call_index] + # stmt can be either array call or SetItem, we only replace array call + if isinstance(stmt, 
def _find_unsafe_empty_inferred(func_ir, expr):
    """Tell whether `expr` is a call to `unsafe_empty_inferred`.

    Raises GuardException (through `require`) if `expr` is not a call
    expression or its callee is not defined by an `ir.Global`; otherwise
    returns whether that global's value equals `unsafe_empty_inferred`.
    """
    # Bare reference kept from the original; it has no effect beyond looking
    # the name up. NOTE(review): presumably intentional - confirm.
    unsafe_empty_inferred
    require(isinstance(expr, ir.Expr) and expr.op == 'call')
    callee = expr.func
    callee_def = get_definition(func_ir, callee)
    require(isinstance(callee_def, ir.Global))
    _make_debug_print("_find_unsafe_empty_inferred")(callee_def.value)
    return callee_def.value == unsafe_empty_inferred


def _fix_nested_array(func_ir):
    """Look for assignment like: a[..] = b, where both a and b are numpy arrays, and
    try to eliminate array b by expanding a with an extra dimension.

    `func_ir` is modified in place: the size tuple used to allocate `a` is
    extended, the definition of `b` is rewritten into `a[..]`, and the
    now-redundant SetItem statement is removed from its block.
    """
    blocks = func_ir.blocks
    cfg = compute_cfg_from_blocks(blocks)
    usedefs = compute_use_defs(blocks)
    empty_deadmap = {label: set() for label in blocks}
    livemap = compute_live_variables(cfg, blocks, usedefs.defmap, empty_deadmap)

    def find_array_def(arr):
        """Find numpy array definition such as
            arr = numba.unsafe.ndarray.empty_inferred(...).
        If it is arr = b[...], find array definition of b recursively.
        """
        arr_def = get_definition(func_ir, arr)
        _make_debug_print("find_array_def")(arr, arr_def)
        if isinstance(arr_def, ir.Expr):
            if guard(_find_unsafe_empty_inferred, func_ir, arr_def):
                return arr_def
            elif arr_def.op == 'getitem':
                return find_array_def(arr_def.value)
        raise GuardException

    def fix_dependencies(expr, varlist):
        """Double check if all variables in varlist are defined before
        expr is used. Try to move constant definition when the check fails.
        Bails out by raising GuardException if it can't be moved.

        Returns the list of variables to use in place of `varlist`. The
        block is only rewritten once every variable has been resolved, so a
        failure leaves its body untouched.
        """
        debug_print = _make_debug_print("fix_dependencies")
        for label, block in blocks.items():
            scope = block.scope
            body = block.body
            defined = set()
            for i in range(len(body)):
                inst = body[i]
                if isinstance(inst, ir.Assign):
                    defined.add(inst.target.name)
                    if inst.value is expr:
                        new_varlist = []
                        # Constant definitions to place right before `inst`.
                        # They are collected and spliced in one step: building
                        # a new body from `body` per variable would drop every
                        # definition but the last one.
                        hoisted = []
                        for var in varlist:
                            # var must be defined before this inst, or live
                            # and not later defined.
                            if (var.name in defined or
                                (var.name in livemap[label] and
                                 not (var.name in usedefs.defmap[label]))):
                                debug_print(var.name, " already defined")
                                new_varlist.append(var)
                            else:
                                debug_print(var.name, " not yet defined")
                                var_def = get_definition(func_ir, var.name)
                                if not isinstance(var_def, ir.Const):
                                    raise GuardException
                                loc = var.loc
                                new_var = ir.Var(scope, mk_unique_var("new_var"), loc)
                                new_const = ir.Const(var_def.value, loc)
                                hoisted.append(_new_definition(
                                    func_ir, new_var, new_const, loc))
                                new_varlist.append(new_var)
                        if hoisted:
                            block.body = body[:i] + hoisted + body[i:]
                        return new_varlist
        # when expr is not found in block
        raise GuardException

    def fix_array_assign(stmt):
        """For assignment like lhs[idx] = rhs, where both lhs and rhs are arrays, do the
        following:
        1. find the definition of rhs, which has to be a call to numba.unsafe.ndarray.empty_inferred
        2. find the source array creation for lhs, insert an extra dimension of size of b.
        3. replace the definition of rhs = numba.unsafe.ndarray.empty_inferred(...) with rhs = lhs[idx]

        Returns True on success; any failed requirement raises GuardException.
        """
        require(isinstance(stmt, ir.SetItem))
        require(isinstance(stmt.value, ir.Var))
        debug_print = _make_debug_print("fix_array_assign")
        debug_print("found SetItem: ", stmt)
        lhs = stmt.target
        # Find the source array creation of lhs
        lhs_def = find_array_def(lhs)
        debug_print("found lhs_def: ", lhs_def)
        rhs_def = get_definition(func_ir, stmt.value)
        debug_print("found rhs_def: ", rhs_def)
        require(isinstance(rhs_def, ir.Expr))
        if rhs_def.op == 'cast':
            rhs_def = get_definition(func_ir, rhs_def.value)
            require(isinstance(rhs_def, ir.Expr))
        require(_find_unsafe_empty_inferred(func_ir, rhs_def))
        # Find the array dimension of rhs
        dim_def = get_definition(func_ir, rhs_def.args[0])
        require(isinstance(dim_def, ir.Expr) and dim_def.op == 'build_tuple')
        debug_print("dim_def = ", dim_def)
        extra_dims = [get_definition(func_ir, x, lhs_only=True)
                      for x in dim_def.items]
        debug_print("extra_dims = ", extra_dims)
        # Expand size tuple when creating lhs_def with extra_dims
        size_tuple_def = get_definition(func_ir, lhs_def.args[0])
        require(isinstance(size_tuple_def, ir.Expr) and size_tuple_def.op == 'build_tuple')
        debug_print("size_tuple_def = ", size_tuple_def)
        extra_dims = fix_dependencies(size_tuple_def, extra_dims)
        size_tuple_def.items += extra_dims
        # In-place modify rhs_def to be getitem
        rhs_def.op = 'getitem'
        rhs_def.fn = operator.getitem
        rhs_def.value = get_definition(func_ir, lhs, lhs_only=True)
        rhs_def.index = stmt.index
        # drop the fields that only belong to the former call expression
        del rhs_def._kws['func']
        del rhs_def._kws['args']
        del rhs_def._kws['vararg']
        del rhs_def._kws['kws']
        # success
        return True

    for label in find_topo_order(func_ir.blocks):
        block = func_ir.blocks[label]
        # Walk a snapshot: removing from the list being iterated would make
        # the loop skip the statement that follows each removed one.
        for stmt in list(block.body):
            if guard(fix_array_assign, stmt):
                block.body.remove(stmt)


def _new_definition(func_ir, var, value, loc):
    """Record `value` as the sole definition of `var` in
    `func_ir._definitions` and return the matching `ir.Assign`.
    """
    func_ir._definitions[var.name] = [value]
    return ir.Assign(value=value, target=var, loc=loc)
+@rewrites.register_rewrite('after-inference') +class RewriteArrayOfConsts(rewrites.Rewrite): + '''The RewriteArrayOfConsts class is responsible for finding + 1D array creations from a constant list, and rewriting it into + direct initialization of array elements without creating the list. + ''' + def __init__(self, state, *args, **kws): + self.typingctx = state.typingctx + super(RewriteArrayOfConsts, self).__init__(*args, **kws) + + def match(self, func_ir, block, typemap, calltypes): + if len(calltypes) == 0: + return False + self.crnt_block = block + self.new_body = guard(_inline_const_arraycall, block, func_ir, + self.typingctx, typemap, calltypes) + return self.new_body is not None + + def apply(self): + self.crnt_block.body = self.new_body + return self.crnt_block + + +def _inline_const_arraycall(block, func_ir, context, typemap, calltypes): + """Look for array(list) call where list is a constant list created by build_list, + and turn them into direct array creation and initialization, if the following + conditions are met: + 1. The build_list call immediate precedes the array call; + 2. The list variable is no longer live after array call; + If any condition check fails, no modification will be made. + """ + debug_print = _make_debug_print("inline_const_arraycall") + scope = block.scope + + def inline_array(array_var, expr, stmts, list_vars, dels): + """Check to see if the given "array_var" is created from a list + of constants, and try to inline the list definition as array + initialization. + + Extra statements produced with be appended to "stmts". + """ + callname = guard(find_callname, func_ir, expr) + require(callname and callname[1] == 'numpy' and callname[0] == 'array') + require(expr.args[0].name in list_vars) + ret_type = calltypes[expr].return_type + require(isinstance(ret_type, types.ArrayCompatible) and + ret_type.ndim == 1) + loc = expr.loc + list_var = expr.args[0] + # Get the type of the array to be created. 
+ array_typ = typemap[array_var.name] + debug_print("inline array_var = ", array_var, " list_var = ", list_var) + # Get the element type of the array to be created. + dtype = array_typ.dtype + # Get the sequence of operations to provide values to the new array. + seq, _ = find_build_sequence(func_ir, list_var) + size = len(seq) + # Create a tuple to pass to empty below to specify the new array size. + size_var = ir.Var(scope, mk_unique_var("size"), loc) + size_tuple_var = ir.Var(scope, mk_unique_var("size_tuple"), loc) + size_typ = types.intp + size_tuple_typ = types.UniTuple(size_typ, 1) + typemap[size_var.name] = size_typ + typemap[size_tuple_var.name] = size_tuple_typ + stmts.append(_new_definition(func_ir, size_var, + ir.Const(size, loc=loc), loc)) + stmts.append(_new_definition(func_ir, size_tuple_var, + ir.Expr.build_tuple(items=[size_var], loc=loc), loc)) + + # The general approach is to create an empty array and then fill + # the elements in one-by-one from their specification. + + # Get the numpy type to pass to empty. + nptype = types.DType(dtype) + + # Create a variable to hold the numpy empty function. + empty_func = ir.Var(scope, mk_unique_var("empty_func"), loc) + fnty = get_np_ufunc_typ(np.empty) + sig = context.resolve_function_type(fnty, (size_typ,), {'dtype':nptype}) + + typemap[empty_func.name] = fnty + + stmts.append(_new_definition(func_ir, empty_func, + ir.Global('empty', np.empty, loc=loc), loc)) + + # We pass two arguments to empty, first the size tuple and second + # the dtype of the new array. Here, we created typ_var which is + # the dtype argument of the new array. typ_var in turn is created + # by getattr of the dtype string on the numpy module. + + # Create var for numpy module. + g_np_var = ir.Var(scope, mk_unique_var("$np_g_var"), loc) + typemap[g_np_var.name] = types.misc.Module(np) + g_np = ir.Global('np', np, loc) + stmts.append(_new_definition(func_ir, g_np_var, g_np, loc)) + + # Create var for result of numpy.. 
+ typ_var = ir.Var(scope, mk_unique_var("$np_typ_var"), loc) + typemap[typ_var.name] = nptype + dtype_str = str(dtype) + if dtype_str == 'bool': + dtype_str = 'bool_' + # Get dtype attribute of numpy module. + np_typ_getattr = ir.Expr.getattr(g_np_var, dtype_str, loc) + stmts.append(_new_definition(func_ir, typ_var, np_typ_getattr, loc)) + + # Create the call to numpy.empty passing the size tuple and dtype var. + empty_call = ir.Expr.call(empty_func, [size_var, typ_var], {}, loc=loc) + calltypes[empty_call] = typing.signature(array_typ, size_typ, nptype) + stmts.append(_new_definition(func_ir, array_var, empty_call, loc)) + + # Fill in the new empty array one-by-one. + for i in range(size): + index_var = ir.Var(scope, mk_unique_var("index"), loc) + index_typ = types.intp + typemap[index_var.name] = index_typ + stmts.append(_new_definition(func_ir, index_var, + ir.Const(i, loc), loc)) + setitem = ir.SetItem(array_var, index_var, seq[i], loc) + calltypes[setitem] = typing.signature(types.none, array_typ, + index_typ, dtype) + stmts.append(setitem) + + stmts.extend(dels) + return True + + class State(object): + """ + This class is used to hold the state in the following loop so as to make + it easy to reset the state of the variables tracking the various + statement kinds + """ + + def __init__(self): + # list_vars keep track of the variable created from the latest + # build_list instruction, as well as its synonyms. + self.list_vars = [] + # dead_vars keep track of those in list_vars that are considered dead. + self.dead_vars = [] + # list_items keep track of the elements used in build_list. + self.list_items = [] + self.stmts = [] + # dels keep track of the deletion of list_items, which will need to be + # moved after array initialization. 
+ self.dels = [] + # tracks if a modification has taken place + self.modified = False + + def reset(self): + """ + Resets the internal state of the variables used for tracking + """ + self.list_vars = [] + self.dead_vars = [] + self.list_items = [] + self.dels = [] + + def list_var_used(self, inst): + """ + Returns True if the list being analysed is used between the + build_list and the array call. + """ + return any([x.name in self.list_vars for x in inst.list_vars()]) + + state = State() + + for inst in block.body: + if isinstance(inst, ir.Assign): + if isinstance(inst.value, ir.Var): + if inst.value.name in state.list_vars: + state.list_vars.append(inst.target.name) + state.stmts.append(inst) + continue + elif isinstance(inst.value, ir.Expr): + expr = inst.value + if expr.op == 'build_list': + # new build_list encountered, reset state + state.reset() + state.list_items = [x.name for x in expr.items] + state.list_vars = [inst.target.name] + state.stmts.append(inst) + continue + elif expr.op == 'call' and expr in calltypes: + arr_var = inst.target + if guard(inline_array, inst.target, expr, + state.stmts, state.list_vars, state.dels): + state.modified = True + continue + elif isinstance(inst, ir.Del): + removed_var = inst.value + if removed_var in state.list_items: + state.dels.append(inst) + continue + elif removed_var in state.list_vars: + # one of the list_vars is considered dead. + state.dead_vars.append(removed_var) + state.list_vars.remove(removed_var) + state.stmts.append(inst) + if state.list_vars == []: + # if all list_vars are considered dead, we need to filter + # them out from existing stmts to completely remove + # build_list. + # Note that if a translation didn't take place, dead_vars + # will also be empty when we reach this point. 
+ body = [] + for inst in state.stmts: + if ((isinstance(inst, ir.Assign) and + inst.target.name in state.dead_vars) or + (isinstance(inst, ir.Del) and + inst.value in state.dead_vars)): + continue + body.append(inst) + state.stmts = body + state.dead_vars = [] + state.modified = True + continue + state.stmts.append(inst) + + # If the list is used in any capacity between build_list and array + # call, then we must call off the translation for this list because + # it could be mutated and list_items would no longer be applicable. + if state.list_var_used(inst): + state.reset() + + return state.stmts if state.modified else None diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/interpreter.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/interpreter.py new file mode 100644 index 0000000000000000000000000000000000000000..71ea49831b3ce27893d74208efb131e486c408e0 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/interpreter.py @@ -0,0 +1,2876 @@ +import builtins +import collections +import dis +import operator +import logging +import textwrap + +from numba.core import errors, dataflow, controlflow, ir, config +from numba.core.errors import NotDefinedError, UnsupportedError, error_extras +from numba.core.ir_utils import get_definition, guard +from numba.core.utils import (PYVERSION, BINOPS_TO_OPERATORS, + INPLACE_BINOPS_TO_OPERATORS,) +from numba.core.byteflow import Flow, AdaptDFA, AdaptCFA +from numba.core.unsafe import eh +from numba.cpython.unsafe.tuple import unpack_single_tuple + + +class _UNKNOWN_VALUE(object): + """Represents an unknown value, this is for ease of debugging purposes only. + """ + + def __init__(self, varname): + self._varname = varname + + def __repr__(self): + return "_UNKNOWN_VALUE({})".format(self._varname) + + +_logger = logging.getLogger(__name__) + + +class Assigner(object): + """ + This object keeps track of potential assignment simplifications + inside a code block. 
+ For example `$O.1 = x` followed by `y = $0.1` can be simplified + into `y = x`, but it's not possible anymore if we have `x = z` + in-between those two instructions. + + NOTE: this is not only an optimization, but is actually necessary + due to certain limitations of Numba - such as only accepting the + returning of an array passed as function argument. + """ + + def __init__(self): + # { destination variable name -> source Var object } + self.dest_to_src = {} + # Basically a reverse mapping of dest_to_src: + # { source variable name -> all destination names in dest_to_src } + self.src_invalidate = collections.defaultdict(list) + self.unused_dests = set() + + def assign(self, srcvar, destvar): + """ + Assign *srcvar* to *destvar*. Return either *srcvar* or a possible + simplified assignment source (earlier assigned to *srcvar*). + """ + srcname = srcvar.name + destname = destvar.name + if destname in self.src_invalidate: + # destvar will change, invalidate all previously known + # simplifications + for d in self.src_invalidate.pop(destname): + self.dest_to_src.pop(d) + if srcname in self.dest_to_src: + srcvar = self.dest_to_src[srcname] + if destvar.is_temp: + self.dest_to_src[destname] = srcvar + self.src_invalidate[srcname].append(destname) + self.unused_dests.add(destname) + return srcvar + + def get_assignment_source(self, destname): + """ + Get a possible assignment source (a ir.Var instance) to replace + *destname*, otherwise None. + """ + if destname in self.dest_to_src: + return self.dest_to_src[destname] + self.unused_dests.discard(destname) + return None + + +def _remove_assignment_definition(old_body, idx, func_ir, already_deleted_defs): + """ + Deletes the definition defined for old_body at index idx + from func_ir. We assume this stmt will be deleted from + new_body. + + In some optimizations we may update the same variable multiple times. 
+ In this situation, we only need to delete a particular definition once, + this is tracked in already_deleted_def, which is a map from + assignment name to the set of values that have already been + deleted. + """ + lhs = old_body[idx].target.name + rhs = old_body[idx].value + if rhs in func_ir._definitions[lhs]: + func_ir._definitions[lhs].remove(rhs) + already_deleted_defs[lhs].add(rhs) + elif rhs not in already_deleted_defs[lhs]: + raise UnsupportedError( + "Inconsistency found in the definitions while executing" + " a peephole optimization. This suggests an internal" + " error or inconsistency elsewhere in the compiler." + ) + + +def _call_function_ex_replace_kws_small( + old_body, + keyword_expr, + new_body, + buildmap_idx, + func_ir, + already_deleted_defs +): + """ + Extracts the kws args passed as varkwarg + for CALL_FUNCTION_EX. This pass is taken when + n_kws <= 15 and the bytecode looks like: + + # Start for each argument + LOAD_FAST # Load each argument. + # End for each argument + ... + BUILD_CONST_KEY_MAP # Build a map + + In the generated IR, the varkwarg refers + to a single build_map that contains all of the + kws. In addition to returning the kws, this + function updates new_body to remove all usage + of the map. + """ + kws = keyword_expr.items.copy() + # kws are required to have constant keys. + # We update these with the value_indexes + value_indexes = keyword_expr.value_indexes + for key, index in value_indexes.items(): + kws[index] = (key, kws[index][1]) + # Remove the build_map by setting the list + # index to None. Nones will be removed later. + new_body[buildmap_idx] = None + # Remove the definition. + _remove_assignment_definition( + old_body, buildmap_idx, func_ir, already_deleted_defs + ) + return kws + + +def _call_function_ex_replace_kws_large( + old_body, + buildmap_name, + buildmap_idx, + search_end, + new_body, + func_ir, + errmsg, + already_deleted_defs +): + """ + Extracts the kws args passed as varkwarg + for CALL_FUNCTION_EX. 
This pass is taken when + n_kws > 15 and the bytecode looks like: + + BUILD_MAP # Construct the map + # Start for each argument + LOAD_CONST # Load a constant for the name of the argument + LOAD_FAST # Load each argument. + MAP_ADD # Append the (key, value) pair to the map + # End for each argument + + In the IR generated, the initial build map is empty and a series + of setitems are applied afterwards. THE IR looks like: + + $build_map_var = build_map(items=[]) + $constvar = const(str, ...) # create the const key + # CREATE THE ARGUMENT, This may take multiple lines. + $created_arg = ... + $var = getattr( + value=$build_map_var, + attr=__setitem__, + ) + $unused_var = call $var($constvar, $created_arg) + + We iterate through the IR, deleting all usages of the buildmap + from the new_body, and adds the kws to a new kws list. + """ + # Remove the build_map from the body. + new_body[buildmap_idx] = None + # Remove the definition. + _remove_assignment_definition( + old_body, buildmap_idx, func_ir, already_deleted_defs + ) + kws = [] + search_start = buildmap_idx + 1 + while search_start <= search_end: + # The first value must be a constant. + const_stmt = old_body[search_start] + if not ( + isinstance(const_stmt, ir.Assign) + and isinstance(const_stmt.value, ir.Const) + ): + # We cannot handle this format so raise the + # original error message. 
+ raise UnsupportedError(errmsg) + key_var_name = const_stmt.target.name + key_val = const_stmt.value.value + search_start += 1 + # Now we need to search for a getattr with setitem + found_getattr = False + while ( + search_start <= search_end + and not found_getattr + ): + getattr_stmt = old_body[search_start] + if ( + isinstance(getattr_stmt, ir.Assign) + and isinstance(getattr_stmt.value, ir.Expr) + and getattr_stmt.value.op == "getattr" + and ( + getattr_stmt.value.value.name + == buildmap_name + ) + and getattr_stmt.value.attr == "__setitem__" + ): + found_getattr = True + else: + # If the argument is "created" in JIT, then there + # will be intermediate operations in between setitems. + # For example we have arg5=pow(arg5, 2), + # then the IR would look like: + # + # # Creation of the constant key. + # $const44.26 = const(str, arg5) + # + # # Argument creation. This is the section we are skipping + # $46load_global.27 = global(pow: ) + # $const50.29 = const(int, 2) + # $call.30 = call $46load_global.27(arg5, $const50.29) + # + # # Setitem with arg5 + # $54map_add.31 = getattr(value=$map.2, attr=__setitem__) + # $54map_add.32 = call $54map_add.31($const44.26, $call.30) + search_start += 1 + if ( + not found_getattr + or search_start == search_end + ): + # We cannot handle this format so raise the + # original error message. + raise UnsupportedError(errmsg) + setitem_stmt = old_body[search_start + 1] + if not ( + isinstance(setitem_stmt, ir.Assign) + and isinstance(setitem_stmt.value, ir.Expr) + and setitem_stmt.value.op == "call" + and ( + setitem_stmt.value.func.name + == getattr_stmt.target.name + ) + and len(setitem_stmt.value.args) == 2 + and ( + setitem_stmt.value.args[0].name + == key_var_name + ) + ): + # A call statement should always immediately follow the + # getattr. If for some reason this doesn't match the code + # format, we raise the original error message. This check + # is meant as a precaution. 
+ raise UnsupportedError(errmsg) + arg_var = setitem_stmt.value.args[1] + # Append the (key, value) pair. + kws.append((key_val, arg_var)) + # Remove the __setitem__ getattr and call + new_body[search_start] = None + new_body[search_start + 1] = None + # Remove the definitions. + _remove_assignment_definition( + old_body, search_start, func_ir, already_deleted_defs + ) + _remove_assignment_definition( + old_body, search_start + 1, func_ir, already_deleted_defs + ) + search_start += 2 + return kws + + +def _call_function_ex_replace_args_small( + old_body, + tuple_expr, + new_body, + buildtuple_idx, + func_ir, + already_deleted_defs +): + """ + Extracts the args passed as vararg + for CALL_FUNCTION_EX. This pass is taken when + n_args <= 30 and the bytecode looks like: + + # Start for each argument + LOAD_FAST # Load each argument. + # End for each argument + ... + BUILD_TUPLE # Create a tuple of the arguments + + In the IR generated, the vararg refer + to a single build_tuple that contains all of the + args. In addition to returning the args, this + function updates new_body to remove all usage + of the tuple. + """ + # Delete the build tuple + new_body[buildtuple_idx] = None + # Remove the definition. + _remove_assignment_definition( + old_body, buildtuple_idx, func_ir, already_deleted_defs + ) + # Return the args. + return tuple_expr.items + + +def _call_function_ex_replace_args_large( + old_body, + vararg_stmt, + new_body, + search_end, + func_ir, + errmsg, + already_deleted_defs +): + """ + Extracts the args passed as vararg + for CALL_FUNCTION_EX. This pass is taken when + n_args > 30 and the bytecode looks like: + + BUILD_TUPLE # Create a list to append to + # Start for each argument + LOAD_FAST # Load each argument. + LIST_APPEND # Add the argument to the list + # End for each argument + ... + LIST_TO_TUPLE # Convert the args to a tuple. 
+ + In the IR generated, the tuple is created by concatenating + together several 1 element tuples to an initial empty tuple. + We traverse backwards in the IR, collecting args, until we + find the original empty tuple. For example, the IR might + look like: + + $orig_tuple = build_tuple(items=[]) + $first_var = build_tuple(items=[Var(arg0, test.py:6)]) + $next_tuple = $orig_tuple + $first_var + ... + $final_var = build_tuple(items=[Var(argn, test.py:6)]) + $final_tuple = $prev_tuple + $final_var + $varargs_var = $final_tuple + """ + # We traverse to the front of the block to look for the original + # tuple. + search_start = 0 + total_args = [] + if ( + isinstance(vararg_stmt, ir.Assign) + and isinstance(vararg_stmt.value, ir.Var) + ): + target_name = vararg_stmt.value.name + # If there is an initial assignment, delete it + new_body[search_end] = None + # Remove the definition. + _remove_assignment_definition( + old_body, search_end, func_ir, already_deleted_defs + ) + search_end -= 1 + else: + # There must always be an initial assignment + # https://github.com/numba/numba/blob/59fa2e335be68148b3bd72a29de3ff011430038d/numba/core/interpreter.py#L259-L260 + # If this changes we may need to support this branch. + raise AssertionError("unreachable") + # Traverse backwards to find all concatenations + # until eventually reaching the original empty tuple. + while search_end >= search_start: + concat_stmt = old_body[search_end] + if ( + isinstance(concat_stmt, ir.Assign) + and concat_stmt.target.name == target_name + and isinstance(concat_stmt.value, ir.Expr) + and concat_stmt.value.op == "build_tuple" + and not concat_stmt.value.items + ): + new_body[search_end] = None + # Remove the definition. + _remove_assignment_definition( + old_body, search_end, func_ir, already_deleted_defs + ) + # If we have reached the build_tuple we exit. + break + else: + # We expect to find another arg to append. 
+ # The first stmt must be a binop "add" + if (search_end == search_start) or not ( + isinstance(concat_stmt, ir.Assign) + and ( + concat_stmt.target.name + == target_name + ) + and isinstance( + concat_stmt.value, ir.Expr + ) + and concat_stmt.value.op == "binop" + and concat_stmt.value.fn == operator.add + ): + # We cannot handle this format. + raise UnsupportedError(errmsg) + lhs_name = concat_stmt.value.lhs.name + rhs_name = concat_stmt.value.rhs.name + # The previous statement should be a + # build_tuple containing the arg. + arg_tuple_stmt = old_body[search_end - 1] + if not ( + isinstance(arg_tuple_stmt, ir.Assign) + and isinstance( + arg_tuple_stmt.value, ir.Expr + ) + and ( + arg_tuple_stmt.value.op + == "build_tuple" + ) + and len(arg_tuple_stmt.value.items) == 1 + ): + # We cannot handle this format. + raise UnsupportedError(errmsg) + if arg_tuple_stmt.target.name == lhs_name: + # The tuple should always be generated on the RHS. + raise AssertionError("unreachable") + elif arg_tuple_stmt.target.name == rhs_name: + target_name = lhs_name + else: + # We cannot handle this format. + raise UnsupportedError(errmsg) + total_args.append( + arg_tuple_stmt.value.items[0] + ) + new_body[search_end] = None + new_body[search_end - 1] = None + # Remove the definitions. + _remove_assignment_definition( + old_body, search_end, func_ir, already_deleted_defs + ) + _remove_assignment_definition( + old_body, search_end - 1, func_ir, already_deleted_defs + ) + search_end -= 2 + # Avoid any space between appends + keep_looking = True + while search_end >= search_start and keep_looking: + next_stmt = old_body[search_end] + if ( + isinstance(next_stmt, ir.Assign) + and ( + next_stmt.target.name + == target_name + ) + ): + keep_looking = False + else: + # If the argument is "created" in JIT, then there + # will be intermediate operations in between appends. 
+ # For example if the next arg after arg4 is pow(arg5, 2), + # then the IR would look like: + # + # # Appending arg4 + # $arg4_tup = build_tuple(items=[arg4]) + # $append_var.5 = $append_var.4 + $arg4_tup + # + # # Creation of arg5. + # # This is the section that we are skipping. + # $32load_global.20 = global(pow: ) + # $const36.22 = const(int, 2) + # $call.23 = call $32load_global.20(arg5, $const36.22) + # + # # Appending arg5 + # $arg5_tup = build_tuple(items=[$call.23]) + # $append_var.6 = $append_var.5 + $arg5_tup + search_end -= 1 + if search_end == search_start: + # If we reached the start we never found the build_tuple. + # We cannot handle this format so raise the + # original error message. + raise UnsupportedError(errmsg) + # Reverse the arguments so we get the correct order. + return total_args[::-1] + + +def peep_hole_call_function_ex_to_call_function_kw(func_ir): + """ + This peephole rewrites a bytecode sequence unique to Python 3.10 + where CALL_FUNCTION_EX is used instead of CALL_FUNCTION_KW because of + stack limitations set by CPython. This limitation is imposed whenever + a function call has too many arguments or keyword arguments. + + https://github.com/python/cpython/blob/a58ebcc701dd6c43630df941481475ff0f615a81/Python/compile.c#L55 + https://github.com/python/cpython/blob/a58ebcc701dd6c43630df941481475ff0f615a81/Python/compile.c#L4442 + + In particular, this change is imposed whenever (n_args / 2) + n_kws > 15. + + Different bytecode is generated for args depending on if n_args > 30 + or n_args <= 30 and similarly if n_kws > 15 or n_kws <= 15. + + This function unwraps the *args and **kwargs in the function call + and places these values directly into the args and kwargs of the call. + """ + # All changes are local to the a single block + # so it can be traversed in any order. + errmsg = textwrap.dedent(""" + CALL_FUNCTION_EX with **kwargs not supported. 
+ If you are not using **kwargs this may indicate that + you have a large number of kwargs and are using inlined control + flow. You can resolve this issue by moving the control flow out of + the function call. For example, if you have + + f(a=1 if flag else 0, ...) + + Replace that with: + + a_val = 1 if flag else 0 + f(a=a_val, ...)""") + + # Track which definitions have already been deleted + already_deleted_defs = collections.defaultdict(set) + for blk in func_ir.blocks.values(): + blk_changed = False + new_body = [] + for i, stmt in enumerate(blk.body): + if ( + isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Expr) + and stmt.value.op == "call" + and stmt.value.varkwarg is not None + ): + blk_changed = True + call = stmt.value + args = call.args + kws = call.kws + # We need to check the call expression contents if + # it contains either vararg or varkwarg. If it contains + # varkwarg we need to update the IR. If it just contains + # vararg we don't need to update the IR, but we need to + # check if peep_hole_list_to_tuple failed to replace the + # vararg list with a tuple. If so, we output an error + # message with suggested code changes. + vararg = call.vararg + varkwarg = call.varkwarg + start_search = i - 1 + # varkwarg should be defined second so we start there. + varkwarg_loc = start_search + keyword_def = None + found = False + while varkwarg_loc >= 0 and not found: + keyword_def = blk.body[varkwarg_loc] + if ( + isinstance(keyword_def, ir.Assign) + and keyword_def.target.name == varkwarg.name + ): + found = True + else: + varkwarg_loc -= 1 + if ( + kws + or not found + or not ( + isinstance(keyword_def.value, ir.Expr) + and keyword_def.value.op == "build_map" + ) + ): + # If we couldn't find where the kwargs are created + # then it should be a normal **kwargs call + # so we produce an unsupported message. + raise UnsupportedError(errmsg) + # Determine the kws + if keyword_def.value.items: + # n_kws <= 15 case. 
+ # Here the IR looks like a series of + # constants, then the arguments and finally + # a build_map that contains all of the pairs. + # For Example: + # + # $const_n = const("arg_name") + # $arg_n = ... + # $kwargs_var = build_map(items=[ + # ($const_0, $arg_0), + # ..., + # ($const_n, $arg_n),]) + kws = _call_function_ex_replace_kws_small( + blk.body, + keyword_def.value, + new_body, + varkwarg_loc, + func_ir, + already_deleted_defs, + ) + else: + # n_kws > 15 case. + # Here the IR is an initial empty build_map + # followed by a series of setitems with a constant + # key and then the argument. + # For example: + # + # $kwargs_var = build_map(items=[]) + # $const_0 = const("arg_name") + # $arg_0 = ... + # $my_attr = getattr(const_0, attr=__setitem__) + # $unused_var = call $my_attr($const_0, $arg_0) + # ... + kws = _call_function_ex_replace_kws_large( + blk.body, + varkwarg.name, + varkwarg_loc, + i - 1, + new_body, + func_ir, + errmsg, + already_deleted_defs, + ) + start_search = varkwarg_loc + # Vararg isn't required to be provided. + if vararg is not None: + if args: + # If we have vararg then args is expected to + # be an empty list. + raise UnsupportedError(errmsg) + vararg_loc = start_search + args_def = None + found = False + while vararg_loc >= 0 and not found: + args_def = blk.body[vararg_loc] + if ( + isinstance(args_def, ir.Assign) + and args_def.target.name == vararg.name + ): + found = True + else: + vararg_loc -= 1 + if not found: + # If we couldn't find where the args are created + # then we can't handle this format. + raise UnsupportedError(errmsg) + if ( + isinstance(args_def.value, ir.Expr) + and args_def.value.op == "build_tuple" + ): + # n_args <= 30 case. + # Here the IR is a simple build_tuple containing + # all of the args. + # For example: + # + # $arg_n = ... 
+ # $varargs = build_tuple( + # items=[$arg_0, ..., $arg_n] + # ) + args = _call_function_ex_replace_args_small( + blk.body, + args_def.value, + new_body, + vararg_loc, + func_ir, + already_deleted_defs, + ) + elif ( + isinstance(args_def.value, ir.Expr) + and args_def.value.op == "list_to_tuple" + ): + # If there is a call with vararg we need to check + # if the list -> tuple conversion failed and if so + # throw an error. + raise UnsupportedError(errmsg) + else: + # Here the IR is an initial empty build_tuple. + # Then for each arg, a new tuple with a single + # element is created and one by one these are + # added to a growing tuple. + # For example: + # + # $combo_tup_0 = build_tuple(items=[]) + # $arg0 = ... + # $arg0_tup = build_tuple(items=[$arg0]) + # $combo_tup_1 = $combo_tup_0 + $arg0_tup + # $arg1 = ... + # $arg1_tup = build_tuple(items=[$arg1]) + # $combo_tup_2 = $combo_tup_1 + $arg1_tup + # ... + # $combo_tup_n = $combo_tup_{n-1} + $argn_tup + # + # In addition, the IR contains a final + # assignment for the varargs that looks like: + # + # $varargs_var = $combo_tup_n + # + # Here args_def is expected to be a simple assignment. + args = _call_function_ex_replace_args_large( + blk.body, + args_def, + new_body, + vararg_loc, + func_ir, + errmsg, + already_deleted_defs, + ) + # Create a new call updating the args and kws + new_call = ir.Expr.call( + call.func, args, kws, call.loc, target=call.target + ) + # Drop the existing definition for this stmt. 
+ _remove_assignment_definition( + blk.body, i, func_ir, already_deleted_defs + ) + # Update the statement + stmt = ir.Assign(new_call, stmt.target, stmt.loc) + # Update the definition + func_ir._definitions[stmt.target.name].append(new_call) + elif ( + isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Expr) + and stmt.value.op == "call" + and stmt.value.vararg is not None + ): + # If there is a call with vararg we need to check + # if the list -> tuple conversion failed and if so + # throw an error. + call = stmt.value + vararg_name = call.vararg.name + if ( + vararg_name in func_ir._definitions + and len(func_ir._definitions[vararg_name]) == 1 + ): + # If this value is still a list to tuple raise the + # exception. + expr = func_ir._definitions[vararg_name][0] + if isinstance(expr, ir.Expr) and expr.op == "list_to_tuple": + raise UnsupportedError(errmsg) + + new_body.append(stmt) + # Replace the block body if we changed the IR + if blk_changed: + blk.body.clear() + blk.body.extend([x for x in new_body if x is not None]) + return func_ir + + +def peep_hole_list_to_tuple(func_ir): + """ + This peephole rewrites a bytecode sequence new to Python 3.9 that looks + like e.g.: + + def foo(a): + return (*a,) + + 41 0 BUILD_LIST 0 + 2 LOAD_FAST 0 (a) + 4 LIST_EXTEND 1 + 6 LIST_TO_TUPLE + 8 RETURN_VAL + + essentially, the unpacking of tuples is written as a list which is appended + to/extended and then "magicked" into a tuple by the new LIST_TO_TUPLE + opcode. + + This peephole repeatedly analyses the bytecode in a block looking for a + window between a `LIST_TO_TUPLE` and `BUILD_LIST` and... + + 1. Turns the BUILD_LIST into a BUILD_TUPLE + 2. Sets an accumulator's initial value as the target of the BUILD_TUPLE + 3. Searches for 'extend' on the original list and turns these into binary + additions on the accumulator. + 4. 
Searches for 'append' on the original list and turns these into a + `BUILD_TUPLE` which is then appended via binary addition to the + accumulator. + 5. Assigns the accumulator to the variable that exits the peephole and the + rest of the block/code refers to as the result of the unpack operation. + 6. Patches up + """ + _DEBUG = False + + # For all blocks + for offset, blk in func_ir.blocks.items(): + # keep doing the peephole rewrite until nothing is left that matches + while True: + # first try and find a matching region + # i.e. BUILD_LIST......LIST_TO_TUPLE + def find_postive_region(): + found = False + for idx in reversed(range(len(blk.body))): + stmt = blk.body[idx] + if isinstance(stmt, ir.Assign): + value = stmt.value + if (isinstance(value, ir.Expr) and + value.op == 'list_to_tuple'): + target_list = value.info[0] + found = True + bt = (idx, stmt) + if found: + if isinstance(stmt, ir.Assign): + if stmt.target.name == target_list: + region = (bt, (idx, stmt)) + return region + + region = find_postive_region() + # if there's a peep hole region then do something with it + if region is not None: + peep_hole = blk.body[region[1][0] : region[0][0]] + if _DEBUG: + print("\nWINDOW:") + for x in peep_hole: + print(x) + print("") + + appends = [] + extends = [] + init = region[1][1] + const_list = init.target.name + # Walk through the peep_hole and find things that are being + # "extend"ed and "append"ed to the BUILD_LIST + for x in peep_hole: + if isinstance(x, ir.Assign): + if isinstance(x.value, ir.Expr): + expr = x.value + if (expr.op == 'getattr' and + expr.value.name == const_list): + # it's not strictly necessary to split out + # extends and appends, but it helps with + # debugging to do so! + if expr.attr == 'extend': + extends.append(x.target.name) + elif expr.attr == 'append': + appends.append(x.target.name) + else: + assert 0 + # go back through the peep hole build new IR based on it. 
+ new_hole = [] + + def append_and_fix(x): + """ Adds to the new_hole and fixes up definitions""" + new_hole.append(x) + if x.target.name in func_ir._definitions: + # if there's already a definition, drop it, should only + # be 1 as the way cpython emits the sequence for + # `list_to_tuple` should ensure this. + assert len(func_ir._definitions[x.target.name]) == 1 + func_ir._definitions[x.target.name].clear() + func_ir._definitions[x.target.name].append(x.value) + + the_build_list = init.target + + # Do the transform on the peep hole + if _DEBUG: + print("\nBLOCK:") + blk.dump() + + # This section basically accumulates list appends and extends + # as binop(+) on tuples, it drops all the getattr() for extend + # and append as they are now dead and replaced with binop(+). + # It also switches out the build_list for a build_tuple and then + # ensures everything is wired up and defined ok. + t2l_agn = region[0][1] + acc = the_build_list + for x in peep_hole: + if isinstance(x, ir.Assign): + if isinstance(x.value, ir.Expr): + expr = x.value + if expr.op == 'getattr': + if (x.target.name in extends or + x.target.name in appends): + # drop definition, it's being wholesale + # replaced. 
+ func_ir._definitions.pop(x.target.name) + continue + else: + # a getattr on something we're not + # interested in + new_hole.append(x) + elif expr.op == 'call': + fname = expr.func.name + if fname in extends or fname in appends: + arg = expr.args[0] + if isinstance(arg, ir.Var): + tmp_name = "%s_var_%s" % (fname, + arg.name) + if fname in appends: + bt = ir.Expr.build_tuple([arg,], + expr.loc) + else: + # Extend as tuple + gv_tuple = ir.Global( + name="tuple", value=tuple, + loc=expr.loc, + ) + tuple_var = arg.scope.redefine( + "$_list_extend_gv_tuple", + loc=expr.loc, + ) + new_hole.append( + ir.Assign( + target=tuple_var, + value=gv_tuple, + loc=expr.loc, + ), + ) + bt = ir.Expr.call( + tuple_var, (arg,), (), + loc=expr.loc, + ) + var = ir.Var(arg.scope, tmp_name, + expr.loc) + asgn = ir.Assign(bt, var, expr.loc) + append_and_fix(asgn) + arg = var + + # this needs to be a binary add + new = ir.Expr.binop(fn=operator.add, + lhs=acc, + rhs=arg, + loc=x.loc) + asgn = ir.Assign(new, x.target, expr.loc) + append_and_fix(asgn) + acc = asgn.target + else: + # there could be a call in the unpack, like + # *(a, x.append(y)) + new_hole.append(x) + elif (expr.op == 'build_list' and + x.target.name == const_list): + new = ir.Expr.build_tuple(expr.items, expr.loc) + asgn = ir.Assign(new, x.target, expr.loc) + # Not a temporary any more + append_and_fix(asgn) + else: + new_hole.append(x) + else: + new_hole.append(x) + + else: + # stick everything else in as-is + new_hole.append(x) + # Finally write the result back into the original build list as + # everything refers to it. 
+ append_and_fix(ir.Assign(acc, t2l_agn.target, + the_build_list.loc)) + if _DEBUG: + print("\nNEW HOLE:") + for x in new_hole: + print(x) + + # and then update the block body with the modified region + cpy = blk.body[:] + head = cpy[:region[1][0]] + tail = blk.body[region[0][0] + 1:] + tmp = head + new_hole + tail + blk.body.clear() + blk.body.extend(tmp) + + if _DEBUG: + print("\nDUMP post hole:") + blk.dump() + + else: + # else escape + break + + return func_ir + + +def peep_hole_delete_with_exit(func_ir): + """ + This rewrite removes variables used to store the `__exit__` function + loaded by SETUP_WITH. + """ + dead_vars = set() + + for blk in func_ir.blocks.values(): + for stmt in blk.body: + # Any statement that uses a variable with the '$setup_with_exitfn' + # prefix is considered dead. + used = set(stmt.list_vars()) + for v in used: + if v.name.startswith('$setup_with_exitfn'): + dead_vars.add(v) + # Any assignment that uses any of the dead variable is considered + # dead. + if used & dead_vars: + if isinstance(stmt, ir.Assign): + dead_vars.add(stmt.target) + + new_body = [] + for stmt in blk.body: + # Skip any statements that uses anyone of the dead variable. + if not (set(stmt.list_vars()) & dead_vars): + new_body.append(stmt) + blk.body.clear() + blk.body.extend(new_body) + + return func_ir + + +def peep_hole_fuse_dict_add_updates(func_ir): + """ + This rewrite removes d1._update_from_bytecode(d2) + calls that are between two dictionaries, d1 and d2, + in the same basic block. This pattern can appear as a + result of Python 3.10 bytecode emission changes, which + prevent large constant literal dictionaries + (> 15 elements) from being constant. If both dictionaries + are constant dictionaries defined in the same block and + neither is used between the update call, then we replace d1 + with a new definition that combines the two dictionaries. 
At + the bytecode translation stage we convert DICT_UPDATE into + _update_from_bytecode, so we know that _update_from_bytecode + always comes from the bytecode change and not user code. + + Python 3.10 may also rewrite the individual dictionaries + as an empty build_map + many map_add. Here we again look + for an _update_from_bytecode, and if so we replace these + with a single constant dictionary. + + When running this algorithm we can always safely remove d2. + + This is the relevant section of the CPython 3.10 that causes + this bytecode change: + https://github.com/python/cpython/blob/3.10/Python/compile.c#L4048 + """ + + # This algorithm fuses build_map expressions into the largest + # possible build map before use. For example, if we have an + # IR that looks like this: + # + # $d1 = build_map([]) + # $key = const("a") + # $value = const(2) + # $setitem_func = getattr($d1, "__setitem__") + # $unused1 = call (setitem_func, ($key, $value)) + # $key2 = const("b") + # $value2 = const(3) + # $d2 = build_map([($key2, $value2)]) + # $update_func = getattr($d1, "_update_from_bytecode") + # $unused2 = call ($update_func, ($d2,)) + # $othervar = None + # $retvar = cast($othervar) + # return $retvar + # + # Then the IR is rewritten such that any __setitem__ and + # _update_from_bytecode operations are fused into the original buildmap. + # The new buildmap is then added to the + # last location where it had previously had encountered a __setitem__, + # _update_from_bytecode, or build_map before any other uses. + # The new IR would look like: + # + # $key = const("a") + # $value = const(2) + # $key2 = const("b") + # $value2 = const(3) + # $d1 = build_map([($key, $value), ($key2, $value2)]) + # $othervar = None + # $retvar = cast($othervar) + # return $retvar + # + # Note that we don't push $d1 to the bottom of the block. This is because + # some values may be found below this block (e.g pop_block) that are pattern + # matched in other locations, such as objmode handling. 
It should be safe to + # move a map to the last location at which there was _update_from_bytecode. + + errmsg = textwrap.dedent(""" + A DICT_UPDATE op-code was encountered that could not be replaced. + If you have created a large constant dictionary, this may + be an an indication that you are using inlined control + flow. You can resolve this issue by moving the control flow out of + the dicitonary constructor. For example, if you have + + d = {a: 1 if flag else 0, ...) + + Replace that with: + + a_val = 1 if flag else 0 + d = {a: a_val, ...)""") + + already_deleted_defs = collections.defaultdict(set) + for blk in func_ir.blocks.values(): + new_body = [] + # literal map var name -> block idx of the original build_map + lit_map_def_idx = {} + # literal map var name -> list(map_uses) + # This is the index of every build_map or __setitem__ + # in the IR that will need to be removed if the map + # is updated. + lit_map_use_idx = collections.defaultdict(list) + # literal map var name -> list of key/value items for build map + map_updates = {} + blk_changed = False + + for i, stmt in enumerate(blk.body): + # What instruction should we append + new_inst = stmt + # Name that should be skipped when tracking used + # vars in statement. This is always the lhs with + # a build_map. + stmt_build_map_out = None + if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Expr): + if stmt.value.op == "build_map": + # Skip the output build_map when looking for used vars. + stmt_build_map_out = stmt.target.name + # If we encounter a build map add it to the + # tracked maps. + lit_map_def_idx[stmt.target.name] = i + lit_map_use_idx[stmt.target.name].append(i) + map_updates[stmt.target.name] = stmt.value.items.copy() + elif stmt.value.op == "call" and i > 0: + # If we encounter a call we may need to replace + # the body + func_name = stmt.value.func.name + # If we have an update or a setitem + # it will be the previous expression. 
+ getattr_stmt = blk.body[i - 1] + args = stmt.value.args + if ( + isinstance(getattr_stmt, ir.Assign) + and getattr_stmt.target.name == func_name + and isinstance(getattr_stmt.value, ir.Expr) + and getattr_stmt.value.op == "getattr" + and getattr_stmt.value.attr in ( + "__setitem__", "_update_from_bytecode" + ) + ): + update_map_name = getattr_stmt.value.value.name + attr = getattr_stmt.value.attr + if (attr == "__setitem__" + and update_map_name in lit_map_use_idx): + # If we have a setitem, update the lists + map_updates[update_map_name].append(args) + # Update the list of instructions that would + # need to be removed to include the setitem + # and the the getattr + lit_map_use_idx[update_map_name].extend([i - 1, i]) + elif attr == "_update_from_bytecode": + d2_map_name = args[0].name + if (update_map_name in lit_map_use_idx + and d2_map_name in lit_map_use_idx): + # If we have an update and the arg is also + # a literal dictionary, fuse the lists. + map_updates[update_map_name].extend( + map_updates[d2_map_name] + ) + # Delete the old IR for d1 and d2 + lit_map_use_idx[update_map_name].extend( + lit_map_use_idx[d2_map_name] + ) + lit_map_use_idx[update_map_name].append(i - 1) + for linenum in lit_map_use_idx[update_map_name]: + # Drop the existing definition. + _remove_assignment_definition( + blk.body, + linenum, + func_ir, + already_deleted_defs, + ) + # Delete it from the new block + new_body[linenum] = None + # Delete the maps from dicts + del lit_map_def_idx[d2_map_name] + del lit_map_use_idx[d2_map_name] + del map_updates[d2_map_name] + # Add d1 as the new instruction, removing the + # old definition. + _remove_assignment_definition( + blk.body, i, func_ir, already_deleted_defs + ) + new_inst = _build_new_build_map( + func_ir, + update_map_name, + blk.body, + lit_map_def_idx[update_map_name], + map_updates[update_map_name], + ) + # Update d1 in lit_map_use_idx to just the new + # definition and clear the previous list. 
+ lit_map_use_idx[update_map_name].clear() + lit_map_use_idx[update_map_name].append(i) + # Mark that this block has been modified + blk_changed = True + else: + # If we cannot remove _update_from_bytecode + # Then raise an error for the user. + raise UnsupportedError(errmsg) + + # Check if we need to drop any maps from being tracked. + # Skip the setitem/_update_from_bytecode getattr that + # will be removed when handling their call in the next + # iteration. + if not ( + isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Expr) + and stmt.value.op == "getattr" + and stmt.value.value.name in lit_map_use_idx + and stmt.value.attr in ("__setitem__", "_update_from_bytecode") + ): + for var in stmt.list_vars(): + # If a map is used it cannot be fused later in + # the block. As a result we delete it from + # the dicitonaries + if ( + var.name in lit_map_use_idx + and var.name != stmt_build_map_out + ): + del lit_map_def_idx[var.name] + del lit_map_use_idx[var.name] + del map_updates[var.name] + + # Append the instruction to the new block + new_body.append(new_inst) + + if blk_changed: + # If the block is changed replace the block body. + blk.body.clear() + blk.body.extend([x for x in new_body if x is not None]) + + return func_ir + + +def _build_new_build_map(func_ir, name, old_body, old_lineno, new_items): + """ + Create a new build_map with a new set of key/value items + but all the other info the same. 
+ """ + old_assign = old_body[old_lineno] + old_target = old_assign.target + old_bm = old_assign.value + # Build the literals + literal_keys = [] + # Track the constant key/values to set the literal_value + # field of build_map properly + values = [] + for pair in new_items: + k, v = pair + key_def = guard(get_definition, func_ir, k) + if isinstance(key_def, (ir.Const, ir.Global, ir.FreeVar)): + literal_keys.append(key_def.value) + value_def = guard(get_definition, func_ir, v) + if isinstance(value_def, (ir.Const, ir.Global, ir.FreeVar)): + values.append(value_def.value) + else: + # Append unknown value if not a literal. + values.append(_UNKNOWN_VALUE(v.name)) + + value_indexes = {} + if len(literal_keys) == len(new_items): + # All keys must be literals to have any literal values. + literal_value = {x: y for x, y in zip(literal_keys, values)} + for i, k in enumerate(literal_keys): + value_indexes[k] = i + else: + literal_value = None + + # Construct a new build map. + new_bm = ir.Expr.build_map( + items=new_items, + size=len(new_items), + literal_value=literal_value, + value_indexes=value_indexes, + loc=old_bm.loc, + ) + + # The previous definition has already been removed + # when updating the IR in peep_hole_fuse_dict_add_updates + func_ir._definitions[name].append(new_bm) + + # Return a new assign. + return ir.Assign( + new_bm, ir.Var(old_target.scope, name, old_target.loc), new_bm.loc + ) + + +class Interpreter(object): + """A bytecode interpreter that builds up the IR. + """ + + def __init__(self, func_id): + self.func_id = func_id + self.arg_count = func_id.arg_count + self.arg_names = func_id.arg_names + self.loc = self.first_loc = ir.Loc.from_function_id(func_id) + self.is_generator = func_id.is_generator + + # { inst offset : ir.Block } + self.blocks = {} + # { name: [definitions] } of local variables + self.definitions = collections.defaultdict(list) + # A set to keep track of all exception variables. 
+ # To be used in _legalize_exception_vars() + self._exception_vars = set() + + def interpret(self, bytecode): + """ + Generate IR for this bytecode. + """ + self.bytecode = bytecode + + self.scopes = [] + global_scope = ir.Scope(parent=None, loc=self.loc) + self.scopes.append(global_scope) + + if PYVERSION < (3, 7): + # Control flow analysis + self.cfa = controlflow.ControlFlowAnalysis(bytecode) + self.cfa.run() + if config.DUMP_CFG: + self.cfa.dump() + + # Data flow analysis + self.dfa = dataflow.DataFlowAnalysis(self.cfa) + self.dfa.run() + else: + flow = Flow(bytecode) + flow.run() + self.dfa = AdaptDFA(flow) + self.cfa = AdaptCFA(flow) + if config.DUMP_CFG: + self.cfa.dump() + + # Temp states during interpretation + self.current_block = None + self.current_block_offset = None + self.syntax_blocks = [] + self.dfainfo = None + + self.scopes.append(ir.Scope(parent=self.current_scope, loc=self.loc)) + # Interpret loop + for inst, kws in self._iter_inst(): + self._dispatch(inst, kws) + self._legalize_exception_vars() + # Prepare FunctionIR + func_ir = ir.FunctionIR(self.blocks, self.is_generator, self.func_id, + self.first_loc, self.definitions, + self.arg_count, self.arg_names) + _logger.debug(func_ir.dump_to_string()) + + # post process the IR to rewrite opcodes/byte sequences that are too + # involved to risk handling as part of direct interpretation + peepholes = [] + if PYVERSION in [(3, 9), (3, 10)]: + peepholes.append(peep_hole_list_to_tuple) + peepholes.append(peep_hole_delete_with_exit) + if PYVERSION == (3, 10): + # peep_hole_call_function_ex_to_call_function_kw + # depends on peep_hole_list_to_tuple converting + # any large number of arguments from a list to a + # tuple. 
+ peepholes.append(peep_hole_call_function_ex_to_call_function_kw) + peepholes.append(peep_hole_fuse_dict_add_updates) + + post_processed_ir = self.post_process(peepholes, func_ir) + return post_processed_ir + + def post_process(self, peepholes, func_ir): + for peep in peepholes: + func_ir = peep(func_ir) + return func_ir + + def _legalize_exception_vars(self): + """Search for unsupported use of exception variables. + Note, they cannot be stored into user variable. + """ + # Build a set of exception variables + excvars = self._exception_vars.copy() + # Propagate the exception variables to LHS of assignment + for varname, defnvars in self.definitions.items(): + for v in defnvars: + if isinstance(v, ir.Var): + k = v.name + if k in excvars: + excvars.add(varname) + # Filter out the user variables. + uservar = list(filter(lambda x: not x.startswith('$'), excvars)) + if uservar: + # Complain about the first user-variable storing an exception + first = uservar[0] + loc = self.current_scope.get(first).loc + msg = "Exception object cannot be stored into variable ({})." 
+ raise errors.UnsupportedError(msg.format(first), loc=loc) + + def init_first_block(self): + # Define variables receiving the function arguments + for index, name in enumerate(self.arg_names): + val = ir.Arg(index=index, name=name, loc=self.loc) + self.store(val, name) + + def _iter_inst(self): + for blkct, block in enumerate(self.cfa.iterliveblocks()): + firstinst = self.bytecode[block.offset] + self.loc = self.loc.with_lineno(firstinst.lineno) + self._start_new_block(block.offset) + if blkct == 0: + # Is first block + self.init_first_block() + for offset, kws in self.dfainfo.insts: + inst = self.bytecode[offset] + self.loc = self.loc.with_lineno(inst.lineno) + yield inst, kws + self._end_current_block() + + def _start_new_block(self, offset): + oldblock = self.current_block + self.insert_block(offset) + # Ensure the last block is terminated + if oldblock is not None and not oldblock.is_terminated: + # Handle ending try block. + tryblk = self.dfainfo.active_try_block + # If there's an active try-block and the handler block is live. + if tryblk is not None and tryblk['end'] in self.cfa.graph.nodes(): + # We are in a try-block, insert a branch to except-block. + # This logic cannot be in self._end_current_block() + # because we the non-raising next block-offset. 
+ branch = ir.Branch( + cond=self.get('$exception_check'), + truebr=tryblk['end'], + falsebr=offset, + loc=self.loc, + ) + oldblock.append(branch) + # Handle normal case + else: + jmp = ir.Jump(offset, loc=self.loc) + oldblock.append(jmp) + # Get DFA block info + self.dfainfo = self.dfa.infos[self.current_block_offset] + self.assigner = Assigner() + # Check out-of-scope syntactic-block + while self.syntax_blocks: + if offset >= self.syntax_blocks[-1].exit: + self.syntax_blocks.pop() + else: + break + + def _end_current_block(self): + # Handle try block + if not self.current_block.is_terminated: + tryblk = self.dfainfo.active_try_block + if tryblk is not None: + self._insert_exception_check() + # Handle normal block cleanup + self._remove_unused_temporaries() + self._insert_outgoing_phis() + + def _inject_call(self, func, gv_name, res_name=None): + """A helper function to inject a call to *func* which is a python + function. + Parameters + ---------- + func : callable + The function object to be called. + gv_name : str + The variable name to be used to store the function object. + res_name : str; optional + The variable name to be used to store the call result. + If ``None``, a name is created automatically. + """ + gv_fn = ir.Global(gv_name, func, loc=self.loc) + self.store(value=gv_fn, name=gv_name, redefine=True) + callres = ir.Expr.call(self.get(gv_name), (), (), loc=self.loc) + res_name = res_name or '$callres_{}'.format(gv_name) + self.store(value=callres, name=res_name, redefine=True) + + def _insert_try_block_begin(self): + """Insert IR-nodes to mark the start of a `try` block. + """ + self._inject_call(eh.mark_try_block, 'mark_try_block') + + def _insert_try_block_end(self): + """Insert IR-nodes to mark the end of a `try` block. + """ + self._inject_call(eh.end_try_block, 'end_try_block') + + def _insert_exception_variables(self): + """Insert IR-nodes to initialize the exception variables. 
+ """ + tryblk = self.dfainfo.active_try_block + # Get exception variables + endblk = tryblk['end'] + edgepushed = self.dfainfo.outgoing_edgepushed.get(endblk) + # Note: the last value on the stack is the exception value + # Note: due to the current limitation, all exception variables are None + if edgepushed: + const_none = ir.Const(value=None, loc=self.loc) + # For each variable going to the handler block. + for var in edgepushed: + if var in self.definitions: + raise AssertionError( + "exception variable CANNOT be defined by other code", + ) + self.store(value=const_none, name=var) + self._exception_vars.add(var) + + def _insert_exception_check(self): + """Called before the end of a block to inject checks if raised. + """ + self._insert_exception_variables() + # Do exception check + self._inject_call(eh.exception_check, 'exception_check', + '$exception_check') + + def _remove_unused_temporaries(self): + """ + Remove assignments to unused temporary variables from the + current block. + """ + new_body = [] + replaced_var = {} + for inst in self.current_block.body: + # the same temporary is assigned to multiple variables in cases + # like a = b[i] = 1, so need to handle replaced temporaries in + # later setitem/setattr nodes + if (isinstance(inst, (ir.SetItem, ir.SetAttr)) + and inst.value.name in replaced_var): + inst.value = replaced_var[inst.value.name] + elif isinstance(inst, ir.Assign): + if (inst.target.is_temp + and inst.target.name in self.assigner.unused_dests): + continue + # the same temporary is assigned to multiple variables in cases + # like a = b = 1, so need to handle replaced temporaries in + # later assignments + if (isinstance(inst.value, ir.Var) + and inst.value.name in replaced_var): + inst.value = replaced_var[inst.value.name] + new_body.append(inst) + continue + # chained unpack cases may reuse temporary + # e.g. 
a = (b, c) = (x, y) + if (isinstance(inst.value, ir.Expr) + and inst.value.op == "exhaust_iter" + and inst.value.value.name in replaced_var): + inst.value.value = replaced_var[inst.value.value.name] + new_body.append(inst) + continue + # eliminate temporary variables that are assigned to user + # variables right after creation. E.g.: + # $1 = f(); a = $1 -> a = f() + # the temporary variable is not reused elsewhere since CPython + # bytecode is stack-based and this pattern corresponds to a pop + if (isinstance(inst.value, ir.Var) and inst.value.is_temp + and new_body and isinstance(new_body[-1], ir.Assign)): + prev_assign = new_body[-1] + # _var_used_in_binop check makes sure we don't create a new + # inplace binop operation which can fail + # (see TestFunctionType.test_in_iter_func_call) + if (prev_assign.target.name == inst.value.name + and not self._var_used_in_binop( + inst.target.name, prev_assign.value)): + replaced_var[inst.value.name] = inst.target + prev_assign.target = inst.target + # replace temp var definition in target with proper defs + self.definitions[inst.target.name].remove(inst.value) + self.definitions[inst.target.name].extend( + self.definitions.pop(inst.value.name) + ) + continue + + new_body.append(inst) + + self.current_block.body = new_body + + def _var_used_in_binop(self, varname, expr): + """return True if 'expr' is a binary expression and 'varname' is used + in it as an argument + """ + return (isinstance(expr, ir.Expr) + and expr.op in ("binop", "inplace_binop") + and (varname == expr.lhs.name or varname == expr.rhs.name)) + + def _insert_outgoing_phis(self): + """ + Add assignments to forward requested outgoing values + to subsequent blocks. 
+ """ + for phiname, varname in self.dfainfo.outgoing_phis.items(): + target = self.current_scope.get_or_define(phiname, + loc=self.loc) + stmt = ir.Assign(value=self.get(varname), target=target, + loc=self.loc) + self.definitions[target.name].append(stmt.value) + if not self.current_block.is_terminated: + self.current_block.append(stmt) + else: + self.current_block.insert_before_terminator(stmt) + + def get_global_value(self, name): + """ + Get a global value from the func_global (first) or + as a builtins (second). If both failed, return a ir.UNDEFINED. + """ + try: + return self.func_id.func.__globals__[name] + except KeyError: + return getattr(builtins, name, ir.UNDEFINED) + + def get_closure_value(self, index): + """ + Get a value from the cell contained in this function's closure. + If not set, return a ir.UNDEFINED. + """ + cell = self.func_id.func.__closure__[index] + try: + return cell.cell_contents + except ValueError: + return ir.UNDEFINED + + @property + def current_scope(self): + return self.scopes[-1] + + @property + def code_consts(self): + return self.bytecode.co_consts + + @property + def code_locals(self): + return self.bytecode.co_varnames + + @property + def code_names(self): + return self.bytecode.co_names + + @property + def code_cellvars(self): + return self.bytecode.co_cellvars + + @property + def code_freevars(self): + return self.bytecode.co_freevars + + def _dispatch(self, inst, kws): + assert self.current_block is not None + fname = "op_%s" % inst.opname.replace('+', '_') + try: + fn = getattr(self, fname) + except AttributeError: + raise NotImplementedError(inst) + else: + try: + return fn(inst, **kws) + except errors.NotDefinedError as e: + if e.loc is None: + loc = self.loc + else: + loc = e.loc + + err = errors.NotDefinedError(e.name, loc=loc) + if not config.FULL_TRACEBACKS: + raise err from None + else: + raise err + + # --- Scope operations --- + + def store(self, value, name, redefine=False): + """ + Store *value* (a Expr or Var 
instance) into the variable named *name* + (a str object). Returns the target variable. + """ + if redefine or self.current_block_offset in self.cfa.backbone: + rename = not (name in self.code_cellvars) + target = self.current_scope.redefine(name, loc=self.loc, + rename=rename) + else: + target = self.current_scope.get_or_define(name, loc=self.loc) + if isinstance(value, ir.Var): + value = self.assigner.assign(value, target) + stmt = ir.Assign(value=value, target=target, loc=self.loc) + self.current_block.append(stmt) + self.definitions[target.name].append(value) + return target + + def get(self, name): + """ + Get the variable (a Var instance) with the given *name*. + """ + # Implicit argument for comprehension starts with '.' + # See Parameter class in inspect.py (from Python source) + if name[0] == '.' and name[1:].isdigit(): + name = 'implicit{}'.format(name[1:]) + + # Try to simplify the variable lookup by returning an earlier + # variable assigned to *name*. + var = self.assigner.get_assignment_source(name) + if var is None: + var = self.current_scope.get(name) + return var + + # --- Block operations --- + + def insert_block(self, offset, scope=None, loc=None): + scope = scope or self.current_scope + loc = loc or self.loc + blk = ir.Block(scope=scope, loc=loc) + self.blocks[offset] = blk + self.current_block = blk + self.current_block_offset = offset + return blk + + # --- Bytecode handlers --- + + def op_NOP(self, inst): + pass + + def op_PRINT_ITEM(self, inst, item, printvar, res): + item = self.get(item) + printgv = ir.Global("print", print, loc=self.loc) + self.store(value=printgv, name=printvar) + call = ir.Expr.call(self.get(printvar), (item,), (), loc=self.loc) + self.store(value=call, name=res) + + def op_PRINT_NEWLINE(self, inst, printvar, res): + printgv = ir.Global("print", print, loc=self.loc) + self.store(value=printgv, name=printvar) + call = ir.Expr.call(self.get(printvar), (), (), loc=self.loc) + self.store(value=call, name=res) + + def 
op_UNPACK_SEQUENCE(self, inst, iterable, stores, tupleobj): + count = len(stores) + # Exhaust the iterable into a tuple-like object + tup = ir.Expr.exhaust_iter(value=self.get(iterable), loc=self.loc, + count=count) + self.store(name=tupleobj, value=tup) + + # then index the tuple-like object to extract the values + for i, st in enumerate(stores): + expr = ir.Expr.static_getitem(self.get(tupleobj), + index=i, index_var=None, + loc=self.loc) + self.store(expr, st) + + def op_FORMAT_VALUE(self, inst, value, res, strvar): + """ + FORMAT_VALUE(flags): flags argument specifies format spec which is not + supported yet. Currently, str() is simply called on the value. + https://docs.python.org/3/library/dis.html#opcode-FORMAT_VALUE + """ + value = self.get(value) + strgv = ir.Global("str", str, loc=self.loc) + self.store(value=strgv, name=strvar) + call = ir.Expr.call(self.get(strvar), (value,), (), loc=self.loc) + self.store(value=call, name=res) + + def op_BUILD_STRING(self, inst, strings, tmps): + """ + BUILD_STRING(count): Concatenates count strings. + Required for supporting f-strings. 
+ https://docs.python.org/3/library/dis.html#opcode-BUILD_STRING + """ + count = inst.arg + # corner case: f"" + if count == 0: + const = ir.Const("", loc=self.loc) + self.store(const, tmps[-1]) + return + + prev = self.get(strings[0]) + for other, tmp in zip(strings[1:], tmps): + other = self.get(other) + expr = ir.Expr.binop( + operator.add, lhs=prev, rhs=other, loc=self.loc + ) + self.store(expr, tmp) + prev = self.get(tmp) + + def op_BUILD_SLICE(self, inst, start, stop, step, res, slicevar): + start = self.get(start) + stop = self.get(stop) + + slicegv = ir.Global("slice", slice, loc=self.loc) + self.store(value=slicegv, name=slicevar) + + if step is None: + sliceinst = ir.Expr.call(self.get(slicevar), (start, stop), (), + loc=self.loc) + else: + step = self.get(step) + sliceinst = ir.Expr.call(self.get(slicevar), (start, stop, step), + (), loc=self.loc) + self.store(value=sliceinst, name=res) + + def op_SLICE_0(self, inst, base, res, slicevar, indexvar, nonevar): + base = self.get(base) + + slicegv = ir.Global("slice", slice, loc=self.loc) + self.store(value=slicegv, name=slicevar) + + nonegv = ir.Const(None, loc=self.loc) + self.store(value=nonegv, name=nonevar) + none = self.get(nonevar) + + index = ir.Expr.call(self.get(slicevar), (none, none), (), loc=self.loc) + self.store(value=index, name=indexvar) + + expr = ir.Expr.getitem(base, self.get(indexvar), loc=self.loc) + self.store(value=expr, name=res) + + def op_SLICE_1(self, inst, base, start, nonevar, res, slicevar, indexvar): + base = self.get(base) + start = self.get(start) + + nonegv = ir.Const(None, loc=self.loc) + self.store(value=nonegv, name=nonevar) + none = self.get(nonevar) + + slicegv = ir.Global("slice", slice, loc=self.loc) + self.store(value=slicegv, name=slicevar) + + index = ir.Expr.call(self.get(slicevar), (start, none), (), + loc=self.loc) + self.store(value=index, name=indexvar) + + expr = ir.Expr.getitem(base, self.get(indexvar), loc=self.loc) + self.store(value=expr, name=res) + + 
def op_SLICE_2(self, inst, base, nonevar, stop, res, slicevar, indexvar): + base = self.get(base) + stop = self.get(stop) + + nonegv = ir.Const(None, loc=self.loc) + self.store(value=nonegv, name=nonevar) + none = self.get(nonevar) + + slicegv = ir.Global("slice", slice, loc=self.loc) + self.store(value=slicegv, name=slicevar) + + index = ir.Expr.call(self.get(slicevar), (none, stop,), (), + loc=self.loc) + self.store(value=index, name=indexvar) + + expr = ir.Expr.getitem(base, self.get(indexvar), loc=self.loc) + self.store(value=expr, name=res) + + def op_SLICE_3(self, inst, base, start, stop, res, slicevar, indexvar): + base = self.get(base) + start = self.get(start) + stop = self.get(stop) + + slicegv = ir.Global("slice", slice, loc=self.loc) + self.store(value=slicegv, name=slicevar) + + index = ir.Expr.call(self.get(slicevar), (start, stop), (), + loc=self.loc) + self.store(value=index, name=indexvar) + + expr = ir.Expr.getitem(base, self.get(indexvar), loc=self.loc) + self.store(value=expr, name=res) + + def op_STORE_SLICE_0(self, inst, base, value, slicevar, indexvar, nonevar): + base = self.get(base) + + slicegv = ir.Global("slice", slice, loc=self.loc) + self.store(value=slicegv, name=slicevar) + + nonegv = ir.Const(None, loc=self.loc) + self.store(value=nonegv, name=nonevar) + none = self.get(nonevar) + + index = ir.Expr.call(self.get(slicevar), (none, none), (), loc=self.loc) + self.store(value=index, name=indexvar) + + stmt = ir.SetItem(base, self.get(indexvar), self.get(value), + loc=self.loc) + self.current_block.append(stmt) + + def op_STORE_SLICE_1(self, inst, base, start, nonevar, value, slicevar, + indexvar): + base = self.get(base) + start = self.get(start) + + nonegv = ir.Const(None, loc=self.loc) + self.store(value=nonegv, name=nonevar) + none = self.get(nonevar) + + slicegv = ir.Global("slice", slice, loc=self.loc) + self.store(value=slicegv, name=slicevar) + + index = ir.Expr.call(self.get(slicevar), (start, none), (), + loc=self.loc) + 
self.store(value=index, name=indexvar) + + stmt = ir.SetItem(base, self.get(indexvar), self.get(value), + loc=self.loc) + self.current_block.append(stmt) + + def op_STORE_SLICE_2(self, inst, base, nonevar, stop, value, slicevar, + indexvar): + base = self.get(base) + stop = self.get(stop) + + nonegv = ir.Const(None, loc=self.loc) + self.store(value=nonegv, name=nonevar) + none = self.get(nonevar) + + slicegv = ir.Global("slice", slice, loc=self.loc) + self.store(value=slicegv, name=slicevar) + + index = ir.Expr.call(self.get(slicevar), (none, stop,), (), + loc=self.loc) + self.store(value=index, name=indexvar) + + stmt = ir.SetItem(base, self.get(indexvar), self.get(value), + loc=self.loc) + self.current_block.append(stmt) + + def op_STORE_SLICE_3(self, inst, base, start, stop, value, slicevar, + indexvar): + base = self.get(base) + start = self.get(start) + stop = self.get(stop) + + slicegv = ir.Global("slice", slice, loc=self.loc) + self.store(value=slicegv, name=slicevar) + + index = ir.Expr.call(self.get(slicevar), (start, stop), (), + loc=self.loc) + self.store(value=index, name=indexvar) + stmt = ir.SetItem(base, self.get(indexvar), self.get(value), + loc=self.loc) + self.current_block.append(stmt) + + def op_DELETE_SLICE_0(self, inst, base, slicevar, indexvar, nonevar): + base = self.get(base) + + slicegv = ir.Global("slice", slice, loc=self.loc) + self.store(value=slicegv, name=slicevar) + + nonegv = ir.Const(None, loc=self.loc) + self.store(value=nonegv, name=nonevar) + none = self.get(nonevar) + + index = ir.Expr.call(self.get(slicevar), (none, none), (), loc=self.loc) + self.store(value=index, name=indexvar) + + stmt = ir.DelItem(base, self.get(indexvar), loc=self.loc) + self.current_block.append(stmt) + + def op_DELETE_SLICE_1(self, inst, base, start, nonevar, slicevar, indexvar): + base = self.get(base) + start = self.get(start) + + nonegv = ir.Const(None, loc=self.loc) + self.store(value=nonegv, name=nonevar) + none = self.get(nonevar) + + slicegv = 
ir.Global("slice", slice, loc=self.loc) + self.store(value=slicegv, name=slicevar) + + index = ir.Expr.call(self.get(slicevar), (start, none), (), + loc=self.loc) + self.store(value=index, name=indexvar) + + stmt = ir.DelItem(base, self.get(indexvar), loc=self.loc) + self.current_block.append(stmt) + + def op_DELETE_SLICE_2(self, inst, base, nonevar, stop, slicevar, indexvar): + base = self.get(base) + stop = self.get(stop) + + nonegv = ir.Const(None, loc=self.loc) + self.store(value=nonegv, name=nonevar) + none = self.get(nonevar) + + slicegv = ir.Global("slice", slice, loc=self.loc) + self.store(value=slicegv, name=slicevar) + + index = ir.Expr.call(self.get(slicevar), (none, stop,), (), + loc=self.loc) + self.store(value=index, name=indexvar) + + stmt = ir.DelItem(base, self.get(indexvar), loc=self.loc) + self.current_block.append(stmt) + + def op_DELETE_SLICE_3(self, inst, base, start, stop, slicevar, indexvar): + base = self.get(base) + start = self.get(start) + stop = self.get(stop) + + slicegv = ir.Global("slice", slice, loc=self.loc) + self.store(value=slicegv, name=slicevar) + + index = ir.Expr.call(self.get(slicevar), (start, stop), (), + loc=self.loc) + self.store(value=index, name=indexvar) + stmt = ir.DelItem(base, self.get(indexvar), loc=self.loc) + self.current_block.append(stmt) + + def op_LOAD_FAST(self, inst, res): + srcname = self.code_locals[inst.arg] + self.store(value=self.get(srcname), name=res) + + def op_STORE_FAST(self, inst, value): + dstname = self.code_locals[inst.arg] + value = self.get(value) + self.store(value=value, name=dstname) + + def op_DELETE_FAST(self, inst): + dstname = self.code_locals[inst.arg] + self.current_block.append(ir.Del(dstname, loc=self.loc)) + + def op_DUP_TOPX(self, inst, orig, duped): + for src, dst in zip(orig, duped): + self.store(value=self.get(src), name=dst) + + op_DUP_TOP = op_DUP_TOPX + op_DUP_TOP_TWO = op_DUP_TOPX + + def op_STORE_ATTR(self, inst, target, value): + attr = self.code_names[inst.arg] + sa 
= ir.SetAttr(target=self.get(target), value=self.get(value), + attr=attr, loc=self.loc) + self.current_block.append(sa) + + def op_DELETE_ATTR(self, inst, target): + attr = self.code_names[inst.arg] + sa = ir.DelAttr(target=self.get(target), attr=attr, loc=self.loc) + self.current_block.append(sa) + + def op_LOAD_ATTR(self, inst, item, res): + item = self.get(item) + attr = self.code_names[inst.arg] + getattr = ir.Expr.getattr(item, attr, loc=self.loc) + self.store(getattr, res) + + def op_LOAD_CONST(self, inst, res): + value = self.code_consts[inst.arg] + if isinstance(value, tuple): + st = [] + for x in value: + nm = '$const_%s' % str(x) + val_const = ir.Const(x, loc=self.loc) + target = self.store(val_const, name=nm, redefine=True) + st.append(target) + const = ir.Expr.build_tuple(st, loc=self.loc) + elif isinstance(value, frozenset): + st = [] + for x in value: + nm = '$const_%s' % str(x) + val_const = ir.Const(x, loc=self.loc) + target = self.store(val_const, name=nm, redefine=True) + st.append(target) + const = ir.Expr.build_set(st, loc=self.loc) + else: + const = ir.Const(value, loc=self.loc) + self.store(const, res) + + def op_LOAD_GLOBAL(self, inst, res): + name = self.code_names[inst.arg] + value = self.get_global_value(name) + gl = ir.Global(name, value, loc=self.loc) + self.store(gl, res) + + def op_LOAD_DEREF(self, inst, res): + n_cellvars = len(self.code_cellvars) + if inst.arg < n_cellvars: + name = self.code_cellvars[inst.arg] + gl = self.get(name) + else: + idx = inst.arg - n_cellvars + name = self.code_freevars[idx] + value = self.get_closure_value(idx) + gl = ir.FreeVar(idx, name, value, loc=self.loc) + self.store(gl, res) + + def op_STORE_DEREF(self, inst, value): + n_cellvars = len(self.code_cellvars) + if inst.arg < n_cellvars: + dstname = self.code_cellvars[inst.arg] + else: + dstname = self.code_freevars[inst.arg - n_cellvars] + value = self.get(value) + self.store(value=value, name=dstname) + + def op_SETUP_LOOP(self, inst): + assert 
self.blocks[inst.offset] is self.current_block + loop = ir.Loop(inst.offset, exit=(inst.next + inst.arg)) + self.syntax_blocks.append(loop) + + def op_SETUP_WITH(self, inst, contextmanager, exitfn=None): + assert self.blocks[inst.offset] is self.current_block + # Handle with + exitpt = inst.next + inst.arg + wth = ir.With(inst.offset, exit=exitpt) + self.syntax_blocks.append(wth) + ctxmgr = self.get(contextmanager) + self.current_block.append(ir.EnterWith(contextmanager=ctxmgr, + begin=inst.offset, + end=exitpt, loc=self.loc,)) + + # Store exit fn + exit_fn_obj = ir.Const(None, loc=self.loc) + self.store(value=exit_fn_obj, name=exitfn) + + def op_SETUP_EXCEPT(self, inst): + # Removed since python3.8 + self._insert_try_block_begin() + + def op_SETUP_FINALLY(self, inst): + self._insert_try_block_begin() + + def op_WITH_CLEANUP(self, inst): + "no-op" + + def op_WITH_CLEANUP_START(self, inst): + "no-op" + + def op_WITH_CLEANUP_FINISH(self, inst): + "no-op" + + def op_END_FINALLY(self, inst): + "no-op" + + def op_BEGIN_FINALLY(self, inst, temps): + # The *temps* are the exception variables + const_none = ir.Const(None, loc=self.loc) + for tmp in temps: + # Set to None for now + self.store(const_none, name=tmp) + self._exception_vars.add(tmp) + + if PYVERSION < (3, 6): + + def op_CALL_FUNCTION(self, inst, func, args, kws, res, vararg): + func = self.get(func) + args = [self.get(x) for x in args] + if vararg is not None: + vararg = self.get(vararg) + + # Process keywords + keyvalues = [] + removethese = [] + for k, v in kws: + k, v = self.get(k), self.get(v) + for inst in self.current_block.body: + if isinstance(inst, ir.Assign) and inst.target is k: + removethese.append(inst) + keyvalues.append((inst.value.value, v)) + + # Remove keyword constant statements + for inst in removethese: + self.current_block.remove(inst) + + expr = ir.Expr.call(func, args, keyvalues, loc=self.loc, + vararg=vararg) + self.store(expr, res) + + op_CALL_FUNCTION_VAR = op_CALL_FUNCTION + else: + 
def op_CALL_FUNCTION(self, inst, func, args, res): + func = self.get(func) + args = [self.get(x) for x in args] + expr = ir.Expr.call(func, args, (), loc=self.loc) + self.store(expr, res) + + def op_CALL_FUNCTION_KW(self, inst, func, args, names, res): + func = self.get(func) + args = [self.get(x) for x in args] + # Find names const + names = self.get(names) + for inst in self.current_block.body: + if isinstance(inst, ir.Assign) and inst.target is names: + self.current_block.remove(inst) + # scan up the block looking for the values, remove them + # and find their name strings + named_items = [] + for x in inst.value.items: + for y in self.current_block.body[::-1]: + if x == y.target: + self.current_block.remove(y) + named_items.append(y.value.value) + break + keys = named_items + break + + nkeys = len(keys) + posvals = args[:-nkeys] + kwvals = args[-nkeys:] + keyvalues = list(zip(keys, kwvals)) + + expr = ir.Expr.call(func, posvals, keyvalues, loc=self.loc) + self.store(expr, res) + + def op_CALL_FUNCTION_EX(self, inst, func, vararg, varkwarg, res): + func = self.get(func) + vararg = self.get(vararg) + if varkwarg is not None: + varkwarg = self.get(varkwarg) + expr = ir.Expr.call( + func, [], [], loc=self.loc, vararg=vararg, varkwarg=varkwarg + ) + self.store(expr, res) + + def _build_tuple_unpack(self, inst, tuples, temps, is_assign): + first = self.get(tuples[0]) + if is_assign: + # it's assign-like, defer handling to an intrinsic that will have + # type information. + # Can deal with tuples only, i.e. y = (*x,). 
where x = + gv_name = "unpack_single_tuple" + gv_fn = ir.Global(gv_name, unpack_single_tuple, loc=self.loc,) + self.store(value=gv_fn, name=gv_name, redefine=True) + exc = ir.Expr.call(self.get(gv_name), args=(first,), kws=(), + loc=self.loc,) + self.store(exc, temps[0]) + else: + loc = self.loc + for other, tmp in zip(map(self.get, tuples[1:]), temps): + # Emit as `first + tuple(other)` + gv_tuple = ir.Global( + name="tuple", value=tuple, + loc=loc, + ) + tuple_var = self.store( + gv_tuple, "$_list_extend_gv_tuple", redefine=True, + ) + tuplify_val = ir.Expr.call( + tuple_var, (other,), (), + loc=loc, + ) + tuplify_var = self.store(tuplify_val, "$_tuplify", + redefine=True) + out = ir.Expr.binop( + fn=operator.add, lhs=first, rhs=self.get(tuplify_var.name), + loc=self.loc, + ) + self.store(out, tmp) + first = self.get(tmp) + + def op_BUILD_TUPLE_UNPACK_WITH_CALL(self, inst, tuples, temps, is_assign): + # just unpack the input tuple, call inst will be handled afterwards + self._build_tuple_unpack(inst, tuples, temps, is_assign) + + def op_BUILD_TUPLE_UNPACK(self, inst, tuples, temps, is_assign): + self._build_tuple_unpack(inst, tuples, temps, is_assign) + + def op_LIST_TO_TUPLE(self, inst, const_list, res): + expr = ir.Expr.dummy('list_to_tuple', (const_list,), loc=self.loc) + self.store(expr, res) + + def op_BUILD_CONST_KEY_MAP(self, inst, keys, keytmps, values, res): + # Unpack the constant key-tuple and reused build_map which takes + # a sequence of (key, value) pair. + keyvar = self.get(keys) + # TODO: refactor this pattern. occurred several times. 
+ for inst in self.current_block.body: + if isinstance(inst, ir.Assign) and inst.target is keyvar: + self.current_block.remove(inst) + # scan up the block looking for the values, remove them + # and find their name strings + named_items = [] + for x in inst.value.items: + for y in self.current_block.body[::-1]: + if x == y.target: + self.current_block.remove(y) + named_items.append(y.value.value) + break + keytup = named_items + break + assert len(keytup) == len(values) + keyconsts = [ir.Const(value=x, loc=self.loc) for x in keytup] + for kval, tmp in zip(keyconsts, keytmps): + self.store(kval, tmp) + items = list(zip(map(self.get, keytmps), map(self.get, values))) + + # sort out literal values + literal_items = [] + for v in values: + defns = self.definitions[v] + if len(defns) != 1: + break + defn = defns[0] + if not isinstance(defn, ir.Const): + break + literal_items.append(defn.value) + + def resolve_const(v): + defns = self.definitions[v] + if len(defns) != 1: + return _UNKNOWN_VALUE(self.get(v).name) + defn = defns[0] + if not isinstance(defn, ir.Const): + return _UNKNOWN_VALUE(self.get(v).name) + return defn.value + + if len(literal_items) != len(values): + literal_dict = {x: resolve_const(y) for x, y in + zip(keytup, values)} + else: + literal_dict = {x:y for x, y in zip(keytup, literal_items)} + + # to deal with things like {'a': 1, 'a': 'cat', 'b': 2, 'a': 2j} + # store the index of the actual used value for a given key, this is + # used when lowering to pull the right value out into the tuple repr + # of a mixed value type dictionary. 
+ value_indexes = {} + for i, k in enumerate(keytup): + value_indexes[k] = i + + expr = ir.Expr.build_map(items=items, + size=2, + literal_value=literal_dict, + value_indexes=value_indexes, + loc=self.loc) + + self.store(expr, res) + + def op_GET_ITER(self, inst, value, res): + expr = ir.Expr.getiter(value=self.get(value), loc=self.loc) + self.store(expr, res) + + def op_FOR_ITER(self, inst, iterator, pair, indval, pred): + """ + Assign new block other this instruction. + """ + assert inst.offset in self.blocks, "FOR_ITER must be block head" + + # Emit code + val = self.get(iterator) + + pairval = ir.Expr.iternext(value=val, loc=self.loc) + self.store(pairval, pair) + + iternext = ir.Expr.pair_first(value=self.get(pair), loc=self.loc) + self.store(iternext, indval) + + isvalid = ir.Expr.pair_second(value=self.get(pair), loc=self.loc) + self.store(isvalid, pred) + + # Conditional jump + br = ir.Branch(cond=self.get(pred), truebr=inst.next, + falsebr=inst.get_jump_target(), + loc=self.loc) + self.current_block.append(br) + + def op_BINARY_SUBSCR(self, inst, target, index, res): + index = self.get(index) + target = self.get(target) + expr = ir.Expr.getitem(target, index=index, loc=self.loc) + self.store(expr, res) + + def op_STORE_SUBSCR(self, inst, target, index, value): + index = self.get(index) + target = self.get(target) + value = self.get(value) + stmt = ir.SetItem(target=target, index=index, value=value, + loc=self.loc) + self.current_block.append(stmt) + + def op_DELETE_SUBSCR(self, inst, target, index): + index = self.get(index) + target = self.get(target) + stmt = ir.DelItem(target=target, index=index, loc=self.loc) + self.current_block.append(stmt) + + def op_BUILD_TUPLE(self, inst, items, res): + expr = ir.Expr.build_tuple(items=[self.get(x) for x in items], + loc=self.loc) + self.store(expr, res) + + def op_BUILD_LIST(self, inst, items, res): + expr = ir.Expr.build_list(items=[self.get(x) for x in items], + loc=self.loc) + self.store(expr, res) + + def 
op_BUILD_SET(self, inst, items, res): + expr = ir.Expr.build_set(items=[self.get(x) for x in items], + loc=self.loc) + self.store(expr, res) + + def op_SET_UPDATE(self, inst, target, value, updatevar, res): + target = self.get(target) + value = self.get(value) + updateattr = ir.Expr.getattr(target, 'update', loc=self.loc) + self.store(value=updateattr, name=updatevar) + updateinst = ir.Expr.call(self.get(updatevar), (value,), (), + loc=self.loc) + self.store(value=updateinst, name=res) + + def op_DICT_UPDATE(self, inst, target, value, updatevar, res): + target = self.get(target) + value = self.get(value) + # We generate _update_from_bytecode instead of update so we can + # differentiate between user .update() calls and those from the + # bytecode. This is then used to recombine dictionaries in peephole + # optimizations. See the dicussion in this PR about why: + # https://github.com/numba/numba/pull/7964/files#r868229306 + updateattr = ir.Expr.getattr( + target, '_update_from_bytecode', loc=self.loc + ) + self.store(value=updateattr, name=updatevar) + updateinst = ir.Expr.call(self.get(updatevar), (value,), (), + loc=self.loc) + self.store(value=updateinst, name=res) + + def op_BUILD_MAP(self, inst, items, size, res): + got_items = [(self.get(k), self.get(v)) for k, v in items] + + # sort out literal values, this is a bit contrived but is to handle + # situations like `{1: 10, 1: 10}` where the size of the literal dict + # is smaller than the definition + def get_literals(target): + literal_items = [] + values = [self.get(v.name) for v in target] + for v in values: + defns = self.definitions[v.name] + if len(defns) != 1: + break + defn = defns[0] + if not isinstance(defn, ir.Const): + break + literal_items.append(defn.value) + return literal_items + + literal_keys = get_literals(x[0] for x in got_items) + literal_values = get_literals(x[1] for x in got_items) + + has_literal_keys = len(literal_keys) == len(got_items) + has_literal_values = len(literal_values) == 
len(got_items) + + value_indexes = {} + if not has_literal_keys and not has_literal_values: + literal_dict = None + elif has_literal_keys and not has_literal_values: + literal_dict = {x: _UNKNOWN_VALUE(y[1]) for x, y in + zip(literal_keys, got_items)} + for i, k in enumerate(literal_keys): + value_indexes[k] = i + else: + literal_dict = {x: y for x, y in zip(literal_keys, literal_values)} + for i, k in enumerate(literal_keys): + value_indexes[k] = i + + expr = ir.Expr.build_map(items=got_items, size=size, + literal_value=literal_dict, + value_indexes=value_indexes, + loc=self.loc) + self.store(expr, res) + + def op_STORE_MAP(self, inst, dct, key, value): + stmt = ir.StoreMap(dct=self.get(dct), key=self.get(key), + value=self.get(value), loc=self.loc) + self.current_block.append(stmt) + + def op_UNARY_NEGATIVE(self, inst, value, res): + value = self.get(value) + expr = ir.Expr.unary('-', value=value, loc=self.loc) + return self.store(expr, res) + + def op_UNARY_POSITIVE(self, inst, value, res): + value = self.get(value) + expr = ir.Expr.unary('+', value=value, loc=self.loc) + return self.store(expr, res) + + def op_UNARY_INVERT(self, inst, value, res): + value = self.get(value) + expr = ir.Expr.unary('~', value=value, loc=self.loc) + return self.store(expr, res) + + def op_UNARY_NOT(self, inst, value, res): + value = self.get(value) + expr = ir.Expr.unary('not', value=value, loc=self.loc) + return self.store(expr, res) + + def _binop(self, op, lhs, rhs, res): + op = BINOPS_TO_OPERATORS[op] + lhs = self.get(lhs) + rhs = self.get(rhs) + expr = ir.Expr.binop(op, lhs=lhs, rhs=rhs, loc=self.loc) + self.store(expr, res) + + def _inplace_binop(self, op, lhs, rhs, res): + immuop = BINOPS_TO_OPERATORS[op] + op = INPLACE_BINOPS_TO_OPERATORS[op + '='] + lhs = self.get(lhs) + rhs = self.get(rhs) + expr = ir.Expr.inplace_binop(op, immuop, lhs=lhs, rhs=rhs, + loc=self.loc) + self.store(expr, res) + + def op_BINARY_ADD(self, inst, lhs, rhs, res): + self._binop('+', lhs, rhs, res) 
+ + def op_BINARY_SUBTRACT(self, inst, lhs, rhs, res): + self._binop('-', lhs, rhs, res) + + def op_BINARY_MULTIPLY(self, inst, lhs, rhs, res): + self._binop('*', lhs, rhs, res) + + def op_BINARY_DIVIDE(self, inst, lhs, rhs, res): + self._binop('/?', lhs, rhs, res) + + def op_BINARY_TRUE_DIVIDE(self, inst, lhs, rhs, res): + self._binop('/', lhs, rhs, res) + + def op_BINARY_FLOOR_DIVIDE(self, inst, lhs, rhs, res): + self._binop('//', lhs, rhs, res) + + def op_BINARY_MODULO(self, inst, lhs, rhs, res): + self._binop('%', lhs, rhs, res) + + def op_BINARY_POWER(self, inst, lhs, rhs, res): + self._binop('**', lhs, rhs, res) + + def op_BINARY_MATRIX_MULTIPLY(self, inst, lhs, rhs, res): + self._binop('@', lhs, rhs, res) + + def op_BINARY_LSHIFT(self, inst, lhs, rhs, res): + self._binop('<<', lhs, rhs, res) + + def op_BINARY_RSHIFT(self, inst, lhs, rhs, res): + self._binop('>>', lhs, rhs, res) + + def op_BINARY_AND(self, inst, lhs, rhs, res): + self._binop('&', lhs, rhs, res) + + def op_BINARY_OR(self, inst, lhs, rhs, res): + self._binop('|', lhs, rhs, res) + + def op_BINARY_XOR(self, inst, lhs, rhs, res): + self._binop('^', lhs, rhs, res) + + def op_INPLACE_ADD(self, inst, lhs, rhs, res): + self._inplace_binop('+', lhs, rhs, res) + + def op_INPLACE_SUBTRACT(self, inst, lhs, rhs, res): + self._inplace_binop('-', lhs, rhs, res) + + def op_INPLACE_MULTIPLY(self, inst, lhs, rhs, res): + self._inplace_binop('*', lhs, rhs, res) + + def op_INPLACE_DIVIDE(self, inst, lhs, rhs, res): + self._inplace_binop('/?', lhs, rhs, res) + + def op_INPLACE_TRUE_DIVIDE(self, inst, lhs, rhs, res): + self._inplace_binop('/', lhs, rhs, res) + + def op_INPLACE_FLOOR_DIVIDE(self, inst, lhs, rhs, res): + self._inplace_binop('//', lhs, rhs, res) + + def op_INPLACE_MODULO(self, inst, lhs, rhs, res): + self._inplace_binop('%', lhs, rhs, res) + + def op_INPLACE_POWER(self, inst, lhs, rhs, res): + self._inplace_binop('**', lhs, rhs, res) + + def op_INPLACE_MATRIX_MULTIPLY(self, inst, lhs, rhs, res): + 
self._inplace_binop('@', lhs, rhs, res) + + def op_INPLACE_LSHIFT(self, inst, lhs, rhs, res): + self._inplace_binop('<<', lhs, rhs, res) + + def op_INPLACE_RSHIFT(self, inst, lhs, rhs, res): + self._inplace_binop('>>', lhs, rhs, res) + + def op_INPLACE_AND(self, inst, lhs, rhs, res): + self._inplace_binop('&', lhs, rhs, res) + + def op_INPLACE_OR(self, inst, lhs, rhs, res): + self._inplace_binop('|', lhs, rhs, res) + + def op_INPLACE_XOR(self, inst, lhs, rhs, res): + self._inplace_binop('^', lhs, rhs, res) + + def op_JUMP_ABSOLUTE(self, inst): + jmp = ir.Jump(inst.get_jump_target(), loc=self.loc) + self.current_block.append(jmp) + + def op_JUMP_FORWARD(self, inst): + jmp = ir.Jump(inst.get_jump_target(), loc=self.loc) + self.current_block.append(jmp) + + def op_POP_BLOCK(self, inst, kind=None): + if kind is None: + self.syntax_blocks.pop() + elif kind == 'with': + d = ir.PopBlock(loc=self.loc) + self.current_block.append(d) + elif kind == 'try': + self._insert_try_block_end() + + def op_RETURN_VALUE(self, inst, retval, castval): + self.store(ir.Expr.cast(self.get(retval), loc=self.loc), castval) + ret = ir.Return(self.get(castval), loc=self.loc) + self.current_block.append(ret) + + def op_COMPARE_OP(self, inst, lhs, rhs, res): + op = dis.cmp_op[inst.arg] + if op == 'in' or op == 'not in': + lhs, rhs = rhs, lhs + + if op == 'not in': + self._binop('in', lhs, rhs, res) + tmp = self.get(res) + out = ir.Expr.unary('not', value=tmp, loc=self.loc) + self.store(out, res) + elif op == 'exception match': + gv_fn = ir.Global( + "exception_match", eh.exception_match, loc=self.loc, + ) + exc_match_name = '$exc_match' + self.store(value=gv_fn, name=exc_match_name, redefine=True) + lhs = self.get(lhs) + rhs = self.get(rhs) + exc = ir.Expr.call( + self.get(exc_match_name), args=(lhs, rhs), kws=(), loc=self.loc, + ) + self.store(exc, res) + else: + self._binop(op, lhs, rhs, res) + + def op_IS_OP(self, inst, lhs, rhs, res): + # invert if op case is 1 + op = 'is not' if inst.arg == 
1 else 'is' + self._binop(op, lhs, rhs, res) + + def op_CONTAINS_OP(self, inst, lhs, rhs, res): + lhs, rhs = rhs, lhs + self._binop('in', lhs, rhs, res) + # invert if op case is 1 + if inst.arg == 1: + tmp = self.get(res) + out = ir.Expr.unary('not', value=tmp, loc=self.loc) + self.store(out, res) + + def op_BREAK_LOOP(self, inst, end=None): + if end is None: + loop = self.syntax_blocks[-1] + assert isinstance(loop, ir.Loop) + end = loop.exit + jmp = ir.Jump(target=end, loc=self.loc) + self.current_block.append(jmp) + + def _op_JUMP_IF(self, inst, pred, iftrue): + brs = { + True: inst.get_jump_target(), + False: inst.next, + } + truebr = brs[iftrue] + falsebr = brs[not iftrue] + + name = "bool%s" % (inst.offset) + gv_fn = ir.Global("bool", bool, loc=self.loc) + self.store(value=gv_fn, name=name) + + callres = ir.Expr.call(self.get(name), (self.get(pred),), (), + loc=self.loc) + + pname = "$%spred" % (inst.offset) + predicate = self.store(value=callres, name=pname) + bra = ir.Branch(cond=predicate, truebr=truebr, falsebr=falsebr, + loc=self.loc) + self.current_block.append(bra) + + def op_JUMP_IF_FALSE(self, inst, pred): + self._op_JUMP_IF(inst, pred=pred, iftrue=False) + + def op_JUMP_IF_TRUE(self, inst, pred): + self._op_JUMP_IF(inst, pred=pred, iftrue=True) + + def op_POP_JUMP_IF_FALSE(self, inst, pred): + self._op_JUMP_IF(inst, pred=pred, iftrue=False) + + def op_POP_JUMP_IF_TRUE(self, inst, pred): + self._op_JUMP_IF(inst, pred=pred, iftrue=True) + + def op_JUMP_IF_FALSE_OR_POP(self, inst, pred): + self._op_JUMP_IF(inst, pred=pred, iftrue=False) + + def op_JUMP_IF_TRUE_OR_POP(self, inst, pred): + self._op_JUMP_IF(inst, pred=pred, iftrue=True) + + def op_JUMP_IF_NOT_EXC_MATCH(self, inst, pred, tos, tos1): + truebr = inst.next + falsebr = inst.get_jump_target() + gv_fn = ir.Global( + "exception_match", eh.exception_match, loc=self.loc, + ) + exc_match_name = '$exc_match' + self.store(value=gv_fn, name=exc_match_name, redefine=True) + lhs = self.get(tos1) + rhs = 
self.get(tos) + exc = ir.Expr.call( + self.get(exc_match_name), args=(lhs, rhs), kws=(), loc=self.loc, + ) + predicate = self.store(exc, pred) + bra = ir.Branch(cond=predicate, truebr=truebr, falsebr=falsebr, + loc=self.loc) + self.current_block.append(bra) + + def op_RERAISE(self, inst, exc): + # Numba can't handle this case and it's caught else where, this is a + # runtime guard in case this is reached by unknown means. + msg = (f"Unreachable condition reached (op code RERAISE executed)" + f"{error_extras['reportable']}") + stmt = ir.StaticRaise(AssertionError, (msg,), self.loc) + self.current_block.append(stmt) + + def op_RAISE_VARARGS(self, inst, exc): + if exc is not None: + exc = self.get(exc) + tryblk = self.dfainfo.active_try_block + if tryblk is not None: + # In a try block + stmt = ir.TryRaise(exception=exc, loc=self.loc) + self.current_block.append(stmt) + self._insert_try_block_end() + self.current_block.append(ir.Jump(tryblk['end'], loc=self.loc)) + else: + # Not in a try block + stmt = ir.Raise(exception=exc, loc=self.loc) + self.current_block.append(stmt) + + def op_YIELD_VALUE(self, inst, value, res): + # initialize index to None. it's being set later in post-processing + index = None + inst = ir.Yield(value=self.get(value), index=index, loc=self.loc) + return self.store(inst, res) + + def op_MAKE_FUNCTION(self, inst, name, code, closure, annotations, + kwdefaults, defaults, res): + # annotations are ignored by numba but useful for static analysis + # re. https://github.com/numba/numba/issues/7269 + if kwdefaults is not None: + msg = "op_MAKE_FUNCTION with kwdefaults is not implemented" + raise NotImplementedError(msg) + if defaults: + if isinstance(defaults, tuple): + defaults = tuple([self.get(name) for name in defaults]) + else: + defaults = self.get(defaults) + + assume_code_const = self.definitions[code][0] + if not isinstance(assume_code_const, ir.Const): + msg = ( + "Unsupported use of closure. 
" + "Probably caused by complex control-flow constructs; " + "e.g. try-except" + ) + raise errors.UnsupportedError(msg, loc=self.loc) + fcode = assume_code_const.value + if name: + name = self.get(name) + if closure: + closure = self.get(closure) + expr = ir.Expr.make_function(name, fcode, closure, defaults, self.loc) + self.store(expr, res) + + def op_MAKE_CLOSURE(self, inst, name, code, closure, annotations, + kwdefaults, defaults, res): + self.op_MAKE_FUNCTION(inst, name, code, closure, annotations, + kwdefaults, defaults, res) + + def op_LOAD_CLOSURE(self, inst, res): + n_cellvars = len(self.code_cellvars) + if inst.arg < n_cellvars: + name = self.code_cellvars[inst.arg] + try: + gl = self.get(name) + except NotDefinedError: + msg = "Unsupported use of op_LOAD_CLOSURE encountered" + raise NotImplementedError(msg) + else: + idx = inst.arg - n_cellvars + name = self.code_freevars[idx] + value = self.get_closure_value(idx) + gl = ir.FreeVar(idx, name, value, loc=self.loc) + self.store(gl, res) + + def op_LIST_APPEND(self, inst, target, value, appendvar, res): + target = self.get(target) + value = self.get(value) + appendattr = ir.Expr.getattr(target, 'append', loc=self.loc) + self.store(value=appendattr, name=appendvar) + appendinst = ir.Expr.call(self.get(appendvar), (value,), (), + loc=self.loc) + self.store(value=appendinst, name=res) + + def op_LIST_EXTEND(self, inst, target, value, extendvar, res): + target = self.get(target) + value = self.get(value) + # If the statements between the current instruction and the target + # are N * consts followed by build_tuple AND the target has no items, + # it's a situation where a list is being statically initialised, rewrite + # the build_tuple as a build_list, drop the extend, and wire up the + # target as the result from the build_tuple that's been rewritten. 
+ + # See if this is the first statement in a block, if so its probably from + # control flow in a tuple unpack like: + # `(*(1, (2,) if predicate else (3,)))` + # this cannot be handled as present so raise + msg = ("An unsupported bytecode sequence has been encountered: " + "op_LIST_EXTEND at the start of a block.\n\nThis could be " + "due to the use of a branch in a tuple unpacking statement.") + if not self.current_block.body: + raise errors.UnsupportedError(msg) + + # is last emitted statement a build_tuple? + stmt = self.current_block.body[-1] + ok = isinstance(stmt.value, ir.Expr) and stmt.value.op == "build_tuple" + # check statements from self.current_block.body[-1] through to target, + # make sure they are consts + build_empty_list = None + if ok: + for stmt in reversed(self.current_block.body[:-1]): + if not isinstance(stmt, ir.Assign): + ok = False + break + # if its not a const, it needs to be the `build_list` for the + # target, else it's something else we don't know about so just + # bail + if isinstance(stmt.value, ir.Const): + continue + + # it's not a const, check for target + elif isinstance(stmt.value, ir.Expr) and stmt.target == target: + build_empty_list = stmt + # it's only ok to do this if the target has no initializer + # already + ok = not stmt.value.items + break + else: + ok = False + break + if ok and build_empty_list is None: + raise errors.UnsupportedError(msg) + if ok: + stmts = self.current_block.body + build_tuple_asgn = self.current_block.body[-1] + # move build list to last issued statement + stmts.append(stmts.pop(stmts.index(build_empty_list))) + # fix the build list + build_tuple = build_tuple_asgn.value + build_list = build_empty_list.value + build_list.items = build_tuple.items + else: + # it's just a list extend with no static init, let it be + extendattr = ir.Expr.getattr(target, 'extend', loc=self.loc) + self.store(value=extendattr, name=extendvar) + extendinst = ir.Expr.call(self.get(extendvar), (value,), (), + 
loc=self.loc) + self.store(value=extendinst, name=res) + + def op_MAP_ADD(self, inst, target, key, value, setitemvar, res): + target = self.get(target) + key = self.get(key) + value = self.get(value) + setitemattr = ir.Expr.getattr(target, '__setitem__', loc=self.loc) + self.store(value=setitemattr, name=setitemvar) + appendinst = ir.Expr.call(self.get(setitemvar), (key, value,), (), + loc=self.loc) + self.store(value=appendinst, name=res) + + def op_LOAD_ASSERTION_ERROR(self, inst, res): + gv_fn = ir.Global("AssertionError", AssertionError, loc=self.loc) + self.store(value=gv_fn, name=res) + + # NOTE: The LOAD_METHOD opcode is implemented as a LOAD_ATTR for ease, + # however this means a new object (the bound-method instance) could be + # created. Conversely, using a pure LOAD_METHOD no intermediary is present + # and it is essentially like a pointer grab and forward to CALL_METHOD. The + # net outcome is that the implementation in Numba produces the same result, + # but in object mode it may be that it runs more slowly than it would if + # run in CPython. 
def fix_divmod(mod):
    """Replace division and remainder instructions with calls to builtins.

    Runs the ``_DivmodFixer`` visitor over *mod*.  That visitor rewrites
    every 64-bit integer ``srem``, ``urem``, ``sdiv`` and ``udiv``
    instruction into a call to a function named ``numba_<opname>``,
    declaring that function in the module first if it is not already
    present.  The module is modified in place; nothing is returned.
    """
    _DivmodFixer().visit(mod)
"trunc", +} + +OTHER_CMATHS = ''' +tan +tanf +sinh +sinhf +cosh +coshf +tanh +tanhf +asin +asinf +acos +acosf +atan +atanf +atan2 +atan2f +asinh +asinhf +acosh +acoshf +atanh +atanhf +expm1 +expm1f +log1p +log1pf +log10 +log10f +fmod +fmodf +round +roundf +'''.split() + +INTR_MATH = frozenset(INTR_TO_CMATH.values()) | frozenset(OTHER_CMATHS) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/ir.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/ir.py new file mode 100644 index 0000000000000000000000000000000000000000..d2b546996a2b1a0225cc951387045af1041a8cfc --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/ir.py @@ -0,0 +1,1630 @@ +from collections import defaultdict +import copy +import itertools +import os +import linecache +import pprint +import re +import sys +import operator +from types import FunctionType, BuiltinFunctionType +from functools import total_ordering +from io import StringIO + +from numba.core import errors, config +from numba.core.utils import (BINOPS_TO_OPERATORS, INPLACE_BINOPS_TO_OPERATORS, + UNARY_BUITINS_TO_OPERATORS, OPERATORS_TO_BUILTINS) +from numba.core.errors import (NotDefinedError, RedefinedError, + VerificationError, ConstantInferenceError) +from numba.core import consts + +# terminal color markup +_termcolor = errors.termcolor() + + +class Loc(object): + """Source location + + """ + _defmatcher = re.compile(r'def\s+(\w+)\(.*') + + def __init__(self, filename, line, col=None, maybe_decorator=False): + """ Arguments: + filename - name of the file + line - line in file + col - column + maybe_decorator - Set to True if location is likely a jit decorator + """ + self.filename = filename + self.line = line + self.col = col + self.lines = None # the source lines from the linecache + self.maybe_decorator = maybe_decorator + + def __eq__(self, other): + # equivalence is solely based on filename, line and col + if type(self) is not type(other): return False + if self.filename != other.filename: 
return False + if self.line != other.line: return False + if self.col != other.col: return False + return True + + def __ne__(self, other): + return not self.__eq__(other) + + @classmethod + def from_function_id(cls, func_id): + return cls(func_id.filename, func_id.firstlineno, maybe_decorator=True) + + def __repr__(self): + return "Loc(filename=%s, line=%s, col=%s)" % (self.filename, + self.line, self.col) + + def __str__(self): + if self.col is not None: + return "%s (%s:%s)" % (self.filename, self.line, self.col) + else: + return "%s (%s)" % (self.filename, self.line) + + def _find_definition(self): + # try and find a def, go backwards from error line + fn_name = None + lines = self.get_lines() + for x in reversed(lines[:self.line - 1]): + # the strip and startswith is to handle user code with commented out + # 'def' or use of 'def' in a docstring. + if x.strip().startswith('def '): + fn_name = x + break + + return fn_name + + def _raw_function_name(self): + defn = self._find_definition() + if defn: + return self._defmatcher.match(defn.strip()).groups()[0] + else: + # Probably exec() or REPL. + return None + + def get_lines(self): + if self.lines is None: + + self.lines = linecache.getlines(self._get_path()) + + return self.lines + + def _get_path(self): + path = None + try: + # Try to get a relative path + # ipython/jupyter input just returns as self.filename + path = os.path.relpath(self.filename) + except ValueError: + # Fallback to absolute path if error occurred in getting the + # relative path. 
+ # This may happen on windows if the drive is different + path = os.path.abspath(self.filename) + return path + + + def strformat(self, nlines_up=2): + + lines = self.get_lines() + + use_line = self.line + + if self.maybe_decorator: + # try and sort out a better `loc`, if it's suspected that this loc + # points at a jit decorator by virtue of + # `__code__.co_firstlineno` + + # get lines, add a dummy entry at the start as lines count from + # 1 but list index counts from 0 + tmplines = [''] + lines + + if lines and use_line and 'def ' not in tmplines[use_line]: + # look forward 10 lines, unlikely anyone managed to stretch + # a jit call declaration over >10 lines?! + min_line = max(0, use_line) + max_line = use_line + 10 + selected = tmplines[min_line : max_line] + index = 0 + for idx, x in enumerate(selected): + if 'def ' in x: + index = idx + break + use_line = use_line + index + + + ret = [] # accumulates output + if lines and use_line: + + def count_spaces(string): + spaces = 0 + for x in itertools.takewhile(str.isspace, str(string)): + spaces += 1 + return spaces + + # A few places in the code still use no `loc` or default to line 1 + # this is often in places where exceptions are used for the purposes + # of flow control. 
As a result max is in use to prevent slice from + # `[negative: positive]` + selected = lines[max(0, use_line - nlines_up):use_line] + + # see if selected contains a definition + def_found = False + for x in selected: + if 'def ' in x: + def_found = True + + # no definition found, try and find one + if not def_found: + # try and find a def, go backwards from error line + fn_name = None + for x in reversed(lines[:use_line - 1]): + if 'def ' in x: + fn_name = x + break + if fn_name: + ret.append(fn_name) + spaces = count_spaces(x) + ret.append(' '*(4 + spaces) + '\n') + + if selected: + ret.extend(selected[:-1]) + ret.append(_termcolor.highlight(selected[-1])) + + # point at the problem with a caret + spaces = count_spaces(selected[-1]) + ret.append(' '*(spaces) + _termcolor.indicate("^")) + + # if in the REPL source may not be available + if not ret: + ret = "" + + err = _termcolor.filename('\nFile "%s", line %d:')+'\n%s' + tmp = err % (self._get_path(), use_line, _termcolor.code(''.join(ret))) + return tmp + + def with_lineno(self, line, col=None): + """ + Return a new Loc with this line number. + """ + return type(self)(self.filename, line, col) + + def short(self): + """ + Returns a short string + """ + shortfilename = os.path.basename(self.filename) + return "%s:%s" % (shortfilename, self.line) + + +# Used for annotating errors when source location is unknown. +unknown_loc = Loc("unknown location", 0, 0) + + +@total_ordering +class SlotEqualityCheckMixin(object): + # some ir nodes are __dict__ free using __slots__ instead, this mixin + # should not trigger the unintended creation of __dict__. 
@total_ordering
class EqualityCheckMixin(object):
    """Mixin for basic equality checking.

    Two objects compare equal when they are of exactly the same type and
    their instance ``__dict__`` contents match once the ``'loc'`` and
    ``'scope'`` entries have been dropped.  Ordering is defined on the
    ``str()`` of the objects; ``total_ordering`` derives the remaining rich
    comparisons from ``__eq__`` and ``__le__``.
    """

    def __eq__(self, other):
        if type(self) is type(other):
            def fixup(adict):
                # location and scope never take part in equality
                bad = ('loc', 'scope')
                d = dict(adict)
                for x in bad:
                    d.pop(x, None)
                return d
            d1 = fixup(self.__dict__)
            d2 = fixup(other.__dict__)
            if d1 == d2:
                return True
        return False

    def __le__(self, other):
        # Must be the non-strict comparison: ``total_ordering`` builds
        # ``<``, ``>`` and ``>=`` on top of this, and a strict ``<`` here
        # made ``a <= a`` evaluate to False.
        return str(self) <= str(other)

    def __hash__(self):
        # identity hash, instances are mutable
        return id(self)


class VarMap(object):
    """A name -> variable mapping in which a name can be defined only once."""

    def __init__(self):
        self._con = {}

    def define(self, name, var):
        """Bind *name* to *var*; raise ``RedefinedError`` if already bound."""
        if name in self._con:
            raise RedefinedError(name)
        else:
            self._con[name] = var

    def get(self, name):
        """Return the variable bound to *name* or raise ``NotDefinedError``."""
        try:
            return self._con[name]
        except KeyError:
            raise NotDefinedError(name)

    def __contains__(self, name):
        return name in self._con

    def __len__(self):
        return len(self._con)

    def __repr__(self):
        return pprint.pformat(self._con)

    def __hash__(self):
        # There is no ``name`` attribute on a VarMap, so hashing used to
        # raise AttributeError.  Use an identity hash, as the equality
        # mixins in this module do; the map is mutable so a content based
        # hash would not be stable.
        return id(self)

    def __iter__(self):
        # iterate over the defined names (``dict.iterkeys`` is Python 2 only)
        return iter(self._con)

    def __eq__(self, other):
        if type(self) is type(other):
            # check keys only, else __eq__ ref cycles, scope -> varmap -> var
            return self._con.keys() == other._con.keys()
        return False

    def __ne__(self, other):
        return not self.__eq__(other)
+ """ + raise NotImplementedError + + def _rec_list_vars(self, val): + """ + A recursive helper used to implement list_vars() in subclasses. + """ + if isinstance(val, Var): + return [val] + elif isinstance(val, Inst): + return val.list_vars() + elif isinstance(val, (list, tuple)): + lst = [] + for v in val: + lst.extend(self._rec_list_vars(v)) + return lst + elif isinstance(val, dict): + lst = [] + for v in val.values(): + lst.extend(self._rec_list_vars(v)) + return lst + else: + return [] + + +class Stmt(Inst): + """ + Base class for IR statements (instructions which can appear on their + own in a Block). + """ + # Whether this statement ends its basic block (i.e. it will either jump + # to another block or exit the function). + is_terminator = False + # Whether this statement exits the function. + is_exit = False + + def list_vars(self): + return self._rec_list_vars(self.__dict__) + + +class Terminator(Stmt): + """ + IR statements that are terminators: the last statement in a block. + A terminator must either: + - exit the function + - jump to a block + + All subclass of Terminator must override `.get_targets()` to return a list + of jump targets. + """ + is_terminator = True + + def get_targets(self): + raise NotImplementedError(type(self)) + + +class Expr(Inst): + """ + An IR expression (an instruction which can only be part of a larger + statement). 
+ """ + + def __init__(self, op, loc, **kws): + assert isinstance(op, str) + assert isinstance(loc, Loc) + self.op = op + self.loc = loc + self._kws = kws + + def __getattr__(self, name): + if name.startswith('_'): + return Inst.__getattr__(self, name) + return self._kws[name] + + def __setattr__(self, name, value): + if name in ('op', 'loc', '_kws'): + self.__dict__[name] = value + else: + self._kws[name] = value + + @classmethod + def binop(cls, fn, lhs, rhs, loc): + assert isinstance(fn, BuiltinFunctionType) + assert isinstance(lhs, Var) + assert isinstance(rhs, Var) + assert isinstance(loc, Loc) + op = 'binop' + return cls(op=op, loc=loc, fn=fn, lhs=lhs, rhs=rhs, + static_lhs=UNDEFINED, static_rhs=UNDEFINED) + + @classmethod + def inplace_binop(cls, fn, immutable_fn, lhs, rhs, loc): + assert isinstance(fn, BuiltinFunctionType) + assert isinstance(immutable_fn, BuiltinFunctionType) + assert isinstance(lhs, Var) + assert isinstance(rhs, Var) + assert isinstance(loc, Loc) + op = 'inplace_binop' + return cls(op=op, loc=loc, fn=fn, immutable_fn=immutable_fn, + lhs=lhs, rhs=rhs, + static_lhs=UNDEFINED, static_rhs=UNDEFINED) + + @classmethod + def unary(cls, fn, value, loc): + assert isinstance(value, (str, Var, FunctionType)) + assert isinstance(loc, Loc) + op = 'unary' + fn = UNARY_BUITINS_TO_OPERATORS.get(fn, fn) + return cls(op=op, loc=loc, fn=fn, value=value) + + @classmethod + def call(cls, func, args, kws, loc, vararg=None, varkwarg=None, target=None): + assert isinstance(func, Var) + assert isinstance(loc, Loc) + op = 'call' + return cls(op=op, loc=loc, func=func, args=args, kws=kws, + vararg=vararg, varkwarg=varkwarg, target=target) + + @classmethod + def build_tuple(cls, items, loc): + assert isinstance(loc, Loc) + op = 'build_tuple' + return cls(op=op, loc=loc, items=items) + + @classmethod + def build_list(cls, items, loc): + assert isinstance(loc, Loc) + op = 'build_list' + return cls(op=op, loc=loc, items=items) + + @classmethod + def build_set(cls, 
items, loc): + assert isinstance(loc, Loc) + op = 'build_set' + return cls(op=op, loc=loc, items=items) + + @classmethod + def build_map(cls, items, size, literal_value, value_indexes, loc): + assert isinstance(loc, Loc) + op = 'build_map' + return cls(op=op, loc=loc, items=items, size=size, + literal_value=literal_value, value_indexes=value_indexes) + + @classmethod + def pair_first(cls, value, loc): + assert isinstance(value, Var) + op = 'pair_first' + return cls(op=op, loc=loc, value=value) + + @classmethod + def pair_second(cls, value, loc): + assert isinstance(value, Var) + assert isinstance(loc, Loc) + op = 'pair_second' + return cls(op=op, loc=loc, value=value) + + @classmethod + def getiter(cls, value, loc): + assert isinstance(value, Var) + assert isinstance(loc, Loc) + op = 'getiter' + return cls(op=op, loc=loc, value=value) + + @classmethod + def iternext(cls, value, loc): + assert isinstance(value, Var) + assert isinstance(loc, Loc) + op = 'iternext' + return cls(op=op, loc=loc, value=value) + + @classmethod + def exhaust_iter(cls, value, count, loc): + assert isinstance(value, Var) + assert isinstance(count, int) + assert isinstance(loc, Loc) + op = 'exhaust_iter' + return cls(op=op, loc=loc, value=value, count=count) + + @classmethod + def getattr(cls, value, attr, loc): + assert isinstance(value, Var) + assert isinstance(attr, str) + assert isinstance(loc, Loc) + op = 'getattr' + return cls(op=op, loc=loc, value=value, attr=attr) + + @classmethod + def getitem(cls, value, index, loc): + assert isinstance(value, Var) + assert isinstance(index, Var) + assert isinstance(loc, Loc) + op = 'getitem' + fn = operator.getitem + return cls(op=op, loc=loc, value=value, index=index, fn=fn) + + @classmethod + def typed_getitem(cls, value, dtype, index, loc): + assert isinstance(value, Var) + assert isinstance(loc, Loc) + op = 'typed_getitem' + return cls(op=op, loc=loc, value=value, dtype=dtype, + index=index) + + @classmethod + def static_getitem(cls, value, 
index, index_var, loc): + assert isinstance(value, Var) + assert index_var is None or isinstance(index_var, Var) + assert isinstance(loc, Loc) + op = 'static_getitem' + fn = operator.getitem + return cls(op=op, loc=loc, value=value, index=index, + index_var=index_var, fn=fn) + + @classmethod + def cast(cls, value, loc): + """ + A node for implicit casting at the return statement + """ + assert isinstance(value, Var) + assert isinstance(loc, Loc) + op = 'cast' + return cls(op=op, value=value, loc=loc) + + @classmethod + def phi(cls, loc): + """Phi node + """ + assert isinstance(loc, Loc) + return cls(op='phi', incoming_values=[], incoming_blocks=[], loc=loc) + + @classmethod + def make_function(cls, name, code, closure, defaults, loc): + """ + A node for making a function object. + """ + assert isinstance(loc, Loc) + op = 'make_function' + return cls(op=op, name=name, code=code, closure=closure, defaults=defaults, loc=loc) + + @classmethod + def null(cls, loc): + """ + A node for null value. + + This node is not handled by type inference. It is only added by + post-typing passes. + """ + assert isinstance(loc, Loc) + op = 'null' + return cls(op=op, loc=loc) + + @classmethod + def dummy(cls, op, info, loc): + """ + A node for a dummy value. + + This node is a place holder for carrying information through to a point + where it is rewritten into something valid. This node is not handled + by type inference or lowering. It's presence outside of the interpreter + renders IR as illegal. 
+ """ + assert isinstance(loc, Loc) + assert isinstance(op, str) + return cls(op=op, info=info, loc=loc) + + def __repr__(self): + if self.op == 'call': + args = ', '.join(str(a) for a in self.args) + pres_order = self._kws.items() if config.DIFF_IR == 0 else sorted(self._kws.items()) + kws = ', '.join('%s=%s' % (k, v) for k, v in pres_order) + vararg = '*%s' % (self.vararg,) if self.vararg is not None else '' + arglist = ', '.join(filter(None, [args, vararg, kws])) + return 'call %s(%s)' % (self.func, arglist) + elif self.op == 'binop': + lhs, rhs = self.lhs, self.rhs + if self.fn == operator.contains: + lhs, rhs = rhs, lhs + fn = OPERATORS_TO_BUILTINS.get(self.fn, self.fn) + return '%s %s %s' % (lhs, fn, rhs) + else: + pres_order = self._kws.items() if config.DIFF_IR == 0 else sorted(self._kws.items()) + args = ('%s=%s' % (k, v) for k, v in pres_order) + return '%s(%s)' % (self.op, ', '.join(args)) + + def list_vars(self): + return self._rec_list_vars(self._kws) + + def infer_constant(self): + raise ConstantInferenceError('%s' % self, loc=self.loc) + + +class SetItem(Stmt): + """ + target[index] = value + """ + + def __init__(self, target, index, value, loc): + assert isinstance(target, Var) + assert isinstance(index, Var) + assert isinstance(value, Var) + assert isinstance(loc, Loc) + self.target = target + self.index = index + self.value = value + self.loc = loc + + def __repr__(self): + return '%s[%s] = %s' % (self.target, self.index, self.value) + + +class StaticSetItem(Stmt): + """ + target[constant index] = value + """ + + def __init__(self, target, index, index_var, value, loc): + assert isinstance(target, Var) + assert not isinstance(index, Var) + assert isinstance(index_var, Var) + assert isinstance(value, Var) + assert isinstance(loc, Loc) + self.target = target + self.index = index + self.index_var = index_var + self.value = value + self.loc = loc + + def __repr__(self): + return '%s[%r] = %s' % (self.target, self.index, self.value) + + +class 
class StaticRaise(Terminator):
    """
    Raise an exception class and arguments known at compile-time.
    Note that if *exc_class* is None, a bare "raise" statement is implied
    (i.e. re-raise the current exception).
    """
    is_exit = True

    def __init__(self, exc_class, exc_args, loc):
        assert exc_class is None or isinstance(exc_class, type)
        assert isinstance(loc, Loc)
        assert exc_args is None or isinstance(exc_args, tuple)
        self.exc_class = exc_class
        self.exc_args = exc_args
        self.loc = loc

    def __str__(self):
        # The "<static>" marker tells this statement apart from a dynamic
        # ``Raise`` in IR dumps; without it the text began with a stray
        # space and read like a plain "raise".
        if self.exc_class is None:
            return "<static> raise"
        elif self.exc_args is None:
            return "<static> raise %s" % (self.exc_class,)
        else:
            return "<static> raise %s(%s)" % (
                self.exc_class, ", ".join(map(repr, self.exc_args)))

    def get_targets(self):
        # exits the function, so there is no block to jump to
        return []
+ """ + is_exit = True + + def __init__(self, value, loc): + assert isinstance(value, Var), type(value) + assert isinstance(loc, Loc) + self.value = value + self.loc = loc + + def __str__(self): + return 'return %s' % self.value + + def get_targets(self): + return [] + + +class Jump(Terminator): + """ + Unconditional branch. + """ + + def __init__(self, target, loc): + assert isinstance(loc, Loc) + self.target = target + self.loc = loc + + def __str__(self): + return 'jump %s' % self.target + + def get_targets(self): + return [self.target] + + +class Branch(Terminator): + """ + Conditional branch. + """ + + def __init__(self, cond, truebr, falsebr, loc): + assert isinstance(cond, Var) + assert isinstance(loc, Loc) + self.cond = cond + self.truebr = truebr + self.falsebr = falsebr + self.loc = loc + + def __str__(self): + return 'branch %s, %s, %s' % (self.cond, self.truebr, self.falsebr) + + def get_targets(self): + return [self.truebr, self.falsebr] + + +class Assign(Stmt): + """ + Assign to a variable. + """ + def __init__(self, value, target, loc): + assert isinstance(value, AbstractRHS) + assert isinstance(target, Var) + assert isinstance(loc, Loc) + self.value = value + self.target = target + self.loc = loc + + def __str__(self): + return '%s = %s' % (self.target, self.value) + + +class Print(Stmt): + """ + Print some values. 
+ """ + def __init__(self, args, vararg, loc): + assert all(isinstance(x, Var) for x in args) + assert vararg is None or isinstance(vararg, Var) + assert isinstance(loc, Loc) + self.args = tuple(args) + self.vararg = vararg + # Constant-inferred arguments + self.consts = {} + self.loc = loc + + def __str__(self): + return 'print(%s)' % ', '.join(str(v) for v in self.args) + + +class Yield(Inst): + def __init__(self, value, loc, index): + assert isinstance(value, Var) + assert isinstance(loc, Loc) + self.value = value + self.loc = loc + self.index = index + + def __str__(self): + return 'yield %s' % (self.value,) + + def list_vars(self): + return [self.value] + + +class EnterWith(Stmt): + """Enter a "with" context + """ + def __init__(self, contextmanager, begin, end, loc): + """ + Parameters + ---------- + contextmanager : IR value + begin, end : int + The beginning and the ending offset of the with-body. + loc : ir.Loc instance + Source location + """ + assert isinstance(contextmanager, Var) + assert isinstance(loc, Loc) + self.contextmanager = contextmanager + self.begin = begin + self.end = end + self.loc = loc + + def __str__(self): + return 'enter_with {}'.format(self.contextmanager) + + def list_vars(self): + return [self.contextmanager] + + +class PopBlock(Stmt): + """Marker statement for a pop block op code""" + def __init__(self, loc): + assert isinstance(loc, Loc) + self.loc = loc + + def __str__(self): + return 'pop_block' + + +class Arg(EqualityCheckMixin, AbstractRHS): + def __init__(self, name, index, loc): + assert isinstance(name, str) + assert isinstance(index, int) + assert isinstance(loc, Loc) + self.name = name + self.index = index + self.loc = loc + + def __repr__(self): + return 'arg(%d, name=%s)' % (self.index, self.name) + + def infer_constant(self): + raise ConstantInferenceError('%s' % self, loc=self.loc) + + +class Const(EqualityCheckMixin, AbstractRHS): + def __init__(self, value, loc, use_literal_type=True): + assert isinstance(loc, 
class FreeVar(EqualityCheckMixin, AbstractRHS):
    """
    A freevar, as loaded by LOAD_DEREF.
    (i.e. a variable defined in an enclosing non-global scope)
    """

    def __init__(self, index, name, value, loc):
        assert isinstance(index, int)
        assert isinstance(name, str)
        assert isinstance(loc, Loc)
        # index inside __code__.co_freevars
        self.index = index
        # variable name
        self.name = name
        # frozen value
        self.value = value
        self.loc = loc

    def __str__(self):
        return 'freevar(%s: %s)' % (self.name, self.value)

    def infer_constant(self):
        # the frozen value is the constant
        return self.value

    def __deepcopy__(self, memo):
        # Override to not copy constant values in code
        return FreeVar(index=self.index, name=self.name, value=self.value,
                       loc=self.loc)
+ assert scope is None or isinstance(scope, Scope) + assert isinstance(name, str) + assert isinstance(loc, Loc) + self.scope = scope + self.name = name + self.loc = loc + + def __repr__(self): + return 'Var(%s, %s)' % (self.name, self.loc.short()) + + def __str__(self): + return self.name + + @property + def is_temp(self): + return self.name.startswith("$") + + @property + def unversioned_name(self): + """The unversioned name of this variable, i.e. SSA renaming removed + """ + for k, redef_set in self.scope.var_redefinitions.items(): + if self.name in redef_set: + return k + return self.name + + @property + def versioned_names(self): + """Known versioned names for this variable, i.e. known variable names in + the scope that have been formed from applying SSA to this variable + """ + return self.scope.get_versions_of(self.unversioned_name) + + @property + def all_names(self): + """All known versioned and unversioned names for this variable + """ + return self.versioned_names | {self.unversioned_name,} + + +class Scope(EqualityCheckMixin): + """ + Attributes + ----------- + - parent: Scope + Parent scope + + - localvars: VarMap + Scope-local variable map + + - loc: Loc + Start of scope location + + """ + + def __init__(self, parent, loc): + assert parent is None or isinstance(parent, Scope) + assert isinstance(loc, Loc) + self.parent = parent + self.localvars = VarMap() + self.loc = loc + self.redefined = defaultdict(int) + self.var_redefinitions = defaultdict(set) + + def define(self, name, loc): + """ + Define a variable + """ + v = Var(scope=self, name=name, loc=loc) + self.localvars.define(v.name, v) + return v + + def get(self, name): + """ + Refer to a variable. Returns the latest version. + """ + if name in self.redefined: + name = "%s.%d" % (name, self.redefined[name]) + return self.get_exact(name) + + def get_exact(self, name): + """ + Refer to a variable. The returned variable has the exact + name (exact variable version). 
class Block(EqualityCheckMixin):
    """A code block: an ordered list of IR statements sharing one scope.

    Attributes
    -----------
    - scope: Scope
        Scope the block was created with

    - body: list
        Statements of the block, in order; starts out empty

    - loc: Loc
        Location the block was created with

    """

    def __init__(self, scope, loc):
        assert isinstance(scope, Scope)
        assert isinstance(loc, Loc)
        self.scope = scope
        self.body = []
        self.loc = loc

    def copy(self):
        """
        Return a new Block with the same scope and loc and a shallow copy
        of the statement list (the statements themselves are shared).
        """
        block = Block(self.scope, self.loc)
        block.body = self.body[:]
        return block

    def find_exprs(self, op=None):
        """
        Iterate over exprs of the given *op* in this block.

        Only ``Expr`` values on the right-hand side of ``Assign`` statements
        are considered.  With ``op=None`` every such expression is yielded.
        """
        for inst in self.body:
            if isinstance(inst, Assign):
                expr = inst.value
                if isinstance(expr, Expr):
                    if op is None or expr.op == op:
                        yield expr

    def find_insts(self, cls=None):
        """
        Iterate over insts of the given class in this block.

        With ``cls=None`` (the default) every instruction is yielded, which
        mirrors ``find_exprs(op=None)``.
        """
        for inst in self.body:
            # isinstance(inst, None) raises TypeError, so the default has to
            # be handled explicitly.
            if cls is None or isinstance(inst, cls):
                yield inst

    def find_variable_assignment(self, name):
        """
        Returns the assignment inst associated with variable "name", None if
        it cannot be found.
        """
        for x in self.find_insts(cls=Assign):
            if x.target.name == name:
                return x
        return None

    def prepend(self, inst):
        """Insert *inst* as the first statement of the block."""
        assert isinstance(inst, Stmt)
        self.body.insert(0, inst)

    def append(self, inst):
        """Add *inst* as the last statement of the block."""
        assert isinstance(inst, Stmt)
        self.body.append(inst)

    def remove(self, inst):
        """
        Remove the first statement equal to *inst*.

        Raises ValueError if no such statement is in the block.
        """
        assert isinstance(inst, Stmt)
        self.body.remove(inst)

    def clear(self):
        """Remove all statements, keeping the same list object."""
        del self.body[:]

    def dump(self, file=None):
        """
        Print the block's statements to *file* (``sys.stdout`` if omitted).

        Statements that provide their own ``dump`` method are asked to dump
        themselves; others are printed together with their sorted variable
        names.
        """
        # Avoid early bind of sys.stdout as default value
        file = file or sys.stdout
        for inst in self.body:
            if hasattr(inst, 'dump'):
                inst.dump(file)
            else:
                inst_vars = sorted(str(v) for v in inst.list_vars())
                print(' %-40s %s' % (inst, inst_vars), file=file)

    @property
    def terminator(self):
        """The last statement of the block (IndexError if the block is empty)."""
        return self.body[-1]

    @property
    def is_terminated(self):
        """True if the block is non-empty and its last statement is a terminator."""
        # Coerce to bool so an empty block reports False rather than [].
        return bool(self.body) and bool(self.body[-1].is_terminator)

    def verify(self):
        """
        Check that the block ends with a terminator and contains no other.

        Raises VerificationError otherwise.
        """
        if not self.is_terminated:
            raise VerificationError("Missing block terminator")
        # Only the last instruction can be a terminator
        for inst in self.body[:-1]:
            if inst.is_terminator:
                raise VerificationError("Terminator before the last "
                                        "instruction")

    def insert_after(self, stmt, other):
        """
        Insert *stmt* after *other*.

        Raises ValueError if *other* is not in the block.
        """
        index = self.body.index(other)
        self.body.insert(index + 1, stmt)

    def insert_before_terminator(self, stmt):
        """Insert *stmt* just before the terminating statement."""
        assert isinstance(stmt, Stmt)
        assert self.is_terminated
        self.body.insert(-1, stmt)

    def __repr__(self):
        # The template needs a placeholder for the location: formatting an
        # empty template with one argument raises TypeError.
        return "<ir.Block at %s>" % (self.loc,)
+ """ + msg = [] + for label, block in self.blocks.items(): + other_blk = other.blocks.get(label, None) + if other_blk is not None: + if block != other_blk: + msg.append(("Block %s differs" % label).center(80, '-')) + # see if the instructions are just a permutation + block_del = [x for x in block.body if isinstance(x, Del)] + oth_del = [x for x in other_blk.body if isinstance(x, Del)] + if block_del != oth_del: + # this is a common issue, dels are all present, but + # order shuffled. + if sorted(block_del) == sorted(oth_del): + msg.append(("Block %s contains the same dels but " + "their order is different") % label) + if len(block.body) > len(other_blk.body): + msg.append("This block contains more statements") + elif len(block.body) < len(other_blk.body): + msg.append("Other block contains more statements") + + # find the indexes where they don't match + tmp = [] + for idx, stmts in enumerate(zip(block.body, + other_blk.body)): + b_s, o_s = stmts + if b_s != o_s: + tmp.append(idx) + + def get_pad(ablock, l): + pointer = '-> ' + sp = len(pointer) * ' ' + pad = [] + nstmt = len(ablock) + for i in range(nstmt): + if i in tmp: + item = pointer + elif i >= l: + item = pointer + else: + item = sp + pad.append(item) + return pad + + min_stmt_len = min(len(block.body), len(other_blk.body)) + + with StringIO() as buf: + it = [("self", block), ("other", other_blk)] + for name, _block in it: + buf.truncate(0) + _block.dump(file=buf) + stmts = buf.getvalue().splitlines() + pad = get_pad(_block.body, min_stmt_len) + title = ("%s: block %s" % (name, label)) + msg.append(title.center(80, '-')) + msg.extend(["{0}{1}".format(a, b) for a, b in + zip(pad, stmts)]) + if msg == []: + msg.append("IR is considered equivalent.") + return '\n'.join(msg) + + def _reset_analysis_variables(self): + + self._consts = consts.ConstantInference(self) + + # Will be computed by PostProcessor + self.generator_info = None + self.variable_lifetime = None + # { ir.Block: { variable names (potentially) 
def derive(self, blocks, arg_count=None, arg_names=None,
           force_non_generator=False):
    """
    Build a new function IR from this one around a different set of blocks.

    The result is a shallow copy of *self* whose ``blocks`` is the given
    mapping and whose ``loc`` is taken from the block with the smallest
    label.  ``arg_count`` / ``arg_names`` replace the copied values only
    when they are not None; ``force_non_generator`` clears the generator
    flag.  The analysis variables are reset and a fresh ``func_id`` is
    derived.

    Post-processing will have to be run again on the new IR.
    """
    entry_block = blocks[min(blocks)]

    derived = copy.copy(self)
    derived.blocks = blocks
    derived.loc = entry_block.loc
    if force_non_generator:
        derived.is_generator = False
    # Optional overrides, applied in the same order as they are declared.
    for attr, override in (("arg_count", arg_count),
                           ("arg_names", arg_names)):
        if override is not None:
            setattr(derived, attr, override)
    derived._reset_analysis_variables()
    # Make fresh func_id
    derived.func_id = derived.func_id.derive()
    return derived
+ """ + lhs = value + while True: + if isinstance(value, Var): + lhs = value + name = value.name + elif isinstance(value, str): + lhs = value + name = value + else: + return lhs if lhs_only else value + defs = self._definitions[name] + if len(defs) == 0: + raise KeyError("no definition for %r" + % (name,)) + if len(defs) > 1: + raise KeyError("more than one definition for %r" + % (name,)) + value = defs[0] + + def get_assignee(self, rhs_value, in_blocks=None): + """ + Finds the assignee for a given RHS value. If in_blocks is given the + search will be limited to the specified blocks. + """ + if in_blocks is None: + blocks = self.blocks.values() + elif isinstance(in_blocks, int): + blocks = [self.blocks[in_blocks]] + else: + blocks = [self.blocks[blk] for blk in list(in_blocks)] + + assert isinstance(rhs_value, AbstractRHS) + + for blk in blocks: + for assign in blk.find_insts(Assign): + if assign.value == rhs_value: + return assign.target + + raise ValueError("Could not find an assignee for %s" % rhs_value) + + + def dump(self, file=None): + nofile = file is None + # Avoid early bind of sys.stdout as default value + file = file or StringIO() + for offset, block in sorted(self.blocks.items()): + print('label %s:' % (offset,), file=file) + block.dump(file=file) + if nofile: + text = file.getvalue() + if config.HIGHLIGHT_DUMPS: + try: + import pygments + except ImportError: + msg = "Please install pygments to see highlighted dumps" + raise ValueError(msg) + else: + from pygments import highlight + from numba.misc.dump_style import NumbaIRLexer as lexer + from numba.misc.dump_style import by_colorscheme + from pygments.formatters import Terminal256Formatter + print(highlight(text, lexer(), Terminal256Formatter( + style=by_colorscheme()))) + else: + print(text) + + + def dump_to_string(self): + with StringIO() as sb: + self.dump(file=sb) + return sb.getvalue() + + def dump_generator_info(self, file=None): + file = file or sys.stdout + gi = self.generator_info + 
class UndefinedType(EqualityCheckMixin):
    """A stub for undefined global reference.

    The class is a singleton: every call to ``UndefinedType()`` returns the
    same instance, which is created on first use and cached on the class.
    """

    _singleton = None

    def __new__(cls):
        # Create the shared instance lazily, then always hand it back.
        if cls._singleton is None:
            cls._singleton = object.__new__(cls)
        return cls._singleton

    def __repr__(self):
        return "Undefined"
def get_unused_var_name(prefix, var_table):
    """ Return the first name of the form ``prefix + str(n)`` (n = 0, 1, ...)
    that is not contained in the given variable table.
    """
    suffix = 0
    candidate = prefix + str(suffix)
    while candidate in var_table:
        suffix += 1
        candidate = prefix + str(suffix)
    return candidate
+ """ + out = [] + ndims = 1 + size_typ = types.intp + if isinstance(size_var, tuple): + if len(size_var) == 1: + size_var = size_var[0] + size_var = convert_size_to_var(size_var, typemap, scope, loc, out) + else: + # tuple_var = build_tuple([size_var...]) + ndims = len(size_var) + tuple_var = ir.Var(scope, mk_unique_var("$tuple_var"), loc) + if typemap: + typemap[tuple_var.name] = types.containers.UniTuple( + types.intp, ndims) + # constant sizes need to be assigned to vars + new_sizes = [convert_size_to_var(s, typemap, scope, loc, out) + for s in size_var] + tuple_call = ir.Expr.build_tuple(new_sizes, loc) + tuple_assign = ir.Assign(tuple_call, tuple_var, loc) + out.append(tuple_assign) + size_var = tuple_var + size_typ = types.containers.UniTuple(types.intp, ndims) + # g_np_var = Global(numpy) + g_np_var = ir.Var(scope, mk_unique_var("$np_g_var"), loc) + if typemap: + typemap[g_np_var.name] = types.misc.Module(numpy) + g_np = ir.Global('np', numpy, loc) + g_np_assign = ir.Assign(g_np, g_np_var, loc) + # attr call: empty_attr = getattr(g_np_var, empty) + empty_attr_call = ir.Expr.getattr(g_np_var, "empty", loc) + attr_var = ir.Var(scope, mk_unique_var("$empty_attr_attr"), loc) + if typemap: + typemap[attr_var.name] = get_np_ufunc_typ(numpy.empty) + attr_assign = ir.Assign(empty_attr_call, attr_var, loc) + # Assume str(dtype) returns a valid type + dtype_str = str(dtype) + # alloc call: lhs = empty_attr(size_var, typ_var) + typ_var = ir.Var(scope, mk_unique_var("$np_typ_var"), loc) + if typemap: + typemap[typ_var.name] = types.functions.NumberClass(dtype) + # If dtype is a datetime/timedelta with a unit, + # then it won't return a valid type and instead can be created + # with a string. i.e. 
"datetime64[ns]") + if (isinstance(dtype, (types.NPDatetime, types.NPTimedelta)) and + dtype.unit != ''): + typename_const = ir.Const(dtype_str, loc) + typ_var_assign = ir.Assign(typename_const, typ_var, loc) + else: + if dtype_str=='bool': + # empty doesn't like 'bool' sometimes (e.g. kmeans example) + dtype_str = 'bool_' + np_typ_getattr = ir.Expr.getattr(g_np_var, dtype_str, loc) + typ_var_assign = ir.Assign(np_typ_getattr, typ_var, loc) + alloc_call = ir.Expr.call(attr_var, [size_var, typ_var], (), loc) + + if calltypes: + cac = typemap[attr_var.name].get_call_type( + typingctx, [size_typ, types.functions.NumberClass(dtype)], {}) + # By default, all calls to "empty" are typed as returning a standard + # NumPy ndarray. If we are allocating a ndarray subclass here then + # just change the return type to be that of the subclass. + cac._return_type = (lhs_typ.copy(layout='C') + if lhs_typ.layout == 'F' + else lhs_typ) + calltypes[alloc_call] = cac + if lhs_typ.layout == 'F': + empty_c_typ = lhs_typ.copy(layout='C') + empty_c_var = ir.Var(scope, mk_unique_var("$empty_c_var"), loc) + if typemap: + typemap[empty_c_var.name] = lhs_typ.copy(layout='C') + empty_c_assign = ir.Assign(alloc_call, empty_c_var, loc) + + # attr call: asfortranarray = getattr(g_np_var, asfortranarray) + asfortranarray_attr_call = ir.Expr.getattr(g_np_var, "asfortranarray", loc) + afa_attr_var = ir.Var(scope, mk_unique_var("$asfortran_array_attr"), loc) + if typemap: + typemap[afa_attr_var.name] = get_np_ufunc_typ(numpy.asfortranarray) + afa_attr_assign = ir.Assign(asfortranarray_attr_call, afa_attr_var, loc) + # call asfortranarray + asfortranarray_call = ir.Expr.call(afa_attr_var, [empty_c_var], (), loc) + if calltypes: + calltypes[asfortranarray_call] = typemap[afa_attr_var.name].get_call_type( + typingctx, [empty_c_typ], {}) + + asfortranarray_assign = ir.Assign(asfortranarray_call, lhs, loc) + + out.extend([g_np_assign, attr_assign, typ_var_assign, empty_c_assign, + afa_attr_assign, 
def get_np_ufunc_typ(func):
    """get type of the incoming function from builtin registry

    The NumPy declaration registry is searched first, then the builtin
    template registry; the type paired with the first entry equal to *func*
    is returned.

    Raises RuntimeError if neither registry has an entry for *func*.
    """
    for (k, v) in typing.npydecl.registry.globals:
        if k == func:
            return v
    for (k, v) in typing.templates.builtin_registry.globals:
        if k == func:
            return v
    # Format a single message (as get_global_func_typ does) instead of
    # passing the pieces as separate exception arguments, which would be
    # rendered as a tuple.
    raise RuntimeError("type for func {} not found".format(func))
+ """ + # g_range_var = Global(range) + g_range_var = ir.Var(scope, mk_unique_var("$range_g_var"), loc) + typemap[g_range_var.name] = get_global_func_typ(range) + g_range = ir.Global('range', range, loc) + g_range_assign = ir.Assign(g_range, g_range_var, loc) + arg_nodes, args = _mk_range_args(typemap, start, stop, step, scope, loc) + # range_call_var = call g_range_var(start, stop, step) + range_call = ir.Expr.call(g_range_var, args, (), loc) + calltypes[range_call] = typemap[g_range_var.name].get_call_type( + typing.Context(), [types.intp] * len(args), {}) + #signature(types.range_state64_type, types.intp) + range_call_var = ir.Var(scope, mk_unique_var("$range_c_var"), loc) + typemap[range_call_var.name] = types.iterators.RangeType(types.intp) + range_call_assign = ir.Assign(range_call, range_call_var, loc) + # iter_var = getiter(range_call_var) + iter_call = ir.Expr.getiter(range_call_var, loc) + calltypes[iter_call] = signature(types.range_iter64_type, + types.range_state64_type) + iter_var = ir.Var(scope, mk_unique_var("$iter_var"), loc) + typemap[iter_var.name] = types.iterators.RangeIteratorType(types.intp) + iter_call_assign = ir.Assign(iter_call, iter_var, loc) + # $phi = iter_var + phi_var = ir.Var(scope, mk_unique_var("$phi"), loc) + typemap[phi_var.name] = types.iterators.RangeIteratorType(types.intp) + phi_assign = ir.Assign(iter_var, phi_var, loc) + # jump to header + jump_header = ir.Jump(-1, loc) + range_block = ir.Block(scope, loc) + range_block.body = arg_nodes + [g_range_assign, range_call_assign, + iter_call_assign, phi_assign, jump_header] + return range_block + + +def _mk_range_args(typemap, start, stop, step, scope, loc): + nodes = [] + if isinstance(stop, ir.Var): + g_stop_var = stop + else: + assert isinstance(stop, int) + g_stop_var = ir.Var(scope, mk_unique_var("$range_stop"), loc) + if typemap: + typemap[g_stop_var.name] = types.intp + stop_assign = ir.Assign(ir.Const(stop, loc), g_stop_var, loc) + nodes.append(stop_assign) + if start == 
def get_global_func_typ(func):
    """get type variable for func() from builtin registry

    Returns the type paired with the first registry entry equal to *func*;
    raises RuntimeError when there is none.
    """
    matching_types = (
        typ
        for key, typ in typing.templates.builtin_registry.globals
        if key == func
    )
    # Return on the first hit; falling out of the loop means no entry matched.
    for typ in matching_types:
        return typ
    raise RuntimeError("func type not found {}".format(func))
def legalize_names(varnames):
    """returns a dictionary for conversion of variable names to legal
    parameter names.

    Each name is rewritten by doubling underscores and then turning "$" and
    "." into single underscores.
    """
    # Applied in this order; the doubling must happen before "$" and "."
    # are turned into underscores.
    replacements = (("_", "__"), ("$", "_"), (".", "_"))
    legal_names = {}
    for original in varnames:
        legal = original
        for old, new in replacements:
            legal = legal.replace(old, new)
        # NOTE(review): this compares the new name against the original names
        # seen so far (the dict keys), not against the legalized names already
        # produced - confirm that this is the intended collision check.
        assert legal not in legal_names
        legal_names[original] = legal
    return legal_names
def visit_vars_stmt(stmt, callback, cbdata):
    """Apply *callback* to the variables held by a single statement, in place.

    Each known field of the statement is passed through ``visit_vars_inner``
    together with *callback* and *cbdata*, and the result is stored back on
    the statement.  Statement types registered in ``visit_vars_extensions``
    are handled by their registered function instead.  Statement types that
    match no branch are left untouched.
    """
    # let external calls handle stmt if type matches
    for t, f in visit_vars_extensions.items():
        if isinstance(stmt, t):
            f(stmt, callback, cbdata)
            return
    # Built-in statement kinds: the first matching isinstance branch wins.
    if isinstance(stmt, ir.Assign):
        stmt.target = visit_vars_inner(stmt.target, callback, cbdata)
        stmt.value = visit_vars_inner(stmt.value, callback, cbdata)
    elif isinstance(stmt, ir.Arg):
        stmt.name = visit_vars_inner(stmt.name, callback, cbdata)
    elif isinstance(stmt, ir.Return):
        stmt.value = visit_vars_inner(stmt.value, callback, cbdata)
    elif isinstance(stmt, ir.Raise):
        stmt.exception = visit_vars_inner(stmt.exception, callback, cbdata)
    elif isinstance(stmt, ir.Branch):
        stmt.cond = visit_vars_inner(stmt.cond, callback, cbdata)
    elif isinstance(stmt, ir.Jump):
        stmt.target = visit_vars_inner(stmt.target, callback, cbdata)
    elif isinstance(stmt, ir.Del):
        # Because Del takes only a var name, we make up by
        # constructing a temporary variable.
        var = ir.Var(None, stmt.value, stmt.loc)
        var = visit_vars_inner(var, callback, cbdata)
        # Only the (possibly replaced) name is kept; the temporary is dropped.
        stmt.value = var.name
    elif isinstance(stmt, ir.DelAttr):
        stmt.target = visit_vars_inner(stmt.target, callback, cbdata)
        stmt.attr = visit_vars_inner(stmt.attr, callback, cbdata)
    elif isinstance(stmt, ir.SetAttr):
        stmt.target = visit_vars_inner(stmt.target, callback, cbdata)
        stmt.attr = visit_vars_inner(stmt.attr, callback, cbdata)
        stmt.value = visit_vars_inner(stmt.value, callback, cbdata)
    elif isinstance(stmt, ir.DelItem):
        stmt.target = visit_vars_inner(stmt.target, callback, cbdata)
        stmt.index = visit_vars_inner(stmt.index, callback, cbdata)
    elif isinstance(stmt, ir.StaticSetItem):
        stmt.target = visit_vars_inner(stmt.target, callback, cbdata)
        stmt.index_var = visit_vars_inner(stmt.index_var, callback, cbdata)
        stmt.value = visit_vars_inner(stmt.value, callback, cbdata)
    elif isinstance(stmt, ir.SetItem):
        stmt.target = visit_vars_inner(stmt.target, callback, cbdata)
        stmt.index = visit_vars_inner(stmt.index, callback, cbdata)
        stmt.value = visit_vars_inner(stmt.value, callback, cbdata)
    elif isinstance(stmt, ir.Print):
        stmt.args = [visit_vars_inner(x, callback, cbdata) for x in stmt.args]
    else:
        # TODO: raise NotImplementedError("no replacement for IR node: ", stmt)
        pass
    return
def find_max_label(blocks):
    """Return the largest label found in *blocks*, or 0 if none is larger.

    Besides the block labels themselves, every instruction whose type is
    registered in ``find_max_label_extensions`` contributes the value that
    its registered function returns for it.
    """
    highest = 0
    for label, block in blocks.items():
        for inst in block.body:
            for node_type, label_finder in find_max_label_extensions.items():
                if isinstance(inst, node_type):
                    highest = max(highest, label_finder(inst))
        highest = max(highest, label)
    return highest
def remove_args(blocks):
    """Drop, in every block, the assignments whose value is an ir.Arg node.

    Each block gets a new ``body`` list; all other statements are kept in
    their original order.
    """
    def _assigns_arg(stmt):
        return isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Arg)

    for block in blocks.values():
        block.body = [stmt for stmt in block.body if not _assigns_arg(stmt)]
    return
def remove_dead_block(block, lives, call_table, arg_aliases, alias_map,
                      alias_set, func_ir, typemap):
    """remove dead code using liveness info.
    Mutable arguments (e.g. arrays) that are not definitely assigned are live
    after return of function.

    The statements of *block* (all but the terminator) are walked in reverse
    order while *lives*, the set of live variable names at the end of the
    block, is updated in place.  ``block.body`` is replaced by the list of
    surviving statements.

    Returns True if at least one statement was removed, False otherwise.
    """
    # TODO: find mutable args that are not definitely assigned instead of
    # assuming all args are live after return
    removed = False

    # add statements in reverse order
    new_body = [block.terminator]
    # for each statement in reverse order, excluding terminator
    for stmt in reversed(block.body[:-1]):
        if config.DEBUG_ARRAY_OPT >= 2:
            print("remove_dead_block", stmt)
        # aliases of lives are also live
        alias_lives = set()
        init_alias_lives = lives & alias_set
        for v in init_alias_lives:
            alias_lives |= alias_map[v]
        lives_n_aliases = lives | alias_lives | arg_aliases

        # let external calls handle stmt if type matches
        if type(stmt) in remove_dead_extensions:
            f = remove_dead_extensions[type(stmt)]
            stmt = f(stmt, lives, lives_n_aliases, arg_aliases, alias_map,
                     func_ir, typemap)
            # the handler signals removal by returning None
            if stmt is None:
                if config.DEBUG_ARRAY_OPT >= 2:
                    print("Statement was removed.")
                removed = True
                continue

        # ignore assignments that their lhs is not live or lhs==rhs
        if isinstance(stmt, ir.Assign):
            lhs = stmt.target
            rhs = stmt.value
            if lhs.name not in lives and has_no_side_effect(
                    rhs, lives_n_aliases, call_table):
                if config.DEBUG_ARRAY_OPT >= 2:
                    print("Statement was removed.")
                removed = True
                continue
            if isinstance(rhs, ir.Var) and lhs.name == rhs.name:
                if config.DEBUG_ARRAY_OPT >= 2:
                    print("Statement was removed.")
                removed = True
                continue
            # TODO: remove other nodes like SetItem etc.

        if isinstance(stmt, ir.Del):
            if stmt.value not in lives:
                if config.DEBUG_ARRAY_OPT >= 2:
                    print("Statement was removed.")
                removed = True
                continue

        if isinstance(stmt, ir.SetItem):
            name = stmt.target.name
            if name not in lives_n_aliases:
                if config.DEBUG_ARRAY_OPT >= 2:
                    print("Statement was removed.")
                # Dropping the statement changes the block, so it has to be
                # reported like every other removal; otherwise callers that
                # loop or re-run post-processing on the result stop early.
                removed = True
                continue

        if type(stmt) in analysis.ir_extension_usedefs:
            def_func = analysis.ir_extension_usedefs[type(stmt)]
            uses, defs = def_func(stmt)
            lives -= defs
            lives |= uses
        else:
            lives |= {v.name for v in stmt.list_vars()}
            if isinstance(stmt, ir.Assign):
                # lhs was bound in the ir.Assign branch above for this stmt
                # make sure lhs is not used in rhs, e.g. a = g(a)
                if isinstance(stmt.value, ir.Expr):
                    rhs_vars = {v.name for v in stmt.value.list_vars()}
                    if lhs.name not in rhs_vars:
                        lives.remove(lhs.name)
                else:
                    lives.remove(lhs.name)

        new_body.append(stmt)
    new_body.reverse()
    block.body = new_body
    return removed
+ """ + from numba.parfors import array_analysis, parfor + from numba.misc.special import prange + if isinstance(rhs, ir.Expr) and rhs.op == 'call': + func_name = rhs.func.name + if func_name not in call_table or call_table[func_name] == []: + return False + call_list = call_table[func_name] + if (call_list == ['empty', numpy] or + call_list == [slice] or + call_list == ['stencil', numba] or + call_list == ['log', numpy] or + call_list == ['dtype', numpy] or + call_list == [array_analysis.wrap_index] or + call_list == [prange] or + call_list == ['prange', numba] or + call_list == [parfor.internal_prange]): + return True + elif (isinstance(call_list[0], _Intrinsic) and + (call_list[0]._name == 'empty_inferred' or + call_list[0]._name == 'unsafe_empty_inferred')): + return True + from numba.core.registry import CPUDispatcher + from numba.np.linalg import dot_3_mv_check_args + if isinstance(call_list[0], CPUDispatcher): + py_func = call_list[0].py_func + if py_func == dot_3_mv_check_args: + return True + for f in remove_call_handlers: + if f(rhs, lives, call_list): + return True + return False + if isinstance(rhs, ir.Expr) and rhs.op == 'inplace_binop': + return rhs.lhs.name not in lives + if isinstance(rhs, ir.Yield): + return False + if isinstance(rhs, ir.Expr) and rhs.op == 'pair_first': + # don't remove pair_first since prange looks for it + return False + return True + +is_pure_extensions = [] + +def is_pure(rhs, lives, call_table): + """ Returns True if every time this expression is evaluated it + returns the same result. This is not the case for things + like calls to numpy.random. 
+ """ + if isinstance(rhs, ir.Expr): + if rhs.op == 'call': + func_name = rhs.func.name + if func_name not in call_table or call_table[func_name] == []: + return False + call_list = call_table[func_name] + if (call_list == [slice] or + call_list == ['log', numpy] or + call_list == ['empty', numpy]): + return True + for f in is_pure_extensions: + if f(rhs, lives, call_list): + return True + return False + elif rhs.op == 'getiter' or rhs.op == 'iternext': + return False + if isinstance(rhs, ir.Yield): + return False + return True + +def is_const_call(module_name, func_name): + # Returns True if there is no state in the given module changed by the given function. + if module_name == 'numpy': + if func_name in ['empty']: + return True + return False + +alias_analysis_extensions = {} +alias_func_extensions = {} + +def get_canonical_alias(v, alias_map): + if v not in alias_map: + return v + + v_aliases = sorted(list(alias_map[v])) + return v_aliases[0] + +def find_potential_aliases(blocks, args, typemap, func_ir, alias_map=None, + arg_aliases=None): + "find all array aliases and argument aliases to avoid remove as dead" + if alias_map is None: + alias_map = {} + if arg_aliases is None: + arg_aliases = set(a for a in args if not is_immutable_type(a, typemap)) + + # update definitions since they are not guaranteed to be up-to-date + # FIXME keep definitions up-to-date to avoid the need for rebuilding + func_ir._definitions = build_definitions(func_ir.blocks) + np_alias_funcs = ['ravel', 'transpose', 'reshape'] + + for bl in blocks.values(): + for instr in bl.body: + if type(instr) in alias_analysis_extensions: + f = alias_analysis_extensions[type(instr)] + f(instr, args, typemap, func_ir, alias_map, arg_aliases) + if isinstance(instr, ir.Assign): + expr = instr.value + lhs = instr.target.name + # only mutable types can alias + if is_immutable_type(lhs, typemap): + continue + if isinstance(expr, ir.Var) and lhs!=expr.name: + _add_alias(lhs, expr.name, alias_map, 
arg_aliases) + # subarrays like A = B[0] for 2D B + if (isinstance(expr, ir.Expr) and (expr.op == 'cast' or + expr.op in ['getitem', 'static_getitem'])): + _add_alias(lhs, expr.value.name, alias_map, arg_aliases) + if isinstance(expr, ir.Expr) and expr.op == 'inplace_binop': + _add_alias(lhs, expr.lhs.name, alias_map, arg_aliases) + # array attributes like A.T + if (isinstance(expr, ir.Expr) and expr.op == 'getattr' + and expr.attr in ['T', 'ctypes', 'flat']): + _add_alias(lhs, expr.value.name, alias_map, arg_aliases) + # a = b.c. a should alias b + if (isinstance(expr, ir.Expr) and expr.op == 'getattr' + and expr.attr not in ['shape'] + and expr.value.name in arg_aliases): + _add_alias(lhs, expr.value.name, alias_map, arg_aliases) + # calls that can create aliases such as B = A.ravel() + if isinstance(expr, ir.Expr) and expr.op == 'call': + fdef = guard(find_callname, func_ir, expr, typemap) + # TODO: sometimes gufunc backend creates duplicate code + # causing find_callname to fail. Example: test_argmax + # ignored here since those cases don't create aliases + # but should be fixed in general + if fdef is None: + continue + fname, fmod = fdef + if fdef in alias_func_extensions: + alias_func = alias_func_extensions[fdef] + alias_func(lhs, expr.args, alias_map, arg_aliases) + if fmod == 'numpy' and fname in np_alias_funcs: + _add_alias(lhs, expr.args[0].name, alias_map, arg_aliases) + if isinstance(fmod, ir.Var) and fname in np_alias_funcs: + _add_alias(lhs, fmod.name, alias_map, arg_aliases) + + # copy to avoid changing size during iteration + old_alias_map = copy.deepcopy(alias_map) + # combine all aliases transitively + for v in old_alias_map: + for w in old_alias_map[v]: + alias_map[v] |= alias_map[w] + for w in old_alias_map[v]: + alias_map[w] = alias_map[v] + + return alias_map, arg_aliases + +def _add_alias(lhs, rhs, alias_map, arg_aliases): + if rhs in arg_aliases: + arg_aliases.add(lhs) + else: + if rhs not in alias_map: + alias_map[rhs] = set() + if lhs 
not in alias_map: + alias_map[lhs] = set() + alias_map[rhs].add(lhs) + alias_map[lhs].add(rhs) + return + +def is_immutable_type(var, typemap): + # Conservatively, assume mutable if type not available + if typemap is None or var not in typemap: + return False + typ = typemap[var] + # TODO: add more immutable types + if isinstance(typ, (types.Number, types.scalars._NPDatetimeBase, + types.iterators.RangeType)): + return True + if typ==types.string: + return True + # conservatively, assume mutable + return False + +def copy_propagate(blocks, typemap): + """compute copy propagation information for each block using fixed-point + iteration on data flow equations: + in_b = intersect(predec(B)) + out_b = gen_b | (in_b - kill_b) + """ + cfg = compute_cfg_from_blocks(blocks) + entry = cfg.entry_point() + + # format: dict of block labels to copies as tuples + # label -> (l,r) + c_data = init_copy_propagate_data(blocks, entry, typemap) + (gen_copies, all_copies, kill_copies, in_copies, out_copies) = c_data + + old_point = None + new_point = copy.deepcopy(out_copies) + # comparison works since dictionary of built-in types + while old_point != new_point: + for label in blocks.keys(): + if label == entry: + continue + predecs = [i for i, _d in cfg.predecessors(label)] + # in_b = intersect(predec(B)) + in_copies[label] = out_copies[predecs[0]].copy() + for p in predecs: + in_copies[label] &= out_copies[p] + + # out_b = gen_b | (in_b - kill_b) + out_copies[label] = (gen_copies[label] + | (in_copies[label] - kill_copies[label])) + old_point = new_point + new_point = copy.deepcopy(out_copies) + if config.DEBUG_ARRAY_OPT >= 1: + print("copy propagate out_copies:", out_copies) + return in_copies, out_copies + + +def init_copy_propagate_data(blocks, entry, typemap): + """get initial condition of copy propagation data flow for each block. 
+ """ + # gen is all definite copies, extra_kill is additional ones that may hit + # for example, parfors can have control flow so they may hit extra copies + gen_copies, extra_kill = get_block_copies(blocks, typemap) + # set of all program copies + all_copies = set() + for l, s in gen_copies.items(): + all_copies |= gen_copies[l] + kill_copies = {} + for label, gen_set in gen_copies.items(): + kill_copies[label] = set() + for lhs, rhs in all_copies: + if lhs in extra_kill[label] or rhs in extra_kill[label]: + kill_copies[label].add((lhs, rhs)) + # a copy is killed if it is not in this block and lhs or rhs are + # assigned in this block + assigned = {lhs for lhs, rhs in gen_set} + if ((lhs, rhs) not in gen_set + and (lhs in assigned or rhs in assigned)): + kill_copies[label].add((lhs, rhs)) + # set initial values + # all copies are in for all blocks except entry + in_copies = {l: all_copies.copy() for l in blocks.keys()} + in_copies[entry] = set() + out_copies = {} + for label in blocks.keys(): + # out_b = gen_b | (in_b - kill_b) + out_copies[label] = (gen_copies[label] + | (in_copies[label] - kill_copies[label])) + out_copies[entry] = gen_copies[entry] + return (gen_copies, all_copies, kill_copies, in_copies, out_copies) + + +# other packages that define new nodes add calls to get copies in them +# format: {type:function} +copy_propagate_extensions = {} + + +def get_block_copies(blocks, typemap): + """get copies generated and killed by each block + """ + block_copies = {} + extra_kill = {} + for label, block in blocks.items(): + assign_dict = {} + extra_kill[label] = set() + # assignments as dict to replace with latest value + for stmt in block.body: + for T, f in copy_propagate_extensions.items(): + if isinstance(stmt, T): + gen_set, kill_set = f(stmt, typemap) + for lhs, rhs in gen_set: + assign_dict[lhs] = rhs + # if a=b is in dict and b is killed, a is also killed + new_assign_dict = {} + for l, r in assign_dict.items(): + if l not in kill_set and r not in 
kill_set: + new_assign_dict[l] = r + if r in kill_set: + extra_kill[label].add(l) + assign_dict = new_assign_dict + extra_kill[label] |= kill_set + if isinstance(stmt, ir.Assign): + lhs = stmt.target.name + if isinstance(stmt.value, ir.Var): + rhs = stmt.value.name + # copy is valid only if same type (see + # TestCFunc.test_locals) + # Some transformations can produce assignments of the + # form A = A. We don't put these mapping in the + # copy propagation set because then you get cycles and + # infinite loops in the replacement phase. + if typemap[lhs] == typemap[rhs] and lhs != rhs: + assign_dict[lhs] = rhs + continue + if isinstance(stmt.value, + ir.Expr) and stmt.value.op == 'inplace_binop': + in1_var = stmt.value.lhs.name + in1_typ = typemap[in1_var] + # inplace_binop assigns first operand if mutable + if not (isinstance(in1_typ, types.Number) + or in1_typ == types.string): + extra_kill[label].add(in1_var) + # if a=b is in dict and b is killed, a is also killed + new_assign_dict = {} + for l, r in assign_dict.items(): + if l != in1_var and r != in1_var: + new_assign_dict[l] = r + if r == in1_var: + extra_kill[label].add(l) + assign_dict = new_assign_dict + extra_kill[label].add(lhs) + block_cps = set(assign_dict.items()) + block_copies[label] = block_cps + return block_copies, extra_kill + + +# other packages that define new nodes add calls to apply copy propagate in them +# format: {type:function} +apply_copy_propagate_extensions = {} + + +def apply_copy_propagate(blocks, in_copies, name_var_table, typemap, calltypes, + save_copies=None): + """apply copy propagation to IR: replace variables when copies available""" + # save_copies keeps an approximation of the copies that were applied, so + # that the variable names of removed user variables can be recovered to some + # extent. 
+ if save_copies is None: + save_copies = [] + + for label, block in blocks.items(): + var_dict = {l: name_var_table[r] for l, r in in_copies[label]} + # assignments as dict to replace with latest value + for stmt in block.body: + if type(stmt) in apply_copy_propagate_extensions: + f = apply_copy_propagate_extensions[type(stmt)] + f(stmt, var_dict, name_var_table, + typemap, calltypes, save_copies) + # only rhs of assignments should be replaced + # e.g. if x=y is available, x in x=z shouldn't be replaced + elif isinstance(stmt, ir.Assign): + stmt.value = replace_vars_inner(stmt.value, var_dict) + else: + replace_vars_stmt(stmt, var_dict) + fix_setitem_type(stmt, typemap, calltypes) + for T, f in copy_propagate_extensions.items(): + if isinstance(stmt, T): + gen_set, kill_set = f(stmt, typemap) + for lhs, rhs in gen_set: + if rhs in name_var_table: + var_dict[lhs] = name_var_table[rhs] + for l, r in var_dict.copy().items(): + if l in kill_set or r.name in kill_set: + var_dict.pop(l) + if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Var): + lhs = stmt.target.name + rhs = stmt.value.name + # rhs could be replaced with lhs from previous copies + if lhs != rhs: + # copy is valid only if same type (see + # TestCFunc.test_locals) + if typemap[lhs] == typemap[rhs] and rhs in name_var_table: + var_dict[lhs] = name_var_table[rhs] + else: + var_dict.pop(lhs, None) + # a=b kills previous t=a + lhs_kill = [] + for k, v in var_dict.items(): + if v.name == lhs: + lhs_kill.append(k) + for k in lhs_kill: + var_dict.pop(k, None) + if (isinstance(stmt, ir.Assign) + and not isinstance(stmt.value, ir.Var)): + lhs = stmt.target.name + var_dict.pop(lhs, None) + # previous t=a is killed if a is killed + lhs_kill = [] + for k, v in var_dict.items(): + if v.name == lhs: + lhs_kill.append(k) + for k in lhs_kill: + var_dict.pop(k, None) + save_copies.extend(var_dict.items()) + + return save_copies + +def fix_setitem_type(stmt, typemap, calltypes): + """Copy propagation can 
replace setitem target variable, which can be array + with 'A' layout. The replaced variable can be 'C' or 'F', so we update + setitem call type reflect this (from matrix power test) + """ + if not isinstance(stmt, (ir.SetItem, ir.StaticSetItem)): + return + t_typ = typemap[stmt.target.name] + s_typ = calltypes[stmt].args[0] + # test_optional t_typ can be Optional with array + if not isinstance( + s_typ, + types.npytypes.Array) or not isinstance( + t_typ, + types.npytypes.Array): + return + if s_typ.layout == 'A' and t_typ.layout != 'A': + new_s_typ = s_typ.copy(layout=t_typ.layout) + calltypes[stmt].args = ( + new_s_typ, + calltypes[stmt].args[1], + calltypes[stmt].args[2]) + return + + +def dprint_func_ir(func_ir, title, blocks=None): + """Debug print function IR, with an optional blocks argument + that may differ from the IR's original blocks. + """ + if config.DEBUG_ARRAY_OPT >= 1: + ir_blocks = func_ir.blocks + func_ir.blocks = ir_blocks if blocks == None else blocks + name = func_ir.func_id.func_qualname + print(("IR %s: %s" % (title, name)).center(80, "-")) + func_ir.dump() + print("-" * 40) + func_ir.blocks = ir_blocks + + +def find_topo_order(blocks, cfg = None): + """find topological order of blocks such that true branches are visited + first (e.g. for_break test in test_dataflow). 
+ """ + if cfg is None: + cfg = compute_cfg_from_blocks(blocks) + post_order = [] + seen = set() + + def _dfs_rec(node): + if node not in seen: + seen.add(node) + succs = cfg._succs[node] + last_inst = blocks[node].body[-1] + if isinstance(last_inst, ir.Branch): + succs = [last_inst.falsebr, last_inst.truebr] + for dest in succs: + if (node, dest) not in cfg._back_edges: + _dfs_rec(dest) + post_order.append(node) + + _dfs_rec(cfg.entry_point()) + post_order.reverse() + return post_order + + +# other packages that define new nodes add calls to get call table +# format: {type:function} +call_table_extensions = {} + + +def get_call_table(blocks, call_table=None, reverse_call_table=None, topological_ordering=True): + """returns a dictionary of call variables and their references. + """ + # call_table example: c = np.zeros becomes c:["zeroes", np] + # reverse_call_table example: c = np.zeros becomes np_var:c + if call_table is None: + call_table = {} + if reverse_call_table is None: + reverse_call_table = {} + + if topological_ordering: + order = find_topo_order(blocks) + else: + order = list(blocks.keys()) + + for label in reversed(order): + for inst in reversed(blocks[label].body): + if isinstance(inst, ir.Assign): + lhs = inst.target.name + rhs = inst.value + if isinstance(rhs, ir.Expr) and rhs.op == 'call': + call_table[rhs.func.name] = [] + if isinstance(rhs, ir.Expr) and rhs.op == 'getattr': + if lhs in call_table: + call_table[lhs].append(rhs.attr) + reverse_call_table[rhs.value.name] = lhs + if lhs in reverse_call_table: + call_var = reverse_call_table[lhs] + call_table[call_var].append(rhs.attr) + reverse_call_table[rhs.value.name] = call_var + if isinstance(rhs, ir.Global): + if lhs in call_table: + call_table[lhs].append(rhs.value) + if lhs in reverse_call_table: + call_var = reverse_call_table[lhs] + call_table[call_var].append(rhs.value) + if isinstance(rhs, ir.FreeVar): + if lhs in call_table: + call_table[lhs].append(rhs.value) + if lhs in 
reverse_call_table: + call_var = reverse_call_table[lhs] + call_table[call_var].append(rhs.value) + if isinstance(rhs, ir.Var): + if lhs in call_table: + call_table[lhs].append(rhs.name) + reverse_call_table[rhs.name] = lhs + if lhs in reverse_call_table: + call_var = reverse_call_table[lhs] + call_table[call_var].append(rhs.name) + for T, f in call_table_extensions.items(): + if isinstance(inst, T): + f(inst, call_table, reverse_call_table) + return call_table, reverse_call_table + + +# other packages that define new nodes add calls to get tuple table +# format: {type:function} +tuple_table_extensions = {} + + +def get_tuple_table(blocks, tuple_table=None): + """returns a dictionary of tuple variables and their values. + """ + if tuple_table is None: + tuple_table = {} + + for block in blocks.values(): + for inst in block.body: + if isinstance(inst, ir.Assign): + lhs = inst.target.name + rhs = inst.value + if isinstance(rhs, ir.Expr) and rhs.op == 'build_tuple': + tuple_table[lhs] = rhs.items + if isinstance(rhs, ir.Const) and isinstance(rhs.value, tuple): + tuple_table[lhs] = rhs.value + for T, f in tuple_table_extensions.items(): + if isinstance(inst, T): + f(inst, tuple_table) + return tuple_table + + +def get_stmt_writes(stmt): + writes = set() + if isinstance(stmt, (ir.Assign, ir.SetItem, ir.StaticSetItem)): + writes.add(stmt.target.name) + return writes + + +def rename_labels(blocks): + """rename labels of function body blocks according to topological sort. + The set of labels of these blocks will remain unchanged. 
+ """ + topo_order = find_topo_order(blocks) + + # make a block with return last if available (just for readability) + return_label = -1 + for l, b in blocks.items(): + if isinstance(b.body[-1], ir.Return): + return_label = l + # some cases like generators can have no return blocks + if return_label != -1: + topo_order.remove(return_label) + topo_order.append(return_label) + + label_map = {} + all_labels = sorted(topo_order, reverse=True) + for label in topo_order: + label_map[label] = all_labels.pop() + # update target labels in jumps/branches + for b in blocks.values(): + term = b.terminator + if isinstance(term, ir.Jump): + term.target = label_map[term.target] + if isinstance(term, ir.Branch): + term.truebr = label_map[term.truebr] + term.falsebr = label_map[term.falsebr] + # update blocks dictionary keys + new_blocks = {} + for k, b in blocks.items(): + new_label = label_map[k] + new_blocks[new_label] = b + + return new_blocks + + +def simplify_CFG(blocks): + """transform chains of blocks that have no loop into a single block""" + # first, inline single-branch-block to its predecessors + cfg = compute_cfg_from_blocks(blocks) + def find_single_branch(label): + block = blocks[label] + return len(block.body) == 1 and isinstance(block.body[0], ir.Branch) + single_branch_blocks = list(filter(find_single_branch, blocks.keys())) + marked_for_del = set() + for label in single_branch_blocks: + inst = blocks[label].body[0] + predecessors = cfg.predecessors(label) + delete_block = True + for (p, q) in predecessors: + block = blocks[p] + if isinstance(block.body[-1], ir.Jump): + block.body[-1] = copy.copy(inst) + else: + delete_block = False + if delete_block: + marked_for_del.add(label) + # Delete marked labels + for label in marked_for_del: + del blocks[label] + merge_adjacent_blocks(blocks) + return rename_labels(blocks) + + +arr_math = ['min', 'max', 'sum', 'prod', 'mean', 'var', 'std', + 'cumsum', 'cumprod', 'argmax', 'argmin', 'argsort', + 'nonzero', 'ravel'] + + 
+def canonicalize_array_math(func_ir, typemap, calltypes, typingctx): + # save array arg to call + # call_varname -> array + blocks = func_ir.blocks + saved_arr_arg = {} + topo_order = find_topo_order(blocks) + for label in topo_order: + block = blocks[label] + new_body = [] + for stmt in block.body: + if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Expr): + lhs = stmt.target.name + rhs = stmt.value + # replace A.func with np.func, and save A in saved_arr_arg + if (rhs.op == 'getattr' and rhs.attr in arr_math + and isinstance( + typemap[rhs.value.name], types.npytypes.Array)): + rhs = stmt.value + arr = rhs.value + saved_arr_arg[lhs] = arr + scope = arr.scope + loc = arr.loc + # g_np_var = Global(numpy) + g_np_var = ir.Var(scope, mk_unique_var("$np_g_var"), loc) + typemap[g_np_var.name] = types.misc.Module(numpy) + g_np = ir.Global('np', numpy, loc) + g_np_assign = ir.Assign(g_np, g_np_var, loc) + rhs.value = g_np_var + new_body.append(g_np_assign) + func_ir._definitions[g_np_var.name] = [g_np] + # update func var type + func = getattr(numpy, rhs.attr) + func_typ = get_np_ufunc_typ(func) + typemap.pop(lhs) + typemap[lhs] = func_typ + if rhs.op == 'call' and rhs.func.name in saved_arr_arg: + # add array as first arg + arr = saved_arr_arg[rhs.func.name] + # update call type signature to include array arg + old_sig = calltypes.pop(rhs) + # argsort requires kws for typing so sig.args can't be used + # reusing sig.args since some types become Const in sig + argtyps = old_sig.args[:len(rhs.args)] + kwtyps = {name: typemap[v.name] for name, v in rhs.kws} + calltypes[rhs] = typemap[rhs.func.name].get_call_type( + typingctx, [typemap[arr.name]] + list(argtyps), kwtyps) + rhs.args = [arr] + rhs.args + + new_body.append(stmt) + block.body = new_body + return + + +# format: {type:function} +array_accesses_extensions = {} + + +def get_array_accesses(blocks, accesses=None): + """returns a set of arrays accessed and their indices. 
+ """ + if accesses is None: + accesses = set() + + for block in blocks.values(): + for inst in block.body: + if isinstance(inst, ir.SetItem): + accesses.add((inst.target.name, inst.index.name)) + if isinstance(inst, ir.StaticSetItem): + accesses.add((inst.target.name, inst.index_var.name)) + if isinstance(inst, ir.Assign): + lhs = inst.target.name + rhs = inst.value + if isinstance(rhs, ir.Expr) and rhs.op == 'getitem': + accesses.add((rhs.value.name, rhs.index.name)) + if isinstance(rhs, ir.Expr) and rhs.op == 'static_getitem': + index = rhs.index + # slice is unhashable, so just keep the variable + if index is None or is_slice_index(index): + index = rhs.index_var.name + accesses.add((rhs.value.name, index)) + for T, f in array_accesses_extensions.items(): + if isinstance(inst, T): + f(inst, accesses) + return accesses + +def is_slice_index(index): + """see if index is a slice index or has slice in it""" + if isinstance(index, slice): + return True + if isinstance(index, tuple): + for i in index: + if isinstance(i, slice): + return True + return False + +def merge_adjacent_blocks(blocks): + cfg = compute_cfg_from_blocks(blocks) + # merge adjacent blocks + removed = set() + for label in list(blocks.keys()): + if label in removed: + continue + block = blocks[label] + succs = list(cfg.successors(label)) + while True: + if len(succs) != 1: + break + next_label = succs[0][0] + if next_label in removed: + break + preds = list(cfg.predecessors(next_label)) + succs = list(cfg.successors(next_label)) + if len(preds) != 1 or preds[0][0] != label: + break + next_block = blocks[next_label] + # XXX: commented out since scope objects are not consistent + # throughout the compiler. for example, pieces of code are compiled + # and inlined on the fly without proper scope merge. 
+ # if block.scope != next_block.scope: + # break + # merge + block.body.pop() # remove Jump + block.body += next_block.body + del blocks[next_label] + removed.add(next_label) + label = next_label + + +def restore_copy_var_names(blocks, save_copies, typemap): + """ + restores variable names of user variables after applying copy propagation + """ + if not save_copies: + return {} + + rename_dict = {} + var_rename_map = {} + for (a, b) in save_copies: + # a is string name, b is variable + # if a is user variable and b is generated temporary and b is not + # already renamed + if (not a.startswith('$') and b.name.startswith('$') + and b.name not in rename_dict): + new_name = mk_unique_var('${}'.format(a)); + rename_dict[b.name] = new_name + var_rename_map[new_name] = a + typ = typemap.pop(b.name) + typemap[new_name] = typ + + replace_var_names(blocks, rename_dict) + return var_rename_map + + +def simplify(func_ir, typemap, calltypes, metadata): + # get copies in to blocks and out from blocks + in_cps, _ = copy_propagate(func_ir.blocks, typemap) + # table mapping variable names to ir.Var objects to help replacement + name_var_table = get_name_var_table(func_ir.blocks) + save_copies = apply_copy_propagate( + func_ir.blocks, + in_cps, + name_var_table, + typemap, + calltypes) + var_rename_map = restore_copy_var_names(func_ir.blocks, save_copies, typemap) + if "var_rename_map" not in metadata: + metadata["var_rename_map"] = {} + metadata["var_rename_map"].update(var_rename_map) + # remove dead code to enable fusion + if config.DEBUG_ARRAY_OPT >= 1: + dprint_func_ir(func_ir, "after copy prop") + remove_dead(func_ir.blocks, func_ir.arg_names, func_ir, typemap) + func_ir.blocks = simplify_CFG(func_ir.blocks) + if config.DEBUG_ARRAY_OPT >= 1: + dprint_func_ir(func_ir, "after simplify") + + +class GuardException(Exception): + pass + + +def require(cond): + """ + Raise GuardException if the given condition is False. 
+ """ + if not cond: + raise GuardException + +def guard(func, *args, **kwargs): + """ + Run a function with given set of arguments, and guard against + any GuardException raised by the function by returning None, + or the expected return results if no such exception was raised. + """ + try: + return func(*args, **kwargs) + except GuardException: + return None + +def get_definition(func_ir, name, **kwargs): + """ + Same as func_ir.get_definition(name), but raise GuardException if + exception KeyError is caught. + """ + try: + return func_ir.get_definition(name, **kwargs) + except KeyError: + raise GuardException + +def build_definitions(blocks, definitions=None): + """Build the definitions table of the given blocks by scanning + through all blocks and instructions, useful when the definitions + table is out-of-sync. + Will return a new definition table if one is not passed. + """ + if definitions is None: + definitions = collections.defaultdict(list) + + for block in blocks.values(): + for inst in block.body: + if isinstance(inst, ir.Assign): + name = inst.target.name + definition = definitions.get(name, []) + if definition == []: + definitions[name] = definition + definition.append(inst.value) + if type(inst) in build_defs_extensions: + f = build_defs_extensions[type(inst)] + f(inst, definitions) + + return definitions + +build_defs_extensions = {} + +def find_callname(func_ir, expr, typemap=None, definition_finder=get_definition): + """Try to find a call expression's function and module names and return + them as strings for unbounded calls. If the call is a bounded call, return + the self object instead of module name. Raise GuardException if failed. + + Providing typemap can make the call matching more accurate in corner cases + such as bounded call on an object which is inside another object. 
+ """ + require(isinstance(expr, ir.Expr) and expr.op == 'call') + callee = expr.func + callee_def = definition_finder(func_ir, callee) + attrs = [] + obj = None + while True: + if isinstance(callee_def, (ir.Global, ir.FreeVar)): + # require(callee_def.value == numpy) + # these checks support modules like numpy, numpy.random as well as + # calls like len() and intrinsics like assertEquiv + keys = ['name', '_name', '__name__'] + value = None + for key in keys: + if hasattr(callee_def.value, key): + value = getattr(callee_def.value, key) + break + if not value or not isinstance(value, str): + raise GuardException + attrs.append(value) + def_val = callee_def.value + # get the underlying definition of Intrinsic object to be able to + # find the module effectively. + # Otherwise, it will return numba.extending + if isinstance(def_val, _Intrinsic): + def_val = def_val._defn + if hasattr(def_val, '__module__'): + mod_name = def_val.__module__ + # The reason for first checking if the function is in NumPy's + # top level name space by module is that some functions are + # deprecated in NumPy but the functions' names are aliased with + # other common names. This prevents deprecation warnings on + # e.g. getattr(numpy, 'bool') were a bool the target. + # For context see #6175, impacts NumPy>=1.20. 
+ mod_not_none = mod_name is not None + numpy_toplevel = (mod_not_none and + (mod_name == 'numpy' + or mod_name.startswith('numpy.'))) + # it might be a numpy function imported directly + if (numpy_toplevel and hasattr(numpy, value) + and def_val == getattr(numpy, value)): + attrs += ['numpy'] + # it might be a np.random function imported directly + elif (hasattr(numpy.random, value) + and def_val == getattr(numpy.random, value)): + attrs += ['random', 'numpy'] + elif mod_not_none: + attrs.append(mod_name) + else: + class_name = def_val.__class__.__name__ + if class_name == 'builtin_function_or_method': + class_name = 'builtin' + if class_name != 'module': + attrs.append(class_name) + break + elif isinstance(callee_def, ir.Expr) and callee_def.op == 'getattr': + obj = callee_def.value + attrs.append(callee_def.attr) + if typemap and obj.name in typemap: + typ = typemap[obj.name] + if not isinstance(typ, types.Module): + return attrs[0], obj + callee_def = definition_finder(func_ir, obj) + else: + # obj.func calls where obj is not np array + if obj is not None: + return '.'.join(reversed(attrs)), obj + raise GuardException + return attrs[0], '.'.join(reversed(attrs[1:])) + +def find_build_sequence(func_ir, var): + """Check if a variable is constructed via build_tuple or + build_list or build_set, and return the sequence and the + operator, or raise GuardException otherwise. + Note: only build_tuple is immutable, so use with care. + """ + require(isinstance(var, ir.Var)) + var_def = get_definition(func_ir, var) + require(isinstance(var_def, ir.Expr)) + build_ops = ['build_tuple', 'build_list', 'build_set'] + require(var_def.op in build_ops) + return var_def.items, var_def.op + +def find_const(func_ir, var): + """Check if a variable is defined as constant, and return + the constant value, or raise GuardException otherwise. 
+ """ + require(isinstance(var, ir.Var)) + var_def = get_definition(func_ir, var) + require(isinstance(var_def, (ir.Const, ir.Global, ir.FreeVar))) + return var_def.value + +def compile_to_numba_ir(mk_func, glbls, typingctx=None, targetctx=None, + arg_typs=None, typemap=None, calltypes=None): + """ + Compile a function or a make_function node to Numba IR. + + Rename variables and + labels to avoid conflict if inlined somewhere else. Perform type inference + if typingctx and other typing inputs are available and update typemap and + calltypes. + """ + from numba.core import typed_passes + # mk_func can be actual function or make_function node, or a njit function + if hasattr(mk_func, 'code'): + code = mk_func.code + elif hasattr(mk_func, '__code__'): + code = mk_func.__code__ + else: + raise NotImplementedError("function type not recognized {}".format(mk_func)) + f_ir = get_ir_of_code(glbls, code) + remove_dels(f_ir.blocks) + + # relabel by adding an offset + f_ir.blocks = add_offset_to_labels(f_ir.blocks, _the_max_label.next()) + max_label = max(f_ir.blocks.keys()) + _the_max_label.update(max_label) + + # rename all variables to avoid conflict + var_table = get_name_var_table(f_ir.blocks) + new_var_dict = {} + for name, var in var_table.items(): + new_var_dict[name] = mk_unique_var(name) + replace_var_names(f_ir.blocks, new_var_dict) + + # perform type inference if typingctx is available and update type + # data structures typemap and calltypes + if typingctx: + f_typemap, f_return_type, f_calltypes, _ = typed_passes.type_inference_stage( + typingctx, targetctx, f_ir, arg_typs, None) + # remove argument entries like arg.a from typemap + arg_names = [vname for vname in f_typemap if vname.startswith("arg.")] + for a in arg_names: + f_typemap.pop(a) + typemap.update(f_typemap) + calltypes.update(f_calltypes) + return f_ir + +def _create_function_from_code_obj(fcode, func_env, func_arg, func_clo, glbls): + """ + Creates a function from a code object. 
Args: + * fcode - the code object + * func_env - string for the freevar placeholders + * func_arg - string for the function args (e.g. "a, b, c, d=None") + * func_clo - string for the closure args + * glbls - the function globals + """ + sanitized_co_name = fcode.co_name.replace('<', '_').replace('>', '_') + func_text = (f"def closure():\n{func_env}\n" + f"\tdef {sanitized_co_name}({func_arg}):\n" + f"\t\treturn ({func_clo})\n" + f"\treturn {sanitized_co_name}") + loc = {} + exec(func_text, glbls, loc) + + f = loc['closure']() + # replace the code body + f.__code__ = fcode + f.__name__ = fcode.co_name + return f + +def get_ir_of_code(glbls, fcode): + """ + Compile a code object to get its IR, ir.Del nodes are emitted + """ + nfree = len(fcode.co_freevars) + func_env = "\n".join(["\tc_%d = None" % i for i in range(nfree)]) + func_clo = ",".join(["c_%d" % i for i in range(nfree)]) + func_arg = ",".join(["x_%d" % i for i in range(fcode.co_argcount)]) + + f = _create_function_from_code_obj(fcode, func_env, func_arg, func_clo, + glbls) + + from numba.core import compiler + ir = compiler.run_frontend(f) + # we need to run the before inference rewrite pass to normalize the IR + # XXX: check rewrite pass flag? 
+ # for example, Raise nodes need to become StaticRaise before type inference + class DummyPipeline(object): + def __init__(self, f_ir): + self.state = compiler.StateDict() + self.state.typingctx = None + self.state.targetctx = None + self.state.args = None + self.state.func_ir = f_ir + self.state.typemap = None + self.state.return_type = None + self.state.calltypes = None + state = DummyPipeline(ir).state + rewrites.rewrite_registry.apply('before-inference', state) + # call inline pass to handle cases like stencils and comprehensions + swapped = {} # TODO: get this from diagnostics store + import numba.core.inline_closurecall + inline_pass = numba.core.inline_closurecall.InlineClosureCallPass( + ir, numba.core.cpu.ParallelOptions(False), swapped) + inline_pass.run() + + # TODO: DO NOT ADD MORE THINGS HERE! + # If adding more things here is being contemplated, it really is time to + # retire this function and work on getting the InlineWorker class from + # numba.core.inline_closurecall into sufficient shape as a replacement. + # The issue with `get_ir_of_code` is that it doesn't run a full compilation + # pipeline and as a result various additional things keep needing to be + # added to create valid IR. + + # rebuild IR in SSA form + from numba.core.untyped_passes import ReconstructSSA + from numba.core.typed_passes import PreLowerStripPhis + reconstruct_ssa = ReconstructSSA() + phistrip = PreLowerStripPhis() + reconstruct_ssa.run_pass(state) + phistrip.run_pass(state) + + post_proc = postproc.PostProcessor(ir) + post_proc.run(True) + return ir + +def replace_arg_nodes(block, args): + """ + Replace ir.Arg(...) with variables + """ + for stmt in block.body: + if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Arg): + idx = stmt.value.index + assert(idx < len(args)) + stmt.value = args[idx] + return + + +def replace_returns(blocks, target, return_label): + """ + Return return statement by assigning directly to target, and a jump. 
+ """ + for block in blocks.values(): + # some blocks may be empty during transformations + if not block.body: + continue + stmt = block.terminator + if isinstance(stmt, ir.Return): + block.body.pop() # remove return + cast_stmt = block.body.pop() + assert (isinstance(cast_stmt, ir.Assign) + and isinstance(cast_stmt.value, ir.Expr) + and cast_stmt.value.op == 'cast'), "invalid return cast" + block.body.append(ir.Assign(cast_stmt.value.value, target, stmt.loc)) + block.body.append(ir.Jump(return_label, stmt.loc)) + + +def gen_np_call(func_as_str, func, lhs, args, typingctx, typemap, calltypes): + scope = args[0].scope + loc = args[0].loc + + # g_np_var = Global(numpy) + g_np_var = ir.Var(scope, mk_unique_var("$np_g_var"), loc) + typemap[g_np_var.name] = types.misc.Module(numpy) + g_np = ir.Global('np', numpy, loc) + g_np_assign = ir.Assign(g_np, g_np_var, loc) + # attr call: _attr = getattr(g_np_var, func_as_str) + np_attr_call = ir.Expr.getattr(g_np_var, func_as_str, loc) + attr_var = ir.Var(scope, mk_unique_var("$np_attr_attr"), loc) + func_var_typ = get_np_ufunc_typ(func) + typemap[attr_var.name] = func_var_typ + attr_assign = ir.Assign(np_attr_call, attr_var, loc) + # np call: lhs = np_attr(*args) + np_call = ir.Expr.call(attr_var, args, (), loc) + arg_types = [typemap[x.name] for x in args] + func_typ = func_var_typ.get_call_type(typingctx, arg_types, {}) + calltypes[np_call] = func_typ + np_assign = ir.Assign(np_call, lhs, loc) + return [g_np_assign, attr_assign, np_assign] + +def dump_blocks(blocks): + for label, block in blocks.items(): + print(label, ":") + for stmt in block.body: + print(" ", stmt) + +def is_operator_or_getitem(expr): + """true if expr is unary or binary operator or getitem""" + return (isinstance(expr, ir.Expr) + and getattr(expr, 'op', False) + and expr.op in ['unary', 'binop', 'inplace_binop', 'getitem', 'static_getitem']) + +def is_get_setitem(stmt): + """stmt is getitem assignment or setitem (and static cases)""" + return 
is_getitem(stmt) or is_setitem(stmt) + + +def is_getitem(stmt): + """true if stmt is a getitem or static_getitem assignment""" + return (isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Expr) + and stmt.value.op in ['getitem', 'static_getitem']) + +def is_setitem(stmt): + """true if stmt is a SetItem or StaticSetItem node""" + return isinstance(stmt, (ir.SetItem, ir.StaticSetItem)) + +def index_var_of_get_setitem(stmt): + """get index variable for getitem/setitem nodes (and static cases)""" + if is_getitem(stmt): + if stmt.value.op == 'getitem': + return stmt.value.index + else: + return stmt.value.index_var + + if is_setitem(stmt): + if isinstance(stmt, ir.SetItem): + return stmt.index + else: + return stmt.index_var + + return None + +def set_index_var_of_get_setitem(stmt, new_index): + if is_getitem(stmt): + if stmt.value.op == 'getitem': + stmt.value.index = new_index + else: + stmt.value.index_var = new_index + elif is_setitem(stmt): + if isinstance(stmt, ir.SetItem): + stmt.index = new_index + else: + stmt.index_var = new_index + else: + raise ValueError("getitem or setitem node expected but received {}".format( + stmt)) + + +def is_namedtuple_class(c): + """check if c is a namedtuple class""" + if not isinstance(c, type): + return False + # should have only tuple as superclass + bases = c.__bases__ + if len(bases) != 1 or bases[0] != tuple: + return False + # should have _make method + if not hasattr(c, '_make'): + return False + # should have _fields that is all string + fields = getattr(c, '_fields', None) + if not isinstance(fields, tuple): + return False + return all(isinstance(f, str) for f in fields) + + +def fill_block_with_call(newblock, callee, label_next, inputs, outputs): + """Fill *newblock* to call *callee* with arguments listed in *inputs*. + The returned values are unwrapped into variables in *outputs*. + The block would then jump to *label_next*. 
+ """ + scope = newblock.scope + loc = newblock.loc + + fn = ir.Const(value=callee, loc=loc) + fnvar = scope.make_temp(loc=loc) + newblock.append(ir.Assign(target=fnvar, value=fn, loc=loc)) + # call + args = [scope.get_exact(name) for name in inputs] + callexpr = ir.Expr.call(func=fnvar, args=args, kws=(), loc=loc) + callres = scope.make_temp(loc=loc) + newblock.append(ir.Assign(target=callres, value=callexpr, loc=loc)) + # unpack return value + for i, out in enumerate(outputs): + target = scope.get_exact(out) + getitem = ir.Expr.static_getitem(value=callres, index=i, + index_var=None, loc=loc) + newblock.append(ir.Assign(target=target, value=getitem, loc=loc)) + # jump to next block + newblock.append(ir.Jump(target=label_next, loc=loc)) + return newblock + + +def fill_callee_prologue(block, inputs, label_next): + """ + Fill a new block *block* that unwraps arguments using names in *inputs* and + then jumps to *label_next*. + + Expected to use with *fill_block_with_call()* + """ + scope = block.scope + loc = block.loc + # load args + args = [ir.Arg(name=k, index=i, loc=loc) + for i, k in enumerate(inputs)] + for aname, aval in zip(inputs, args): + tmp = ir.Var(scope=scope, name=aname, loc=loc) + block.append(ir.Assign(target=tmp, value=aval, loc=loc)) + # jump to loop entry + block.append(ir.Jump(target=label_next, loc=loc)) + return block + + +def fill_callee_epilogue(block, outputs): + """ + Fill a new block *block* to prepare the return values. + This block is the last block of the function. 
+ + Expected to use with *fill_block_with_call()* + """ + scope = block.scope + loc = block.loc + # prepare tuples to return + vals = [scope.get_exact(name=name) for name in outputs] + tupexpr = ir.Expr.build_tuple(items=vals, loc=loc) + tup = scope.make_temp(loc=loc) + block.append(ir.Assign(target=tup, value=tupexpr, loc=loc)) + # return + block.append(ir.Return(value=tup, loc=loc)) + return block + + +def find_outer_value(func_ir, var): + """Check if a variable is a global value, and return the value, + or raise GuardException otherwise. + """ + dfn = get_definition(func_ir, var) + if isinstance(dfn, (ir.Global, ir.FreeVar)): + return dfn.value + + if isinstance(dfn, ir.Expr) and dfn.op == 'getattr': + prev_val = find_outer_value(func_ir, dfn.value) + try: + val = getattr(prev_val, dfn.attr) + return val + except AttributeError: + raise GuardException + + raise GuardException + + +def raise_on_unsupported_feature(func_ir, typemap): + """ + Helper function to walk IR and raise if it finds op codes + that are unsupported. Could be extended to cover IR sequences + as well as op codes. Intended use is to call it as a pipeline + stage just prior to lowering to prevent LoweringErrors for known + unsupported features. + """ + gdb_calls = [] # accumulate calls to gdb/gdb_init + + # issue 2195: check for excessively large tuples + for arg_name in func_ir.arg_names: + if arg_name in typemap and \ + isinstance(typemap[arg_name], types.containers.UniTuple) and \ + typemap[arg_name].count > 1000: + # Raise an exception when len(tuple) > 1000. 
The choice of this number (1000) + # was entirely arbitrary + msg = ("Tuple '{}' length must be smaller than 1000.\n" + "Large tuples lead to the generation of a prohibitively large " + "LLVM IR which causes excessive memory pressure " + "and large compile times.\n" + "As an alternative, the use of a 'list' is recommended in " + "place of a 'tuple' as lists do not suffer from this problem.".format(arg_name)) + raise UnsupportedError(msg, func_ir.loc) + + for blk in func_ir.blocks.values(): + for stmt in blk.find_insts(ir.Assign): + # This raises on finding `make_function` + if isinstance(stmt.value, ir.Expr): + if stmt.value.op == 'make_function': + val = stmt.value + + # See if the construct name can be refined + code = getattr(val, 'code', None) + if code is not None: + # check if this is a closure, the co_name will + # be the captured function name which is not + # useful so be explicit + if getattr(val, 'closure', None) is not None: + use = '' + expr = '' + else: + use = code.co_name + expr = '(%s) ' % use + else: + use = '' + expr = '' + + msg = ("Numba encountered the use of a language " + "feature it does not support in this context: " + "%s (op code: make_function not supported). If " + "the feature is explicitly supported it is " + "likely that the result of the expression %s" + "is being used in an unsupported manner.") % \ + (use, expr) + raise UnsupportedError(msg, stmt.value.loc) + + # this checks for gdb initialization calls, only one is permitted + if isinstance(stmt.value, (ir.Global, ir.FreeVar)): + val = stmt.value + val = getattr(val, 'value', None) + if val is None: + continue + + # check global function + found = False + if isinstance(val, pytypes.FunctionType): + found = val in {numba.gdb, numba.gdb_init} + if not found: # freevar bind to intrinsic + found = getattr(val, '_name', "") == "gdb_internal" + if found: + gdb_calls.append(stmt.loc) # report last seen location + + # this checks that np. 
was called if view is called + if isinstance(stmt.value, ir.Expr): + if stmt.value.op == 'getattr' and stmt.value.attr == 'view': + var = stmt.value.value.name + if isinstance(typemap[var], types.Array): + continue + df = func_ir.get_definition(var) + cn = guard(find_callname, func_ir, df) + if cn and cn[1] == 'numpy': + ty = getattr(numpy, cn[0]) + if (numpy.issubdtype(ty, numpy.integer) or + numpy.issubdtype(ty, numpy.floating)): + continue + + vardescr = '' if var.startswith('$') else "'{}' ".format(var) + raise TypingError( + "'view' can only be called on NumPy dtypes, " + "try wrapping the variable {}with 'np.()'". + format(vardescr), loc=stmt.loc) + + # checks for globals that are also reflected + if isinstance(stmt.value, ir.Global): + ty = typemap[stmt.target.name] + msg = ("The use of a %s type, assigned to variable '%s' in " + "globals, is not supported as globals are considered " + "compile-time constants and there is no known way to " + "compile a %s type as a constant.") + if (getattr(ty, 'reflected', False) or + isinstance(ty, (types.DictType, types.ListType))): + raise TypingError(msg % (ty, stmt.value.name, ty), loc=stmt.loc) + + # checks for generator expressions (yield in use when func_ir has + # not been identified as a generator). + if isinstance(stmt.value, ir.Yield) and not func_ir.is_generator: + msg = "The use of generator expressions is unsupported." 
+ raise UnsupportedError(msg, loc=stmt.loc) + + # There is more than one call to function gdb/gdb_init + if len(gdb_calls) > 1: + msg = ("Calling either numba.gdb() or numba.gdb_init() more than once " + "in a function is unsupported (strange things happen!), use " + "numba.gdb_breakpoint() to create additional breakpoints " + "instead.\n\nRelevant documentation is available here:\n" + "https://numba.readthedocs.io/en/stable/user/troubleshoot.html" + "#using-numba-s-direct-gdb-bindings-in-nopython-mode\n\n" + "Conflicting calls found at:\n %s") + buf = '\n'.join([x.strformat() for x in gdb_calls]) + raise UnsupportedError(msg % buf) + + +def warn_deprecated(func_ir, typemap): + # first pass, just walk the type map + for name, ty in typemap.items(): + # the Type Metaclass has a reflected member + if ty.reflected: + # if its an arg, report function call + if name.startswith('arg.'): + loc = func_ir.loc + arg = name.split('.')[1] + fname = func_ir.func_id.func_qualname + tyname = 'list' if isinstance(ty, types.List) else 'set' + url = ("https://numba.readthedocs.io/en/stable/reference/" + "deprecation.html#deprecation-of-reflection-for-list-and" + "-set-types") + msg = ("\nEncountered the use of a type that is scheduled for " + "deprecation: type 'reflected %s' found for argument " + "'%s' of function '%s'.\n\nFor more information visit " + "%s" % (tyname, arg, fname, url)) + warnings.warn(NumbaPendingDeprecationWarning(msg, loc=loc)) + + +def resolve_func_from_module(func_ir, node): + """ + This returns the python function that is being getattr'd from a module in + some IR, it resolves import chains/submodules recursively. Should it not be + possible to find the python function being called None will be returned. + + func_ir - the FunctionIR object + node - the IR node from which to start resolving (should be a `getattr`). 
+ """ + getattr_chain = [] + def resolve_mod(mod): + if getattr(mod, 'op', False) == 'getattr': + getattr_chain.insert(0, mod.attr) + try: + mod = func_ir.get_definition(mod.value) + except KeyError: # multiple definitions + return None + return resolve_mod(mod) + elif isinstance(mod, (ir.Global, ir.FreeVar)): + if isinstance(mod.value, pytypes.ModuleType): + return mod + return None + + mod = resolve_mod(node) + if mod is not None: + defn = mod.value + for x in getattr_chain: + defn = getattr(defn, x, False) + if not defn: + break + else: + return defn + else: + return None + + +def enforce_no_dels(func_ir): + """ + Enforce there being no ir.Del nodes in the IR. + """ + for blk in func_ir.blocks.values(): + dels = [x for x in blk.find_insts(ir.Del)] + if dels: + msg = "Illegal IR, del found at: %s" % dels[0] + raise CompilerError(msg, loc=dels[0].loc) + +def enforce_no_phis(func_ir): + """ + Enforce there being no ir.Expr.phi nodes in the IR. + """ + for blk in func_ir.blocks.values(): + phis = [x for x in blk.find_exprs(op='phi')] + if phis: + msg = "Illegal IR, phi found at: %s" % phis[0] + raise CompilerError(msg, loc=phis[0].loc) + + +def legalize_single_scope(blocks): + """Check the given mapping of ir.Block for containing a single scope. + """ + return len({blk.scope for blk in blocks.values()}) == 1 + + +def check_and_legalize_ir(func_ir, flags: "numba.core.compiler.Flags"): + """ + This checks that the IR presented is legal + """ + enforce_no_phis(func_ir) + enforce_no_dels(func_ir) + # postprocess and emit ir.Dels + post_proc = postproc.PostProcessor(func_ir) + post_proc.run(True, extend_lifetimes=flags.dbg_extend_lifetimes) + +def convert_code_obj_to_function(code_obj, caller_ir): + """ + Converts a code object from a `make_function.code` attr in the IR into a + python function, caller_ir is the FunctionIR of the caller and is used for + the resolution of freevars. 
+ """ + fcode = code_obj.code + nfree = len(fcode.co_freevars) + + # try and resolve freevars if they are consts in the caller's IR + # these can be baked into the new function + freevars = [] + for x in fcode.co_freevars: + # not using guard here to differentiate between multiple definition and + # non-const variable + try: + freevar_def = caller_ir.get_definition(x) + except KeyError: + msg = ("Cannot capture a constant value for variable '%s' as there " + "are multiple definitions present." % x) + raise TypingError(msg, loc=code_obj.loc) + if isinstance(freevar_def, ir.Const): + freevars.append(freevar_def.value) + else: + msg = ("Cannot capture the non-constant value associated with " + "variable '%s' in a function that will escape." % x) + raise TypingError(msg, loc=code_obj.loc) + + func_env = "\n".join(["\tc_%d = %s" % (i, x) for i, x in enumerate(freevars)]) + func_clo = ",".join(["c_%d" % i for i in range(nfree)]) + co_varnames = list(fcode.co_varnames) + + # This is horrible. The code object knows about the number of args present + # it also knows the name of the args but these are bundled in with other + # vars in `co_varnames`. The make_function IR node knows what the defaults + # are, they are defined in the IR as consts. The following finds the total + # number of args (args + kwargs with defaults), finds the default values + # and infers the number of "kwargs with defaults" from this and then infers + # the number of actual arguments from that. 
+ n_kwargs = 0 + n_allargs = fcode.co_argcount + kwarg_defaults = caller_ir.get_definition(code_obj.defaults) + if kwarg_defaults is not None: + if isinstance(kwarg_defaults, tuple): + d = [caller_ir.get_definition(x).value for x in kwarg_defaults] + kwarg_defaults_tup = tuple(d) + else: + d = [caller_ir.get_definition(x).value + for x in kwarg_defaults.items] + kwarg_defaults_tup = tuple(d) + n_kwargs = len(kwarg_defaults_tup) + nargs = n_allargs - n_kwargs + + func_arg = ",".join(["%s" % (co_varnames[i]) for i in range(nargs)]) + if n_kwargs: + kw_const = ["%s = %s" % (co_varnames[i + nargs], kwarg_defaults_tup[i]) + for i in range(n_kwargs)] + func_arg += ", " + func_arg += ", ".join(kw_const) + + # globals are the same as those in the caller + glbls = caller_ir.func_id.func.__globals__ + + # create the function and return it + return _create_function_from_code_obj(fcode, func_env, func_arg, func_clo, + glbls) + + +def fixup_var_define_in_scope(blocks): + """Fixes the mapping of ir.Block to ensure all referenced ir.Var are + defined in every scope used by the function. Such that looking up a variable + from any scope in this function will not fail. + + Note: This is a workaround. Ideally, all the blocks should refer to the + same ir.Scope, but that property is not maintained by all the passes. + """ + # Scan for all used variables + used_var = {} + for blk in blocks.values(): + scope = blk.scope + for inst in blk.body: + for var in inst.list_vars(): + used_var[var] = inst + # Note: not all blocks share a single scope even though they should. + # Ensure the scope of each block defines all used variables. + for blk in blocks.values(): + scope = blk.scope + for var, inst in used_var.items(): + # add this variable if it's not in scope + if var.name not in scope.localvars: + # Note: using a internal method to reuse the same + scope.localvars.define(var.name, var) + + +def transfer_scope(block, scope): + """Transfer the ir.Block to use the given ir.Scope. 
+ """ + old_scope = block.scope + if old_scope is scope: + # bypass if the block is already using the given scope + return block + # Ensure variables are defined in the new scope + for var in old_scope.localvars._con.values(): + if var.name not in scope.localvars: + scope.localvars.define(var.name, var) + # replace scope + block.scope = scope + return block + + +def is_setup_with(stmt): + return isinstance(stmt, ir.EnterWith) + + +def is_terminator(stmt): + return isinstance(stmt, ir.Terminator) + + +def is_raise(stmt): + return isinstance(stmt, ir.Raise) + + +def is_return(stmt): + return isinstance(stmt, ir.Return) + + +def is_pop_block(stmt): + return isinstance(stmt, ir.PopBlock) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/itanium_mangler.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/itanium_mangler.py new file mode 100644 index 0000000000000000000000000000000000000000..963ee23183336e553e81e0efa85833a77f9df80d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/itanium_mangler.py @@ -0,0 +1,205 @@ +""" +Itanium CXX ABI Mangler + +Reference: http://mentorembedded.github.io/cxx-abi/abi.html + +The basics of the mangling scheme. + +We are hijacking the CXX mangling scheme for our use. We map Python modules +into CXX namespace. A `module1.submodule2.foo` is mapped to +`module1::submodule2::foo`. For parameterized numba types, we treat them as +templated types; for example, `array(int64, 1d, C)` becomes an +`array`. + +All mangled names are prefixed with "_Z". It is followed by the name of the +entity. A name contains one or more identifiers. Each identifier is encoded +as "". If the name is namespaced and, therefore, +has multiple identifiers, the entire name is encoded as "NE". + +For functions, arguments types follow. There are condensed encodings for basic +built-in types; e.g. "i" for int, "f" for float. For other types, the +previously mentioned name encoding should be used. 
+ +For templated types, the template parameters are encoded immediately after the +name. If it is namespaced, it should be within the 'N' 'E' marker. Template +parameters are encoded in "IE", where each parameter is encoded using +the mentioned name encoding scheme. Template parameters can contain literal +values like the '1' in the array type shown earlier. There is special encoding +scheme for them to avoid leading digits. +""" + + +import re + +from numba.core import types + + +# According the scheme, valid characters for mangled names are [a-zA-Z0-9_]. +# We borrow the '_' as the escape character to encode invalid char into +# '_xx' where 'xx' is the hex codepoint. +_re_invalid_char = re.compile(r'[^a-z0-9_]', re.I) + +PREFIX = "_Z" + +# Numba types to mangled type code. These correspond with the codes listed in +# https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling-builtin +N2CODE = { + types.void: 'v', + types.boolean: 'b', + types.uint8: 'h', + types.int8: 'a', + types.uint16: 't', + types.int16: 's', + types.uint32: 'j', + types.int32: 'i', + types.uint64: 'y', + types.int64: 'x', + types.float16: 'Dh', + types.float32: 'f', + types.float64: 'd' +} + + +def _escape_string(text): + """Escape the given string so that it only contains ASCII characters + of [a-zA-Z0-9_$]. + + The dollar symbol ($) and other invalid characters are escaped into + the string sequence of "$xx" where "xx" is the hex codepoint of the char. + + Multibyte characters are encoded into utf8 and converted into the above + hex format. 
+ """ + + def repl(m): + return ''.join(('_%02x' % ch) + for ch in m.group(0).encode('utf8')) + ret = re.sub(_re_invalid_char, repl, text) + # Return str if we got a unicode (for py2) + if not isinstance(ret, str): + return ret.encode('ascii') + return ret + + +def _fix_lead_digit(text): + """ + Fix text with leading digit + """ + if text and text[0].isdigit(): + return '_' + text + else: + return text + + +def _len_encoded(string): + """ + Prefix string with digit indicating the length. + Add underscore if string is prefixed with digits. + """ + string = _fix_lead_digit(string) + return '%u%s' % (len(string), string) + + +def mangle_abi_tag(abi_tag: str) -> str: + return "B" + _len_encoded(_escape_string(abi_tag)) + + +def mangle_identifier(ident, template_params='', *, abi_tags=(), uid=None): + """ + Mangle the identifier with optional template parameters and abi_tags. + + Note: + + This treats '.' as '::' in C++. + """ + if uid is not None: + # Add uid to abi-tags + abi_tags = (f"v{uid}", *abi_tags) + parts = [_len_encoded(_escape_string(x)) for x in ident.split('.')] + enc_abi_tags = list(map(mangle_abi_tag, abi_tags)) + extras = template_params + ''.join(enc_abi_tags) + if len(parts) > 1: + return 'N%s%sE' % (''.join(parts), extras) + else: + return '%s%s' % (parts[0], extras) + + +def mangle_type_or_value(typ): + """ + Mangle type parameter and arbitrary value. + """ + # Handle numba types + if isinstance(typ, types.Type): + if typ in N2CODE: + return N2CODE[typ] + else: + return mangle_templated_ident(*typ.mangling_args) + # Handle integer literal + elif isinstance(typ, int): + return 'Li%dE' % typ + # Handle str as identifier + elif isinstance(typ, str): + return mangle_identifier(typ) + # Otherwise + else: + enc = _escape_string(str(typ)) + return _len_encoded(enc) + + +# Alias +mangle_type = mangle_type_or_value +mangle_value = mangle_type_or_value + + +def mangle_templated_ident(identifier, parameters): + """ + Mangle templated identifier. 
+ """ + template_params = ('I%sE' % ''.join(map(mangle_type_or_value, parameters)) + if parameters else '') + return mangle_identifier(identifier, template_params) + + +def mangle_args(argtys): + """ + Mangle sequence of Numba type objects and arbitrary values. + """ + return ''.join([mangle_type_or_value(t) for t in argtys]) + + +def mangle(ident, argtys, *, abi_tags=(), uid=None): + """ + Mangle identifier with Numba type objects and abi-tags. + """ + return ''.join([PREFIX, + mangle_identifier(ident, abi_tags=abi_tags, uid=uid), + mangle_args(argtys)]) + + +def prepend_namespace(mangled, ns): + """ + Prepend namespace to mangled name. + """ + if not mangled.startswith(PREFIX): + raise ValueError('input is not a mangled name') + elif mangled.startswith(PREFIX + 'N'): + # nested + remaining = mangled[3:] + ret = PREFIX + 'N' + mangle_identifier(ns) + remaining + else: + # non-nested + remaining = mangled[2:] + head, tail = _split_mangled_ident(remaining) + ret = PREFIX + 'N' + mangle_identifier(ns) + head + 'E' + tail + return ret + + +def _split_mangled_ident(mangled): + """ + Returns `(head, tail)` where `head` is the ` + ` encoded + identifier and `tail` is the remaining. 
+ """ + ct = int(mangled) + ctlen = len(str(ct)) + at = ctlen + ct + return mangled[:at], mangled[at:] diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/llvm_bindings.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/llvm_bindings.py new file mode 100644 index 0000000000000000000000000000000000000000..fa94ad428b45088e6e9b685ac2766234e9c6cc06 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/llvm_bindings.py @@ -0,0 +1,46 @@ +""" +Useful options to debug LLVM passes + +llvm.set_option("test", "-debug-pass=Details") +llvm.set_option("test", "-debug-pass=Executions") +llvm.set_option("test", "-debug-pass=Arguments") +llvm.set_option("test", "-debug-pass=Structure") +llvm.set_option("test", "-debug-only=loop-vectorize") +llvm.set_option("test", "-help-hidden") + +""" + +from llvmlite import binding as llvm + + +def _inlining_threshold(optlevel, sizelevel=0): + """ + Compute the inlining threshold for the desired optimisation level + + Refer to http://llvm.org/docs/doxygen/html/InlineSimple_8cpp_source.html + """ + if optlevel > 2: + return 275 + + # -Os + if sizelevel == 1: + return 75 + + # -Oz + if sizelevel == 2: + return 25 + + return 225 + + +def create_pass_manager_builder(opt=2, loop_vectorize=False, + slp_vectorize=False): + """ + Create an LLVM pass manager with the desired optimisation level and options. 
+ """ + pmb = llvm.create_pass_manager_builder() + pmb.opt_level = opt + pmb.loop_vectorize = loop_vectorize + pmb.slp_vectorize = slp_vectorize + pmb.inlining_threshold = _inlining_threshold(opt) + return pmb diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/lowering.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/lowering.py new file mode 100644 index 0000000000000000000000000000000000000000..84ce3514cfea658317a2d4f19948a9a689dfbbf1 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/lowering.py @@ -0,0 +1,1536 @@ +from collections import namedtuple, defaultdict +import operator +import warnings +from functools import partial + +import llvmlite.ir +from llvmlite.ir import Constant, IRBuilder + +from numba.core import (typing, utils, types, ir, debuginfo, funcdesc, + generators, config, ir_utils, cgutils, removerefctpass, + targetconfig) +from numba.core.errors import (LoweringError, new_error_context, TypingError, + LiteralTypingError, UnsupportedError, + NumbaDebugInfoWarning) +from numba.core.funcdesc import default_mangler +from numba.core.environment import Environment +from numba.core.analysis import compute_use_defs, must_use_alloca +from numba.misc.firstlinefinder import get_func_body_first_lineno + + +_VarArgItem = namedtuple("_VarArgItem", ("vararg", "index")) + + +class BaseLower(object): + """ + Lower IR to LLVM + """ + + def __init__(self, context, library, fndesc, func_ir, metadata=None): + self.library = library + self.fndesc = fndesc + self.blocks = utils.SortedMap(func_ir.blocks.items()) + self.func_ir = func_ir + self.call_conv = context.call_conv + self.generator_info = func_ir.generator_info + self.metadata = metadata + self.flags = targetconfig.ConfigStack.top_or_none() + + # Initialize LLVM + self.module = self.library.create_ir_module(self.fndesc.unique_name) + + # Python execution environment (will be available to the compiled + # function). 
+ self.env = Environment.from_fndesc(self.fndesc) + + # Internal states + self.blkmap = {} + self.pending_phis = {} + self.varmap = {} + self.firstblk = min(self.blocks.keys()) + self.loc = -1 + + # Specializes the target context as seen inside the Lowerer + # This adds: + # - environment: the python execution environment + self.context = context.subtarget(environment=self.env, + fndesc=self.fndesc) + + # Debuginfo + dibuildercls = (self.context.DIBuilder + if self.context.enable_debuginfo + else debuginfo.DummyDIBuilder) + + # debuginfo def location + self.defn_loc = self._compute_def_location() + + self.debuginfo = dibuildercls(module=self.module, + filepath=func_ir.loc.filename, + cgctx=context) + + # Subclass initialization + self.init() + + def init(self): + pass + + def init_pyapi(self): + """ + Init the Python API and Environment Manager for the function being + lowered. + """ + if self.pyapi is not None: + return + self.pyapi = self.context.get_python_api(self.builder) + + # Store environment argument for later use + self.env_manager = self.context.get_env_manager(self.builder) + self.env_body = self.env_manager.env_body + self.envarg = self.env_manager.env_ptr + + def _compute_def_location(self): + # Debuginfo requires source to be accurate. Find it and warn if not + # found. If it's not found, use the func_ir line + 1, this assumes that + # the function definition is decorated with a 1 line jit decorator. + defn_loc = self.func_ir.loc.with_lineno(self.func_ir.loc.line + 1) + if self.context.enable_debuginfo: + fn = self.func_ir.func_id.func + optional_lno = get_func_body_first_lineno(fn) + if optional_lno is not None: + # -1 as lines start at 1 and this is an offset. + offset = optional_lno - 1 + defn_loc = self.func_ir.loc.with_lineno(offset) + else: + msg = ("Could not find source for function: " + f"{self.func_ir.func_id.func}. 
Debug line information " + "may be inaccurate.") + warnings.warn(NumbaDebugInfoWarning(msg)) + return defn_loc + + def pre_lower(self): + """ + Called before lowering all blocks. + """ + # A given Lower object can be used for several LL functions + # (for generators) and it's important to use a new API and + # EnvironmentManager. + self.pyapi = None + self.debuginfo.mark_subprogram(function=self.builder.function, + qualname=self.fndesc.qualname, + argnames=self.fndesc.args, + argtypes=self.fndesc.argtypes, + line=self.defn_loc.line) + + def post_lower(self): + """ + Called after all blocks are lowered + """ + self.debuginfo.finalize() + + def pre_block(self, block): + """ + Called before lowering a block. + """ + + def post_block(self, block): + """ + Called after lowering a block. + """ + + def return_exception(self, exc_class, exc_args=None, loc=None): + """Propagate exception to the caller. + """ + self.call_conv.return_user_exc( + self.builder, exc_class, exc_args, + loc=loc, func_name=self.func_ir.func_id.func_name, + ) + + def set_exception(self, exc_class, exc_args=None, loc=None): + """Set exception state in the current function. + """ + self.call_conv.set_static_user_exc( + self.builder, exc_class, exc_args, + loc=loc, func_name=self.func_ir.func_id.func_name, + ) + + def emit_environment_object(self): + """Emit a pointer to hold the Environment object. 
+ """ + # Define global for the environment and initialize it to NULL + envname = self.context.get_env_name(self.fndesc) + self.context.declare_env_global(self.module, envname) + + def lower(self): + # Emit the Env into the module + self.emit_environment_object() + if self.generator_info is None: + self.genlower = None + self.lower_normal_function(self.fndesc) + else: + self.genlower = self.GeneratorLower(self) + self.gentype = self.genlower.gentype + + self.genlower.lower_init_func(self) + self.genlower.lower_next_func(self) + if self.gentype.has_finalizer: + self.genlower.lower_finalize_func(self) + + if config.DUMP_LLVM: + print(("LLVM DUMP %s" % self.fndesc).center(80, '-')) + if config.HIGHLIGHT_DUMPS: + try: + from pygments import highlight + from pygments.lexers import LlvmLexer as lexer + from pygments.formatters import Terminal256Formatter + from numba.misc.dump_style import by_colorscheme + print(highlight(self.module.__repr__(), lexer(), + Terminal256Formatter( + style=by_colorscheme()))) + except ImportError: + msg = "Please install pygments to see highlighted dumps" + raise ValueError(msg) + else: + print(self.module) + print('=' * 80) + + # Special optimization to remove NRT on functions that do not need it. + if self.context.enable_nrt and self.generator_info is None: + removerefctpass.remove_unnecessary_nrt_usage(self.function, + context=self.context, + fndesc=self.fndesc) + + # Run target specific post lowering transformation + self.context.post_lowering(self.module, self.library) + + # Materialize LLVM Module + self.library.add_ir_module(self.module) + + def extract_function_arguments(self): + self.fnargs = self.call_conv.decode_arguments(self.builder, + self.fndesc.argtypes, + self.function) + return self.fnargs + + def lower_normal_function(self, fndesc): + """ + Lower non-generator *fndesc*. 
+ """ + self.setup_function(fndesc) + + # Init argument values + self.extract_function_arguments() + entry_block_tail = self.lower_function_body() + + # Close tail of entry block, do not emit debug metadata else the + # unconditional jump gets associated with the metadata from the function + # body end. + with debuginfo.suspend_emission(self.builder): + self.builder.position_at_end(entry_block_tail) + self.builder.branch(self.blkmap[self.firstblk]) + + def lower_function_body(self): + """ + Lower the current function's body, and return the entry block. + """ + # Init Python blocks + for offset in self.blocks: + bname = "B%s" % offset + self.blkmap[offset] = self.function.append_basic_block(bname) + + self.pre_lower() + # pre_lower() may have changed the current basic block + entry_block_tail = self.builder.basic_block + + self.debug_print("# function begin: {0}".format( + self.fndesc.unique_name)) + + # Lower all blocks + for offset, block in sorted(self.blocks.items()): + bb = self.blkmap[offset] + self.builder.position_at_end(bb) + self.lower_block(block) + self.post_lower() + return entry_block_tail + + def lower_block(self, block): + """ + Lower the given block. + """ + self.pre_block(block) + for inst in block.body: + self.loc = inst.loc + defaulterrcls = partial(LoweringError, loc=self.loc) + with new_error_context('lowering "{inst}" at {loc}', inst=inst, + loc=self.loc, errcls_=defaulterrcls): + self.lower_inst(inst) + self.post_block(block) + + def create_cpython_wrapper(self, release_gil=False): + """ + Create CPython wrapper(s) around this function (or generator). + """ + if self.genlower: + self.context.create_cpython_wrapper(self.library, + self.genlower.gendesc, + self.env, self.call_helper, + release_gil=release_gil) + self.context.create_cpython_wrapper(self.library, self.fndesc, + self.env, self.call_helper, + release_gil=release_gil) + + def create_cfunc_wrapper(self): + """ + Create C wrapper around this function. 
+ """ + if self.genlower: + raise UnsupportedError('generator as a first-class function type') + self.context.create_cfunc_wrapper(self.library, self.fndesc, + self.env, self.call_helper) + + def setup_function(self, fndesc): + # Setup function + self.function = self.context.declare_function(self.module, fndesc) + if self.flags.dbg_optnone: + attrset = self.function.attributes + if "alwaysinline" not in attrset: + attrset.add("optnone") + attrset.add("noinline") + self.entry_block = self.function.append_basic_block('entry') + self.builder = IRBuilder(self.entry_block) + self.call_helper = self.call_conv.init_call_helper(self.builder) + + def typeof(self, varname): + return self.fndesc.typemap[varname] + + def debug_print(self, msg): + if config.DEBUG_JIT: + self.context.debug_print(self.builder, "DEBUGJIT: {0}".format(msg)) + + def print_variable(self, msg, varname): + """Helper to emit ``print(msg, varname)`` for debugging. + + Parameters + ---------- + msg : str + Literal string to be printed. + varname : str + A variable name whose value will be printed. + """ + argtys = ( + types.literal(msg), + self.fndesc.typemap[varname] + ) + args = ( + self.context.get_dummy_value(), + self.loadvar(varname), + ) + sig = typing.signature(types.none, *argtys) + + impl = self.context.get_function(print, sig) + impl(self.builder, args) + + +class Lower(BaseLower): + GeneratorLower = generators.GeneratorLower + + def init(self): + super().init() + # find all singly assigned variables + self._find_singly_assigned_variable() + + @property + def _disable_sroa_like_opt(self): + """Flags that the SROA like optimisation that Numba performs (which + prevent alloca and subsequent load/store for locals) should be disabled. 
+ Currently, this is conditional solely on the presence of a request for + the emission of debug information.""" + return False if self.flags is None else self.flags.debuginfo + + def _find_singly_assigned_variable(self): + func_ir = self.func_ir + blocks = func_ir.blocks + + sav = set() + + if not self.func_ir.func_id.is_generator: + use_defs = compute_use_defs(blocks) + alloca_vars = must_use_alloca(blocks) + + # Compute where variables are defined + var_assign_map = defaultdict(set) + for blk, vl in use_defs.defmap.items(): + for var in vl: + var_assign_map[var].add(blk) + + # Compute where variables are used + var_use_map = defaultdict(set) + for blk, vl in use_defs.usemap.items(): + for var in vl: + var_use_map[var].add(blk) + + # Keep only variables that are defined locally and used locally + for var in var_assign_map: + if var not in alloca_vars and len(var_assign_map[var]) == 1: + # Usemap does not keep locally defined variables. + if len(var_use_map[var]) == 0: + # Ensure that the variable is not defined multiple times + # in the block + [defblk] = var_assign_map[var] + assign_stmts = self.blocks[defblk].find_insts(ir.Assign) + assigns = [stmt for stmt in assign_stmts + if stmt.target.name == var] + if len(assigns) == 1: + sav.add(var) + + self._singly_assigned_vars = sav + self._blk_local_varmap = {} + + def pre_block(self, block): + from numba.core.unsafe import eh + + super(Lower, self).pre_block(block) + self._cur_ir_block = block + + if block == self.firstblk: + # create slots for all the vars, irrespective of whether they are + # initialized, SSA will pick this up and warn users about using + # uninitialized variables. 
Slots are added as alloca in the first + # block + bb = self.blkmap[self.firstblk] + self.builder.position_at_end(bb) + all_names = set() + for block in self.blocks.values(): + for x in block.find_insts(ir.Del): + if x.value not in all_names: + all_names.add(x.value) + for name in all_names: + fetype = self.typeof(name) + self._alloca_var(name, fetype) + + # Detect if we are in a TRY block by looking for a call to + # `eh.exception_check`. + for call in block.find_exprs(op='call'): + defn = ir_utils.guard( + ir_utils.get_definition, self.func_ir, call.func, + ) + if defn is not None and isinstance(defn, ir.Global): + if defn.value is eh.exception_check: + if isinstance(block.terminator, ir.Branch): + targetblk = self.blkmap[block.terminator.truebr] + # NOTE: This hacks in an attribute for call_conv to + # pick up. This hack is no longer needed when + # all old-style implementations are gone. + self.builder._in_try_block = {'target': targetblk} + break + + def post_block(self, block): + # Clean-up + try: + del self.builder._in_try_block + except AttributeError: + pass + + def lower_inst(self, inst): + # Set debug location for all subsequent LL instructions + self.debuginfo.mark_location(self.builder, self.loc.line) + self.debug_print(str(inst)) + if isinstance(inst, ir.Assign): + ty = self.typeof(inst.target.name) + val = self.lower_assign(ty, inst) + argidx = None + # If this is a store from an arg, like x = arg.x then tell debuginfo + # that this is the arg + if isinstance(inst.value, ir.Arg): + # NOTE: debug location is the `def ` line + self.debuginfo.mark_location(self.builder, self.defn_loc.line) + argidx = inst.value.index + 1 # args start at 1 + self.storevar(val, inst.target.name, argidx=argidx) + + elif isinstance(inst, ir.Branch): + cond = self.loadvar(inst.cond.name) + tr = self.blkmap[inst.truebr] + fl = self.blkmap[inst.falsebr] + + condty = self.typeof(inst.cond.name) + pred = self.context.cast(self.builder, cond, condty, types.boolean) + assert 
pred.type == llvmlite.ir.IntType(1),\ + ("cond is not i1: %s" % pred.type) + self.builder.cbranch(pred, tr, fl) + + elif isinstance(inst, ir.Jump): + target = self.blkmap[inst.target] + self.builder.branch(target) + + elif isinstance(inst, ir.Return): + if self.generator_info: + # StopIteration + self.genlower.return_from_generator(self) + return + val = self.loadvar(inst.value.name) + oty = self.typeof(inst.value.name) + ty = self.fndesc.restype + if isinstance(ty, types.Optional): + # If returning an optional type + self.call_conv.return_optional_value(self.builder, ty, oty, val) + return + assert ty == oty, ( + "type '{}' does not match return type '{}'".format(oty, ty)) + retval = self.context.get_return_value(self.builder, ty, val) + self.call_conv.return_value(self.builder, retval) + + elif isinstance(inst, ir.PopBlock): + pass # this is just a marker + + elif isinstance(inst, ir.StaticSetItem): + signature = self.fndesc.calltypes[inst] + assert signature is not None + try: + impl = self.context.get_function('static_setitem', signature) + except NotImplementedError: + return self.lower_setitem(inst.target, inst.index_var, + inst.value, signature) + else: + target = self.loadvar(inst.target.name) + value = self.loadvar(inst.value.name) + valuety = self.typeof(inst.value.name) + value = self.context.cast(self.builder, value, valuety, + signature.args[2]) + return impl(self.builder, (target, inst.index, value)) + + elif isinstance(inst, ir.Print): + self.lower_print(inst) + + elif isinstance(inst, ir.SetItem): + signature = self.fndesc.calltypes[inst] + assert signature is not None + return self.lower_setitem(inst.target, inst.index, inst.value, + signature) + + elif isinstance(inst, ir.StoreMap): + signature = self.fndesc.calltypes[inst] + assert signature is not None + return self.lower_setitem(inst.dct, inst.key, inst.value, signature) + + elif isinstance(inst, ir.DelItem): + target = self.loadvar(inst.target.name) + index = self.loadvar(inst.index.name) + + 
targetty = self.typeof(inst.target.name) + indexty = self.typeof(inst.index.name) + + signature = self.fndesc.calltypes[inst] + assert signature is not None + + op = operator.delitem + fnop = self.context.typing_context.resolve_value_type(op) + callsig = fnop.get_call_type( + self.context.typing_context, signature.args, {}, + ) + impl = self.context.get_function(fnop, callsig) + + assert targetty == signature.args[0] + index = self.context.cast(self.builder, index, indexty, + signature.args[1]) + + return impl(self.builder, (target, index)) + + elif isinstance(inst, ir.Del): + self.delvar(inst.value) + + elif isinstance(inst, ir.SetAttr): + target = self.loadvar(inst.target.name) + value = self.loadvar(inst.value.name) + signature = self.fndesc.calltypes[inst] + + targetty = self.typeof(inst.target.name) + valuety = self.typeof(inst.value.name) + assert signature is not None + assert signature.args[0] == targetty + impl = self.context.get_setattr(inst.attr, signature) + + # Convert argument to match + value = self.context.cast(self.builder, value, valuety, + signature.args[1]) + + return impl(self.builder, (target, value)) + + elif isinstance(inst, ir.StaticRaise): + self.lower_static_raise(inst) + + elif isinstance(inst, ir.StaticTryRaise): + self.lower_static_try_raise(inst) + + else: + if hasattr(self.context, "lower_extensions"): + for _class, func in self.context.lower_extensions.items(): + if isinstance(inst, _class): + func(self, inst) + return + raise NotImplementedError(type(inst)) + + def lower_setitem(self, target_var, index_var, value_var, signature): + target = self.loadvar(target_var.name) + value = self.loadvar(value_var.name) + index = self.loadvar(index_var.name) + + targetty = self.typeof(target_var.name) + valuety = self.typeof(value_var.name) + indexty = self.typeof(index_var.name) + + op = operator.setitem + fnop = self.context.typing_context.resolve_value_type(op) + callsig = fnop.get_call_type( + self.context.typing_context, signature.args, 
{}, + ) + impl = self.context.get_function(fnop, callsig) + + # Convert argument to match + if isinstance(targetty, types.Optional): + target = self.context.cast(self.builder, target, targetty, + targetty.type) + else: + ul = types.unliteral + assert ul(targetty) == ul(signature.args[0]) + + index = self.context.cast(self.builder, index, indexty, + signature.args[1]) + value = self.context.cast(self.builder, value, valuety, + signature.args[2]) + + return impl(self.builder, (target, index, value)) + + def lower_static_raise(self, inst): + if inst.exc_class is None: + # Reraise + self.return_exception(None, loc=self.loc) + else: + self.return_exception(inst.exc_class, inst.exc_args, loc=self.loc) + + def lower_static_try_raise(self, inst): + if inst.exc_class is None: + # Reraise + self.set_exception(None, loc=self.loc) + else: + self.set_exception(inst.exc_class, inst.exc_args, loc=self.loc) + + def lower_assign(self, ty, inst): + value = inst.value + # In nopython mode, closure vars are frozen like globals + if isinstance(value, (ir.Const, ir.Global, ir.FreeVar)): + res = self.context.get_constant_generic(self.builder, ty, + value.value) + self.incref(ty, res) + return res + + elif isinstance(value, ir.Expr): + return self.lower_expr(ty, value) + + elif isinstance(value, ir.Var): + val = self.loadvar(value.name) + oty = self.typeof(value.name) + res = self.context.cast(self.builder, val, oty, ty) + self.incref(ty, res) + return res + + elif isinstance(value, ir.Arg): + # Suspend debug info else all the arg repacking ends up being + # associated with some line or other and it's actually just a detail + # of Numba's CC. + with debuginfo.suspend_emission(self.builder): + # Cast from the argument type to the local variable type + # (note the "arg.FOO" convention as used in typeinfer) + argty = self.typeof("arg." 
+ value.name) + if isinstance(argty, types.Omitted): + pyval = argty.value + tyctx = self.context.typing_context + valty = tyctx.resolve_value_type_prefer_literal(pyval) + # use the type of the constant value + const = self.context.get_constant_generic( + self.builder, valty, pyval, + ) + # cast it to the variable type + res = self.context.cast(self.builder, const, valty, ty) + else: + val = self.fnargs[value.index] + res = self.context.cast(self.builder, val, argty, ty) + self.incref(ty, res) + return res + + elif isinstance(value, ir.Yield): + res = self.lower_yield(ty, value) + self.incref(ty, res) + return res + + raise NotImplementedError(type(value), value) + + def lower_yield(self, retty, inst): + yp = self.generator_info.yield_points[inst.index] + assert yp.inst is inst + y = generators.LowerYield(self, yp, yp.live_vars) + y.lower_yield_suspend() + # Yield to caller + val = self.loadvar(inst.value.name) + typ = self.typeof(inst.value.name) + actual_rettyp = self.gentype.yield_type + + # cast the local val to the type yielded + yret = self.context.cast(self.builder, val, typ, actual_rettyp) + + # get the return repr of yielded value + retval = self.context.get_return_value( + self.builder, actual_rettyp, yret, + ) + + # return + self.call_conv.return_value(self.builder, retval) + + # Resumption point + y.lower_yield_resume() + # None is returned by the yield expression + return self.context.get_constant_generic(self.builder, retty, None) + + def lower_binop(self, resty, expr, op): + # if op in utils.OPERATORS_TO_BUILTINS: + # map operator.the_op => the corresponding types.Function() + # TODO: is this looks dodgy ... 
+ op = self.context.typing_context.resolve_value_type(op) + + lhs = expr.lhs + rhs = expr.rhs + static_lhs = expr.static_lhs + static_rhs = expr.static_rhs + lty = self.typeof(lhs.name) + rty = self.typeof(rhs.name) + lhs = self.loadvar(lhs.name) + rhs = self.loadvar(rhs.name) + + # Convert argument to match + signature = self.fndesc.calltypes[expr] + lhs = self.context.cast(self.builder, lhs, lty, signature.args[0]) + rhs = self.context.cast(self.builder, rhs, rty, signature.args[1]) + + def cast_result(res): + return self.context.cast(self.builder, res, + signature.return_type, resty) + + # First try with static operands, if known + def try_static_impl(tys, args): + if any(a is ir.UNDEFINED for a in args): + return None + try: + if isinstance(op, types.Function): + static_sig = op.get_call_type(self.context.typing_context, + tys, {}) + else: + static_sig = typing.signature(signature.return_type, *tys) + except TypingError: + return None + try: + static_impl = self.context.get_function(op, static_sig) + return static_impl(self.builder, args) + except NotImplementedError: + return None + + res = try_static_impl( + (_lit_or_omitted(static_lhs), _lit_or_omitted(static_rhs)), + (static_lhs, static_rhs), + ) + if res is not None: + return cast_result(res) + + res = try_static_impl( + (_lit_or_omitted(static_lhs), rty), + (static_lhs, rhs), + ) + if res is not None: + return cast_result(res) + + res = try_static_impl( + (lty, _lit_or_omitted(static_rhs)), + (lhs, static_rhs), + ) + if res is not None: + return cast_result(res) + + # Normal implementation for generic arguments + + sig = op.get_call_type(self.context.typing_context, signature.args, {}) + impl = self.context.get_function(op, sig) + res = impl(self.builder, (lhs, rhs)) + return cast_result(res) + + def lower_getitem(self, resty, expr, value, index, signature): + baseval = self.loadvar(value.name) + indexval = self.loadvar(index.name) + # Get implementation of getitem + op = operator.getitem + fnop = 
self.context.typing_context.resolve_value_type(op) + callsig = fnop.get_call_type( + self.context.typing_context, signature.args, {}, + ) + impl = self.context.get_function(fnop, callsig) + + argvals = (baseval, indexval) + argtyps = (self.typeof(value.name), + self.typeof(index.name)) + castvals = [self.context.cast(self.builder, av, at, ft) + for av, at, ft in zip(argvals, argtyps, + signature.args)] + res = impl(self.builder, castvals) + return self.context.cast(self.builder, res, + signature.return_type, + resty) + + def _cast_var(self, var, ty): + """ + Cast a Numba IR variable to the given Numba type, returning a + low-level value. + """ + if isinstance(var, _VarArgItem): + varty = self.typeof(var.vararg.name)[var.index] + val = self.builder.extract_value(self.loadvar(var.vararg.name), + var.index) + else: + varty = self.typeof(var.name) + val = self.loadvar(var.name) + return self.context.cast(self.builder, val, varty, ty) + + def fold_call_args(self, fnty, signature, pos_args, vararg, kw_args): + if vararg: + # Inject *args from function call + # The lowering will be done in _cast_var() above. 
+ tp_vararg = self.typeof(vararg.name) + assert isinstance(tp_vararg, types.BaseTuple) + pos_args = pos_args + [_VarArgItem(vararg, i) + for i in range(len(tp_vararg))] + + # Fold keyword arguments and resolve default argument values + pysig = signature.pysig + if pysig is None: + if kw_args: + raise NotImplementedError("unsupported keyword arguments " + "when calling %s" % (fnty,)) + argvals = [self._cast_var(var, sigty) + for var, sigty in zip(pos_args, signature.args)] + else: + def normal_handler(index, param, var): + return self._cast_var(var, signature.args[index]) + + def default_handler(index, param, default): + return self.context.get_constant_generic( + self.builder, signature.args[index], default) + + def stararg_handler(index, param, vars): + stararg_ty = signature.args[index] + assert isinstance(stararg_ty, types.BaseTuple), stararg_ty + values = [self._cast_var(var, sigty) + for var, sigty in zip(vars, stararg_ty)] + return cgutils.make_anonymous_struct(self.builder, values) + + argvals = typing.fold_arguments(pysig, + pos_args, dict(kw_args), + normal_handler, + default_handler, + stararg_handler) + return argvals + + def lower_print(self, inst): + """ + Lower a ir.Print() + """ + # We handle this, as far as possible, as a normal call to built-in + # print(). This will make it easy to undo the special ir.Print + # rewrite when it becomes unnecessary (e.g. when we have native + # strings). 
+ sig = self.fndesc.calltypes[inst] + assert sig.return_type == types.none + fnty = self.context.typing_context.resolve_value_type(print) + + # Fix the call signature to inject any constant-inferred + # string argument + pos_tys = list(sig.args) + pos_args = list(inst.args) + for i in range(len(pos_args)): + if i in inst.consts: + pyval = inst.consts[i] + if isinstance(pyval, str): + pos_tys[i] = types.literal(pyval) + + fixed_sig = typing.signature(sig.return_type, *pos_tys) + fixed_sig = fixed_sig.replace(pysig=sig.pysig) + + argvals = self.fold_call_args(fnty, sig, pos_args, inst.vararg, {}) + impl = self.context.get_function(print, fixed_sig) + impl(self.builder, argvals) + + def lower_call(self, resty, expr): + signature = self.fndesc.calltypes[expr] + self.debug_print("# lower_call: expr = {0}".format(expr)) + if isinstance(signature.return_type, types.Phantom): + return self.context.get_dummy_value() + + fnty = self.typeof(expr.func.name) + + if isinstance(fnty, types.ObjModeDispatcher): + res = self._lower_call_ObjModeDispatcher(fnty, expr, signature) + + elif isinstance(fnty, types.ExternalFunction): + res = self._lower_call_ExternalFunction(fnty, expr, signature) + + elif isinstance(fnty, types.ExternalFunctionPointer): + res = self._lower_call_ExternalFunctionPointer( + fnty, expr, signature) + + elif isinstance(fnty, types.RecursiveCall): + res = self._lower_call_RecursiveCall(fnty, expr, signature) + + elif isinstance(fnty, types.FunctionType): + res = self._lower_call_FunctionType(fnty, expr, signature) + + else: + res = self._lower_call_normal(fnty, expr, signature) + + # If lowering the call returned None, interpret that as returning dummy + # value if the return type of the function is void, otherwise there is + # a problem + if res is None: + if signature.return_type == types.void: + res = self.context.get_dummy_value() + else: + raise LoweringError( + msg="non-void function returns None from implementation", + loc=self.loc + ) + + return 
self.context.cast(self.builder, res, signature.return_type, + resty) + + def _lower_call_ObjModeDispatcher(self, fnty, expr, signature): + from numba.core.pythonapi import ObjModeUtils + + self.init_pyapi() + # Acquire the GIL + gil_state = self.pyapi.gil_ensure() + # Fix types + argnames = [a.name for a in expr.args] + argtypes = [self.typeof(a) for a in argnames] + argvalues = [self.loadvar(a) for a in argnames] + for v, ty in zip(argvalues, argtypes): + # Because .from_native_value steal the reference + self.incref(ty, v) + + argobjs = [self.pyapi.from_native_value(atyp, aval, + self.env_manager) + for atyp, aval in zip(argtypes, argvalues)] + + # Load objmode dispatcher + callee = ObjModeUtils(self.pyapi).load_dispatcher(fnty, argtypes) + # Make Call + ret_obj = self.pyapi.call_function_objargs(callee, argobjs) + has_exception = cgutils.is_null(self.builder, ret_obj) + with self. builder.if_else(has_exception) as (then, orelse): + # Handles exception + # This branch must exit the function + with then: + # Clean arg + for obj in argobjs: + self.pyapi.decref(obj) + + # Release the GIL + self.pyapi.gil_release(gil_state) + + # Return and signal exception + self.call_conv.return_exc(self.builder) + + # Handles normal return + with orelse: + # Fix output value + native = self.pyapi.to_native_value( + fnty.dispatcher.output_types, + ret_obj, + ) + output = native.value + + # Release objs + self.pyapi.decref(ret_obj) + for obj in argobjs: + self.pyapi.decref(obj) + + # cleanup output + if callable(native.cleanup): + native.cleanup() + + # Release the GIL + self.pyapi.gil_release(gil_state) + + # Error during unboxing + with self.builder.if_then(native.is_error): + self.call_conv.return_exc(self.builder) + + return output + + def _lower_call_ExternalFunction(self, fnty, expr, signature): + # Handle a named external function + self.debug_print("# external function") + argvals = self.fold_call_args( + fnty, signature, expr.args, expr.vararg, expr.kws, + ) + fndesc = 
funcdesc.ExternalFunctionDescriptor( + fnty.symbol, fnty.sig.return_type, fnty.sig.args) + func = self.context.declare_external_function( + self.builder.module, fndesc) + return self.context.call_external_function( + self.builder, func, fndesc.argtypes, argvals, + ) + + def _lower_call_ExternalFunctionPointer(self, fnty, expr, signature): + # Handle a C function pointer + self.debug_print("# calling external function pointer") + argvals = self.fold_call_args( + fnty, signature, expr.args, expr.vararg, expr.kws, + ) + pointer = self.loadvar(expr.func.name) + # If the external function pointer uses libpython + if fnty.requires_gil: + self.init_pyapi() + # Acquire the GIL + gil_state = self.pyapi.gil_ensure() + # Make PyObjects + newargvals = [] + pyvals = [] + for exptyp, gottyp, aval in zip(fnty.sig.args, signature.args, + argvals): + # Adjust argument values to pyobjects + if exptyp == types.ffi_forced_object: + self.incref(gottyp, aval) + obj = self.pyapi.from_native_value( + gottyp, aval, self.env_manager, + ) + newargvals.append(obj) + pyvals.append(obj) + else: + newargvals.append(aval) + + # Call external function + res = self.context.call_function_pointer( + self.builder, pointer, newargvals, fnty.cconv, + ) + # Release PyObjects + for obj in pyvals: + self.pyapi.decref(obj) + + # Release the GIL + self.pyapi.gil_release(gil_state) + # If the external function pointer does NOT use libpython + else: + res = self.context.call_function_pointer( + self.builder, pointer, argvals, fnty.cconv, + ) + return res + + def _lower_call_RecursiveCall(self, fnty, expr, signature): + # Recursive call + argvals = self.fold_call_args( + fnty, signature, expr.args, expr.vararg, expr.kws, + ) + rec_ov = fnty.get_overloads(signature.args) + mangler = self.context.mangler or default_mangler + abi_tags = self.fndesc.abi_tags + mangled_name = mangler(rec_ov.qualname, signature.args, + abi_tags=abi_tags, uid=rec_ov.uid) + # special case self recursion + if 
self.builder.function.name.startswith(mangled_name): + res = self.context.call_internal( + self.builder, self.fndesc, signature, argvals, + ) + else: + res = self.context.call_unresolved( + self.builder, mangled_name, signature, argvals, + ) + return res + + def _lower_call_FunctionType(self, fnty, expr, signature): + self.debug_print("# calling first-class function type") + sig = types.unliteral(signature) + if not fnty.check_signature(signature): + # value dependent polymorphism? + raise UnsupportedError( + f'mismatch of function types:' + f' expected {fnty} but got {types.FunctionType(sig)}') + ftype = fnty.ftype + argvals = self.fold_call_args( + fnty, sig, expr.args, expr.vararg, expr.kws, + ) + func_ptr = self.__get_function_pointer(ftype, expr.func.name, sig=sig) + res = self.builder.call(func_ptr, argvals, cconv=fnty.cconv) + return res + + def __get_function_pointer(self, ftype, fname, sig=None): + from numba.experimental.function_type import lower_get_wrapper_address + + llty = self.context.get_value_type(ftype) + fstruct = self.loadvar(fname) + addr = self.builder.extract_value(fstruct, 0, + name='addr_of_%s' % (fname)) + + fptr = cgutils.alloca_once(self.builder, llty, + name="fptr_of_%s" % (fname)) + with self.builder.if_else( + cgutils.is_null(self.builder, addr), + likely=False) as (then, orelse): + with then: + self.init_pyapi() + # Acquire the GIL + gil_state = self.pyapi.gil_ensure() + pyaddr = self.builder.extract_value( + fstruct, 1, + name='pyaddr_of_%s' % (fname)) + # try to recover the function address, see + # test_zero_address BadToGood example in + # test_function_type.py + addr1 = lower_get_wrapper_address( + self.context, self.builder, pyaddr, sig, + failure_mode='ignore') + with self.builder.if_then( + cgutils.is_null(self.builder, addr1), likely=False): + self.return_exception( + RuntimeError, + exc_args=(f"{ftype} function address is null",), + loc=self.loc) + addr2 = self.pyapi.long_as_voidptr(addr1) + 
self.builder.store(self.builder.bitcast(addr2, llty), fptr) + self.pyapi.decref(addr1) + self.pyapi.gil_release(gil_state) + with orelse: + self.builder.store(self.builder.bitcast(addr, llty), fptr) + return self.builder.load(fptr) + + def _lower_call_normal(self, fnty, expr, signature): + # Normal function resolution + self.debug_print("# calling normal function: {0}".format(fnty)) + self.debug_print("# signature: {0}".format(signature)) + if isinstance(fnty, types.ObjModeDispatcher): + argvals = expr.func.args + else: + argvals = self.fold_call_args( + fnty, signature, expr.args, expr.vararg, expr.kws, + ) + tname = expr.target + if tname is not None: + from numba.core.target_extension import resolve_dispatcher_from_str + disp = resolve_dispatcher_from_str(tname) + hw_ctx = disp.targetdescr.target_context + impl = hw_ctx.get_function(fnty, signature) + else: + impl = self.context.get_function(fnty, signature) + if signature.recvr: + # The "self" object is passed as the function object + # for bounded function + the_self = self.loadvar(expr.func.name) + # Prepend the self reference + argvals = [the_self] + list(argvals) + + res = impl(self.builder, argvals, self.loc) + return res + + def lower_expr(self, resty, expr): + if expr.op == 'binop': + return self.lower_binop(resty, expr, expr.fn) + elif expr.op == 'inplace_binop': + lty = self.typeof(expr.lhs.name) + if lty.mutable: + return self.lower_binop(resty, expr, expr.fn) + else: + # inplace operators on non-mutable types reuse the same + # definition as the corresponding copying operators.) 
+ return self.lower_binop(resty, expr, expr.immutable_fn) + elif expr.op == 'unary': + val = self.loadvar(expr.value.name) + typ = self.typeof(expr.value.name) + func_ty = self.context.typing_context.resolve_value_type(expr.fn) + # Get function + signature = self.fndesc.calltypes[expr] + impl = self.context.get_function(func_ty, signature) + # Convert argument to match + val = self.context.cast(self.builder, val, typ, signature.args[0]) + res = impl(self.builder, [val]) + res = self.context.cast(self.builder, res, + signature.return_type, resty) + return res + + elif expr.op == 'call': + res = self.lower_call(resty, expr) + return res + + elif expr.op == 'pair_first': + val = self.loadvar(expr.value.name) + ty = self.typeof(expr.value.name) + res = self.context.pair_first(self.builder, val, ty) + self.incref(resty, res) + return res + + elif expr.op == 'pair_second': + val = self.loadvar(expr.value.name) + ty = self.typeof(expr.value.name) + res = self.context.pair_second(self.builder, val, ty) + self.incref(resty, res) + return res + + elif expr.op in ('getiter', 'iternext'): + val = self.loadvar(expr.value.name) + ty = self.typeof(expr.value.name) + signature = self.fndesc.calltypes[expr] + impl = self.context.get_function(expr.op, signature) + [fty] = signature.args + castval = self.context.cast(self.builder, val, ty, fty) + res = impl(self.builder, (castval,)) + res = self.context.cast(self.builder, res, signature.return_type, + resty) + return res + + elif expr.op == 'exhaust_iter': + val = self.loadvar(expr.value.name) + ty = self.typeof(expr.value.name) + # Unpack optional + if isinstance(ty, types.Optional): + val = self.context.cast(self.builder, val, ty, ty.type) + ty = ty.type + + # If we have a tuple, we needn't do anything + # (and we can't iterate over the heterogeneous ones). 
+ if isinstance(ty, types.BaseTuple): + assert ty == resty + self.incref(ty, val) + return val + + itemty = ty.iterator_type.yield_type + tup = self.context.get_constant_undef(resty) + pairty = types.Pair(itemty, types.boolean) + getiter_sig = typing.signature(ty.iterator_type, ty) + getiter_impl = self.context.get_function('getiter', + getiter_sig) + iternext_sig = typing.signature(pairty, ty.iterator_type) + iternext_impl = self.context.get_function('iternext', + iternext_sig) + iterobj = getiter_impl(self.builder, (val,)) + # We call iternext() as many times as desired (`expr.count`). + for i in range(expr.count): + pair = iternext_impl(self.builder, (iterobj,)) + is_valid = self.context.pair_second(self.builder, + pair, pairty) + with cgutils.if_unlikely(self.builder, + self.builder.not_(is_valid)): + self.return_exception(ValueError, loc=self.loc) + item = self.context.pair_first(self.builder, + pair, pairty) + tup = self.builder.insert_value(tup, item, i) + + # Call iternext() once more to check that the iterator + # is exhausted. 
+ pair = iternext_impl(self.builder, (iterobj,)) + is_valid = self.context.pair_second(self.builder, + pair, pairty) + with cgutils.if_unlikely(self.builder, is_valid): + self.return_exception(ValueError, loc=self.loc) + + self.decref(ty.iterator_type, iterobj) + return tup + + elif expr.op == "getattr": + val = self.loadvar(expr.value.name) + ty = self.typeof(expr.value.name) + + if isinstance(resty, types.BoundFunction): + # if we are getting out a method, assume we have typed this + # properly and just build a bound function object + casted = self.context.cast(self.builder, val, ty, resty.this) + res = self.context.get_bound_function(self.builder, casted, + resty.this) + self.incref(resty, res) + return res + else: + impl = self.context.get_getattr(ty, expr.attr) + attrty = self.context.typing_context.resolve_getattr(ty, + expr.attr) + + if impl is None: + # ignore the attribute + return self.context.get_dummy_value() + else: + res = impl(self.context, self.builder, ty, val, expr.attr) + + # Cast the attribute type to the expected output type + res = self.context.cast(self.builder, res, attrty, resty) + return res + + elif expr.op == "static_getitem": + signature = typing.signature( + resty, + self.typeof(expr.value.name), + _lit_or_omitted(expr.index), + ) + try: + # Both get_function() and the returned implementation can + # raise NotImplementedError if the types aren't supported + impl = self.context.get_function("static_getitem", signature) + return impl(self.builder, + (self.loadvar(expr.value.name), expr.index)) + except NotImplementedError: + if expr.index_var is None: + raise + # Fall back on the generic getitem() implementation + # for this type. 
+ signature = self.fndesc.calltypes[expr] + return self.lower_getitem(resty, expr, expr.value, + expr.index_var, signature) + elif expr.op == "typed_getitem": + signature = typing.signature( + resty, + self.typeof(expr.value.name), + self.typeof(expr.index.name), + ) + impl = self.context.get_function("typed_getitem", signature) + return impl(self.builder, (self.loadvar(expr.value.name), + self.loadvar(expr.index.name))) + elif expr.op == "getitem": + signature = self.fndesc.calltypes[expr] + return self.lower_getitem(resty, expr, expr.value, expr.index, + signature) + + elif expr.op == "build_tuple": + itemvals = [self.loadvar(i.name) for i in expr.items] + itemtys = [self.typeof(i.name) for i in expr.items] + castvals = [self.context.cast(self.builder, val, fromty, toty) + for val, toty, fromty in zip(itemvals, resty, itemtys)] + tup = self.context.make_tuple(self.builder, resty, castvals) + self.incref(resty, tup) + return tup + + elif expr.op == "build_list": + itemvals = [self.loadvar(i.name) for i in expr.items] + itemtys = [self.typeof(i.name) for i in expr.items] + if isinstance(resty, types.LiteralList): + castvals = [self.context.cast(self.builder, val, fromty, toty) + for val, toty, fromty in zip(itemvals, resty.types, + itemtys)] + tup = self.context.make_tuple(self.builder, + types.Tuple(resty.types), + castvals) + self.incref(resty, tup) + return tup + else: + castvals = [self.context.cast(self.builder, val, fromty, + resty.dtype) + for val, fromty in zip(itemvals, itemtys)] + return self.context.build_list(self.builder, resty, castvals) + + elif expr.op == "build_set": + # Insert in reverse order, as Python does + items = expr.items[::-1] + itemvals = [self.loadvar(i.name) for i in items] + itemtys = [self.typeof(i.name) for i in items] + castvals = [self.context.cast(self.builder, val, fromty, + resty.dtype) + for val, fromty in zip(itemvals, itemtys)] + return self.context.build_set(self.builder, resty, castvals) + + elif expr.op == "build_map": + 
items = expr.items + keys, values = [], [] + key_types, value_types = [], [] + for k, v in items: + key = self.loadvar(k.name) + keytype = self.typeof(k.name) + val = self.loadvar(v.name) + valtype = self.typeof(v.name) + keys.append(key) + values.append(val) + key_types.append(keytype) + value_types.append(valtype) + return self.context.build_map(self.builder, resty, + list(zip(key_types, value_types)), + list(zip(keys, values))) + + elif expr.op == "cast": + val = self.loadvar(expr.value.name) + ty = self.typeof(expr.value.name) + castval = self.context.cast(self.builder, val, ty, resty) + self.incref(resty, castval) + return castval + + elif expr.op == "phi": + raise LoweringError("PHI not stripped") + + elif expr.op == 'null': + return self.context.get_constant_null(resty) + + elif expr.op in self.context.special_ops: + res = self.context.special_ops[expr.op](self, expr) + return res + + raise NotImplementedError(expr) + + def _alloca_var(self, name, fetype): + """ + Ensure the given variable has an allocated stack slot (if needed). + """ + if name in self.varmap: + # quit early + return + + # If the name is used in multiple blocks or lowering with debuginfo... + if ((name not in self._singly_assigned_vars) or + self._disable_sroa_like_opt): + # If not already defined, allocate it + ptr = self.alloca(name, fetype) + # Remember the pointer + self.varmap[name] = ptr + + def getvar(self, name): + """ + Get a pointer to the given variable's slot. + """ + if not self._disable_sroa_like_opt: + assert name not in self._blk_local_varmap + assert name not in self._singly_assigned_vars + return self.varmap[name] + + def loadvar(self, name): + """ + Load the given variable's value. 
+ """ + if name in self._blk_local_varmap and not self._disable_sroa_like_opt: + return self._blk_local_varmap[name] + ptr = self.getvar(name) + + # Don't associate debuginfo with the load for a function arg else it + # creates instructions ahead of the first source line of the + # function which then causes problems with breaking on the function + # symbol (it hits the symbol, not the first line). + if name in self.func_ir.arg_names: + with debuginfo.suspend_emission(self.builder): + return self.builder.load(ptr) + else: + return self.builder.load(ptr) + + def storevar(self, value, name, argidx=None): + """ + Store the value into the given variable. + """ + fetype = self.typeof(name) + # Define if not already + self._alloca_var(name, fetype) + + # Store variable + if (name in self._singly_assigned_vars and + not self._disable_sroa_like_opt): + self._blk_local_varmap[name] = value + else: + if argidx is None: + # Clean up existing value stored in the variable, not needed + # if it's an arg + old = self.loadvar(name) + self.decref(fetype, old) + + # stack stored variable + ptr = self.getvar(name) + if value.type != ptr.type.pointee: + msg = ("Storing {value.type} to ptr of {ptr.type.pointee} " + "('{name}'). FE type {fetype}").format(value=value, + ptr=ptr, + fetype=fetype, + name=name) + raise AssertionError(msg) + + # If this store is associated with an argument to the function (i.e. + # store following reassemble from CC splatting structs as many args + # to the function) then mark this variable as such. 
+ if argidx is not None: + with debuginfo.suspend_emission(self.builder): + self.builder.store(value, ptr) + loc = self.defn_loc # the line with `def ` + lltype = self.context.get_value_type(fetype) + sizeof = self.context.get_abi_sizeof(lltype) + datamodel = self.context.data_model_manager[fetype] + self.debuginfo.mark_variable(self.builder, ptr, name=name, + lltype=lltype, size=sizeof, + line=loc.line, datamodel=datamodel, + argidx=argidx) + else: + self.builder.store(value, ptr) + + def delvar(self, name): + """ + Delete the given variable. + """ + fetype = self.typeof(name) + + # Out-of-order + if (name not in self._blk_local_varmap and + not self._disable_sroa_like_opt): + if name in self._singly_assigned_vars: + self._singly_assigned_vars.discard(name) + + # Define if not already (may happen if the variable is deleted + # at the beginning of a loop, but only set later in the loop) + self._alloca_var(name, fetype) + + if name in self._blk_local_varmap and not self._disable_sroa_like_opt: + llval = self._blk_local_varmap[name] + self.decref(fetype, llval) + else: + ptr = self.getvar(name) + self.decref(fetype, self.builder.load(ptr)) + # Zero-fill variable to avoid double frees on subsequent dels + self.builder.store(Constant(ptr.type.pointee, None), ptr) + + def alloca(self, name, type): + lltype = self.context.get_value_type(type) + datamodel = self.context.data_model_manager[type] + return self.alloca_lltype(name, lltype, datamodel=datamodel) + + def alloca_lltype(self, name, lltype, datamodel=None): + # Is user variable? + is_uservar = not name.startswith('$') + # Allocate space for variable + aptr = cgutils.alloca_once(self.builder, lltype, + name=name, zfill=False) + + # Emit debug info for user variable + if is_uservar: + # Don't associate debuginfo with the alloca for a function arg, this + # is handled by the first store to the alloca so that repacking the + # splatted args from the CC is dealt with. 
+ if name not in self.func_ir.arg_names: + sizeof = self.context.get_abi_sizeof(lltype) + self.debuginfo.mark_variable(self.builder, aptr, name=name, + lltype=lltype, size=sizeof, + line=self.loc.line, + datamodel=datamodel,) + return aptr + + def incref(self, typ, val): + if not self.context.enable_nrt: + return + + self.context.nrt.incref(self.builder, typ, val) + + def decref(self, typ, val): + if not self.context.enable_nrt: + return + + # do not associate decref with "use", it creates "jumpy" line info as + # the decrefs are usually where the ir.Del nodes are, which is at the + # end of the block. + with debuginfo.suspend_emission(self.builder): + self.context.nrt.decref(self.builder, typ, val) + + +def _lit_or_omitted(value): + """Returns a Literal instance if the type of value is supported; + otherwise, return `Omitted(value)`. + """ + try: + return types.literal(value) + except LiteralTypingError: + return types.Omitted(value) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/object_mode_passes.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/object_mode_passes.py new file mode 100644 index 0000000000000000000000000000000000000000..f5cd52383c11986fa080cc2b7a356ddc9948843b --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/object_mode_passes.py @@ -0,0 +1,169 @@ +import warnings +from numba.core import (errors, types, typing, funcdesc, config, pylowering, + transforms) +from numba.core.compiler_machinery import (FunctionPass, LoweringPass, + register_pass) +from collections import defaultdict + + +@register_pass(mutates_CFG=True, analysis_only=False) +class ObjectModeFrontEnd(FunctionPass): + _name = "object_mode_front_end" + + def __init__(self): + FunctionPass.__init__(self) + + def _frontend_looplift(self, state): + """ + Loop lifting analysis and transformation + """ + loop_flags = state.flags.copy() + outer_flags = state.flags.copy() + # Do not recursively loop lift + outer_flags.enable_looplift = False + 
loop_flags.enable_looplift = False + if not state.flags.enable_pyobject_looplift: + loop_flags.enable_pyobject = False + loop_flags.enable_ssa = False + + main, loops = transforms.loop_lifting(state.func_ir, + typingctx=state.typingctx, + targetctx=state.targetctx, + locals=state.locals, + flags=loop_flags) + if loops: + # Some loops were extracted + if config.DEBUG_FRONTEND or config.DEBUG: + for loop in loops: + print("Lifting loop", loop.get_source_location()) + from numba.core.compiler import compile_ir + cres = compile_ir(state.typingctx, state.targetctx, main, + state.args, state.return_type, + outer_flags, state.locals, + lifted=tuple(loops), lifted_from=None, + is_lifted_loop=True) + return cres + + def run_pass(self, state): + from numba.core.compiler import _EarlyPipelineCompletion + # NOTE: That so much stuff, including going back into the compiler, is + # captured in a single pass is not ideal. + if state.flags.enable_looplift: + assert not state.lifted + cres = self._frontend_looplift(state) + if cres is not None: + raise _EarlyPipelineCompletion(cres) + + # Fallback typing: everything is a python object + state.typemap = defaultdict(lambda: types.pyobject) + state.calltypes = defaultdict(lambda: types.pyobject) + state.return_type = types.pyobject + return True + + +@register_pass(mutates_CFG=True, analysis_only=False) +class ObjectModeBackEnd(LoweringPass): + + _name = "object_mode_back_end" + + def __init__(self): + LoweringPass.__init__(self) + + def _py_lowering_stage(self, targetctx, library, interp, flags): + fndesc = funcdesc.PythonFunctionDescriptor.from_object_mode_function( + interp + ) + with targetctx.push_code_library(library): + lower = pylowering.PyLower(targetctx, library, fndesc, interp) + lower.lower() + if not flags.no_cpython_wrapper: + lower.create_cpython_wrapper() + env = lower.env + call_helper = lower.call_helper + del lower + from numba.core.compiler import _LowerResult # TODO: move this + if flags.no_compile: + return 
_LowerResult(fndesc, call_helper, cfunc=None, env=env) + else: + # Prepare for execution + cfunc = targetctx.get_executable(library, fndesc, env) + return _LowerResult(fndesc, call_helper, cfunc=cfunc, env=env) + + def run_pass(self, state): + """ + Lowering for object mode + """ + + if state.library is None: + codegen = state.targetctx.codegen() + state.library = codegen.create_library(state.func_id.func_qualname) + # Enable object caching upfront, so that the library can + # be later serialized. + state.library.enable_object_caching() + + def backend_object_mode(): + """ + Object mode compilation + """ + if len(state.args) != state.nargs: + # append missing + # BUG?: What's going on with nargs here? + # check state.nargs vs self.nargs on original code + state.args = (tuple(state.args) + (types.pyobject,) * + (state.nargs - len(state.args))) + + return self._py_lowering_stage(state.targetctx, + state.library, + state.func_ir, + state.flags) + + lowered = backend_object_mode() + signature = typing.signature(state.return_type, *state.args) + from numba.core.compiler import compile_result + state.cr = compile_result( + typing_context=state.typingctx, + target_context=state.targetctx, + entry_point=lowered.cfunc, + typing_error=state.status.fail_reason, + type_annotation=state.type_annotation, + library=state.library, + call_helper=lowered.call_helper, + signature=signature, + objectmode=True, + lifted=state.lifted, + fndesc=lowered.fndesc, + environment=lowered.env, + metadata=state.metadata, + reload_init=state.reload_init, + ) + + # Warn, deprecated behaviour, code compiled in objmode without + # force_pyobject indicates fallback from nopython mode + if not state.flags.force_pyobject: + # first warn about object mode and yes/no to lifted loops + if len(state.lifted) > 0: + warn_msg = ('Function "%s" was compiled in object mode without' + ' forceobj=True, but has lifted loops.' 
% + (state.func_id.func_name,)) + else: + warn_msg = ('Function "%s" was compiled in object mode without' + ' forceobj=True.' % (state.func_id.func_name,)) + warnings.warn(errors.NumbaWarning(warn_msg, + state.func_ir.loc)) + + url = ("https://numba.readthedocs.io/en/stable/reference/" + "deprecation.html#deprecation-of-object-mode-fall-" + "back-behaviour-when-using-jit") + msg = ("\nFall-back from the nopython compilation path to the " + "object mode compilation path has been detected, this is " + "deprecated behaviour.\n\nFor more information visit %s" % + url) + warnings.warn(errors.NumbaDeprecationWarning(msg, + state.func_ir.loc)) + if state.flags.release_gil: + warn_msg = ("Code running in object mode won't allow parallel" + " execution despite nogil=True.") + warnings.warn_explicit(warn_msg, errors.NumbaWarning, + state.func_id.filename, + state.func_id.firstlineno) + return True diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/optional.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/optional.py new file mode 100644 index 0000000000000000000000000000000000000000..16dcb52360ad2190c9b6af44023a362bbc33bf37 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/optional.py @@ -0,0 +1,121 @@ +import operator + +from numba.core import types, typing, cgutils + +from numba.core.imputils import (lower_cast, lower_builtin, + lower_getattr_generic, impl_ret_untracked, + lower_setattr_generic) + + +def always_return_true_impl(context, builder, sig, args): + return cgutils.true_bit + + +def always_return_false_impl(context, builder, sig, args): + return cgutils.false_bit + + +def optional_is_none(context, builder, sig, args): + """ + Check if an Optional value is invalid + """ + [lty, rty] = sig.args + [lval, rval] = args + + # Make sure None is on the right + if lty == types.none: + lty, rty = rty, lty + lval, rval = rval, lval + + opt_type = lty + opt_val = lval + + opt = context.make_helper(builder, opt_type, opt_val) + 
res = builder.not_(cgutils.as_bool_bit(builder, opt.valid)) + return impl_ret_untracked(context, builder, sig.return_type, res) + + +# None is/not None +lower_builtin(operator.is_, types.none, types.none)(always_return_true_impl) + +# Optional is None +lower_builtin(operator.is_, types.Optional, types.none)(optional_is_none) +lower_builtin(operator.is_, types.none, types.Optional)(optional_is_none) + + +@lower_getattr_generic(types.Optional) +def optional_getattr(context, builder, typ, value, attr): + """ + Optional.__getattr__ => redirect to the wrapped type. + """ + inner_type = typ.type + val = context.cast(builder, value, typ, inner_type) + imp = context.get_getattr(inner_type, attr) + return imp(context, builder, inner_type, val, attr) + + +@lower_setattr_generic(types.Optional) +def optional_setattr(context, builder, sig, args, attr): + """ + Optional.__setattr__ => redirect to the wrapped type. + """ + basety, valty = sig.args + target, val = args + target_type = basety.type + target = context.cast(builder, target, basety, target_type) + + newsig = typing.signature(sig.return_type, target_type, valty) + imp = context.get_setattr(attr, newsig) + return imp(builder, (target, val)) + + +@lower_cast(types.Optional, types.Optional) +def optional_to_optional(context, builder, fromty, toty, val): + """ + The handling of optional->optional cast must be special cased for + correct propagation of None value. Given type T and U. casting of + T? to U? (? denotes optional) should always succeed. If the from-value + is None, the None value the casted value (U?) should be None; otherwise, + the from-value is casted to U. This is different from casting T? to U, + which requires the from-value must not be None. 
+ """ + optval = context.make_helper(builder, fromty, value=val) + validbit = cgutils.as_bool_bit(builder, optval.valid) + # Create uninitialized optional value + outoptval = context.make_helper(builder, toty) + + with builder.if_else(validbit) as (is_valid, is_not_valid): + with is_valid: + # Cast internal value + outoptval.valid = cgutils.true_bit + outoptval.data = context.cast(builder, optval.data, + fromty.type, toty.type) + + with is_not_valid: + # Store None to result + outoptval.valid = cgutils.false_bit + outoptval.data = cgutils.get_null_value( + outoptval.data.type) + + return outoptval._getvalue() + + +@lower_cast(types.Any, types.Optional) +def any_to_optional(context, builder, fromty, toty, val): + if fromty == types.none: + return context.make_optional_none(builder, toty.type) + else: + val = context.cast(builder, val, fromty, toty.type) + return context.make_optional_value(builder, toty.type, val) + + +@lower_cast(types.Optional, types.Any) +@lower_cast(types.Optional, types.Boolean) +def optional_to_any(context, builder, fromty, toty, val): + optval = context.make_helper(builder, fromty, value=val) + validbit = cgutils.as_bool_bit(builder, optval.valid) + with builder.if_then(builder.not_(validbit), likely=False): + msg = "expected %s, got None" % (fromty.type,) + context.call_conv.return_user_exc(builder, TypeError, (msg,)) + + return context.cast(builder, optval.data, fromty.type, toty) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/options.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/options.py new file mode 100644 index 0000000000000000000000000000000000000000..3ca89a8e1ed9541b30a1805f79c887ffa2d9dff5 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/options.py @@ -0,0 +1,109 @@ +""" +Target Options +""" +import operator + +from numba.core import config, utils +from numba.core.targetconfig import TargetConfig, Option + + +class TargetOptions: + """Target options maps user options from 
decorators to the + ``numba.core.compiler.Flags`` used by lowering and target context. + """ + class Mapping: + def __init__(self, flag_name, apply=lambda x: x): + self.flag_name = flag_name + self.apply = apply + + def finalize(self, flags, options): + """Subclasses can override this method to make target specific + customizations of default flags. + + Parameters + ---------- + flags : Flags + options : dict + """ + pass + + @classmethod + def parse_as_flags(cls, flags, options): + """Parse target options defined in ``options`` and set ``flags`` + accordingly. + + Parameters + ---------- + flags : Flags + options : dict + """ + opt = cls() + opt._apply(flags, options) + opt.finalize(flags, options) + return flags + + def _apply(self, flags, options): + # Find all Mapping instances in the class + mappings = {} + cls = type(self) + for k in dir(cls): + v = getattr(cls, k) + if isinstance(v, cls.Mapping): + mappings[k] = v + + used = set() + for k, mapping in mappings.items(): + if k in options: + v = mapping.apply(options[k]) + setattr(flags, mapping.flag_name, v) + used.add(k) + + unused = set(options) - used + if unused: + # Unread options? + m = (f"Unrecognized options: {unused}. " + f"Known options are {mappings.keys()}") + raise KeyError(m) + + +_mapping = TargetOptions.Mapping + + +class DefaultOptions: + """Defines how user-level target options are mapped to the target flags. 
+ """ + nopython = _mapping("enable_pyobject", operator.not_) + forceobj = _mapping("force_pyobject") + looplift = _mapping("enable_looplift") + _nrt = _mapping("nrt") + debug = _mapping("debuginfo") + boundscheck = _mapping("boundscheck") + nogil = _mapping("release_gil") + + no_rewrites = _mapping("no_rewrites") + no_cpython_wrapper = _mapping("no_cpython_wrapper") + no_cfunc_wrapper = _mapping("no_cfunc_wrapper") + + parallel = _mapping("auto_parallel") + fastmath = _mapping("fastmath") + error_model = _mapping("error_model") + inline = _mapping("inline") + forceinline = _mapping("forceinline") + + target_backend = _mapping("target_backend") + + _dbg_extend_lifetimes = _mapping("dbg_extend_lifetimes") + _dbg_optnone = _mapping("dbg_optnone") + + +def include_default_options(*args): + """Returns a mixin class with a subset of the options + + Parameters + ---------- + *args : str + Option names to include. + """ + glbs = {k: getattr(DefaultOptions, k) for k in args} + + return type("OptionMixins", (), glbs) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/overload_glue.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/overload_glue.py new file mode 100644 index 0000000000000000000000000000000000000000..03eb97bfbe4b8e854cfa405842eb045673d4bf50 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/overload_glue.py @@ -0,0 +1,308 @@ +""" +Provides wrapper functions for "glueing" together Numba implementations that are +written in the "old" style of a separate typing and lowering implementation. +""" +import types as pytypes +import textwrap +from threading import RLock +from collections import defaultdict + +from numba.core import errors + + +class _OverloadWrapper(object): + """This class does all the work of assembling and registering wrapped split + implementations. 
+ """ + + def __init__(self, function, typing_key=None): + assert function is not None + self._function = function + self._typing_key = typing_key + self._BIND_TYPES = dict() + self._selector = None + self._TYPER = None + # run to register overload, the intrinsic sorts out the binding to the + # registered impls at the point the overload is evaluated, i.e. this + # is all lazy. + self._build() + + def _stub_generator(self, body_func, varnames): + """This generates a function based on the argnames provided in + "varnames", the "body_func" is the function that'll type the overloaded + function and then work out which lowering to return""" + def stub(tyctx): + # body is supplied when the function is magic'd into life via glbls + return body(tyctx) # noqa: F821 + + stub_code = stub.__code__ + new_varnames = [*stub_code.co_varnames] + new_varnames.extend(varnames) + co_argcount = len(new_varnames) + co_args = [co_argcount] + additional_co_nlocals = len(varnames) + + from numba.core import utils + if utils.PYVERSION >= (3, 8): + co_args.append(stub_code.co_posonlyargcount) + co_args.append(stub_code.co_kwonlyargcount) + co_args.extend([stub_code.co_nlocals + additional_co_nlocals, + stub_code.co_stacksize, + stub_code.co_flags, + stub_code.co_code, + stub_code.co_consts, + stub_code.co_names, + tuple(new_varnames), + stub_code.co_filename, + stub_code.co_name, + stub_code.co_firstlineno, + stub_code.co_lnotab, + stub_code.co_freevars, + stub_code.co_cellvars + ]) + + new_code = pytypes.CodeType(*co_args) + + # get function + new_func = pytypes.FunctionType(new_code, {'body': body_func}) + return new_func + + def wrap_typing(self): + """ + Use this to replace @infer_global, it records the decorated function + as a typer for the argument `concrete_function`. 
+ """ + if self._typing_key is None: + key = self._function + else: + key = self._typing_key + + def inner(typing_class): + # Note that two templates could be used for the same function, to + # avoid @infer_global etc the typing template is copied. This is to + # ensure there's a 1:1 relationship between the typing templates and + # their keys. + clazz_dict = dict(typing_class.__dict__) + clazz_dict['key'] = key + cloned = type(f"cloned_template_for_{key}", typing_class.__bases__, + clazz_dict) + self._TYPER = cloned + _overload_glue.add_no_defer(key) + self._build() + return typing_class + return inner + + def wrap_impl(self, *args): + """ + Use this to replace @lower*, it records the decorated function as the + lowering implementation + """ + assert self._TYPER is not None + + def inner(lowerer): + self._BIND_TYPES[args] = lowerer + return lowerer + return inner + + def _assemble(self): + """Assembles the OverloadSelector definitions from the registered + typing to lowering map. + """ + from numba.core.base import OverloadSelector + + if self._typing_key is None: + key = self._function + else: + key = self._typing_key + + _overload_glue.flush_deferred_lowering(key) + + self._selector = OverloadSelector() + msg = f"No entries in the typing->lowering map for {self._function}" + assert self._BIND_TYPES, msg + for sig, impl in self._BIND_TYPES.items(): + self._selector.append(impl, sig) + + def _build(self): + from numba.core.extending import overload, intrinsic + + @overload(self._function, strict=False, + jit_options={'forceinline': True}) + def ol_generated(*ol_args, **ol_kwargs): + def body(tyctx): + msg = f"No typer registered for {self._function}" + if self._TYPER is None: + raise errors.InternalError(msg) + typing = self._TYPER(tyctx) + sig = typing.apply(ol_args, ol_kwargs) + if sig is None: + # this follows convention of something not typeable + # returning None + return None + if self._selector is None: + self._assemble() + lowering = 
self._selector.find(sig.args) + msg = (f"Could not find implementation to lower {sig} for ", + f"{self._function}") + if lowering is None: + raise errors.InternalError(msg) + return sig, lowering + + # Need a typing context now so as to get a signature and a binding + # for the kwarg order. + from numba.core.target_extension import (dispatcher_registry, + resolve_target_str, + current_target) + disp = dispatcher_registry[resolve_target_str(current_target())] + typing_context = disp.targetdescr.typing_context + typing = self._TYPER(typing_context) + sig = typing.apply(ol_args, ol_kwargs) + if not sig: + # No signature is a typing error, there's no match, so report it + raise errors.TypingError("No match") + + # The following code branches based on whether the signature has a + # "pysig", if it does, it's from a CallableTemplate and + # specialisation is required based on precise arg/kwarg names and + # default values, if it does not, then it just requires + # specialisation based on the arg count. + # + # The "gen_var_names" function is defined to generate the variable + # names at the call site of the intrinsic. + # + # The "call_str_specific" is the list of args to the function + # returned by the @overload, it has to have matching arg names and + # kwargs names/defaults if the underlying typing template supports + # it (CallableTemplate), else it has to have a matching number of + # arguments (AbstractTemplate). The "call_str" is the list of args + # that will be passed to the intrinsic that deals with typing and + # selection of the lowering etc, so it just needs to be a list of + # the argument names. + + if sig.pysig: # CallableTemplate, has pysig + pysig_params = sig.pysig.parameters + + # Define the var names + gen_var_names = [x for x in pysig_params.keys()] + # CallableTemplate, pysig is present so generate the exact thing + # this is to permit calling with positional args specified by + # name. 
+ buf = [] + for k, v in pysig_params.items(): + if v.default is v.empty: # no default ~= positional arg + buf.append(k) + else: # is kwarg, wire in default + buf.append(f'{k} = {v.default}') + call_str_specific = ', '.join(buf) + call_str = ', '.join(pysig_params.keys()) + else: # AbstractTemplate, need to bind 1:1 vars to the arg count + # Define the var names + gen_var_names = [f'tmp{x}' for x in range(len(ol_args))] + # Everything is just passed by position, there should be no + # kwargs. + assert not ol_kwargs + call_str_specific = ', '.join(gen_var_names) + call_str = call_str_specific + + stub = self._stub_generator(body, gen_var_names) + intrin = intrinsic(stub) + + # NOTE: The jit_wrapper functions cannot take `*args` + # albeit this an obvious choice for accepting an unknown number + # of arguments. If this is done, `*args` ends up as a cascade of + # Tuple assembling in the IR which ends up with literal + # information being lost. As a result the _exact_ argument list + # is generated to match the number of arguments and kwargs. + name = str(self._function) + # This is to name the function with something vaguely identifiable + name = ''.join([x if x not in {'>','<',' ','-','.'} else '_' + for x in name]) + gen = textwrap.dedent((""" + def jit_wrapper_{}({}): + return intrin({}) + """)).format(name, call_str_specific, call_str) + l = {} + g = {'intrin': intrin} + exec(gen, g, l) + return l['jit_wrapper_{}'.format(name)] + + +class _Gluer: + """This is a helper class to make sure that each concrete overload has only + one wrapper as the code relies on the wrapper being a singleton.""" + def __init__(self): + self._registered = dict() + self._lock = RLock() + # `_no_defer` stores keys that should not defer lowering because typing + # is already provided. + self._no_defer = set() + # `_deferred` stores lowering that must be deferred because the typing + # has not been provided. 
+ self._deferred = defaultdict(list) + + def __call__(self, func, typing_key=None): + with self._lock: + if typing_key is None: + key = func + else: + key = typing_key + if key in self._registered: + return self._registered[key] + else: + wrapper = _OverloadWrapper(func, typing_key=typing_key) + self._registered[key] = wrapper + return wrapper + + def defer_lowering(self, key, lower_fn): + """Defer lowering of the given key and lowering function. + """ + with self._lock: + if key in self._no_defer: + # Key is marked as no defer, register lowering now + lower_fn() + else: + # Defer + self._deferred[key].append(lower_fn) + + def add_no_defer(self, key): + """Stop lowering to be deferred for the given key. + """ + with self._lock: + self._no_defer.add(key) + + def flush_deferred_lowering(self, key): + """Flush the deferred lowering for the given key. + """ + with self._lock: + deferred = self._deferred.pop(key, []) + for cb in deferred: + cb() + + +_overload_glue = _Gluer() +del _Gluer + + +def glue_typing(concrete_function, typing_key=None): + """This is a decorator for wrapping the typing part for a concrete function + 'concrete_function', it's a text-only replacement for '@infer_global'""" + return _overload_glue(concrete_function, + typing_key=typing_key).wrap_typing() + + +def glue_lowering(*args): + """This is a decorator for wrapping the implementation (lowering) part for + a concrete function. 'args[0]' is the concrete_function, 'args[1:]' are the + types the lowering will accept. 
This acts as a text-only replacement for + '@lower/@lower_builtin'""" + + def wrap(fn): + key = args[0] + + def real_call(): + glue = _overload_glue(args[0], typing_key=key) + return glue.wrap_impl(*args[1:])(fn) + + _overload_glue.defer_lowering(key, real_call) + return fn + return wrap diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/postproc.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/postproc.py new file mode 100644 index 0000000000000000000000000000000000000000..a43c601dfbaa381774ac17fd7c7f06bd0c61279d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/postproc.py @@ -0,0 +1,238 @@ +from numba.core import utils, ir, analysis, transforms, ir_utils + + +class YieldPoint(object): + + def __init__(self, block, inst): + assert isinstance(block, ir.Block) + assert isinstance(inst, ir.Yield) + self.block = block + self.inst = inst + self.live_vars = None + self.weak_live_vars = None + + +class GeneratorInfo(object): + + def __init__(self): + # { index: YieldPoint } + self.yield_points = {} + # Ordered list of variable names + self.state_vars = [] + + def get_yield_points(self): + """ + Return an iterable of YieldPoint instances. 
+ """ + return self.yield_points.values() + + +class VariableLifetime(object): + """ + For lazily building information of variable lifetime + """ + def __init__(self, blocks): + self._blocks = blocks + + @utils.cached_property + def cfg(self): + return analysis.compute_cfg_from_blocks(self._blocks) + + @utils.cached_property + def usedefs(self): + return analysis.compute_use_defs(self._blocks) + + @utils.cached_property + def livemap(self): + return analysis.compute_live_map(self.cfg, self._blocks, + self.usedefs.usemap, + self.usedefs.defmap) + + @utils.cached_property + def deadmaps(self): + return analysis.compute_dead_maps(self.cfg, self._blocks, self.livemap, + self.usedefs.defmap) + + +# other packages that define new nodes add calls for inserting dels +# format: {type:function} +ir_extension_insert_dels = {} + + +class PostProcessor(object): + """ + A post-processor for Numba IR. + """ + + def __init__(self, func_ir): + self.func_ir = func_ir + + def run(self, emit_dels: bool = False, extend_lifetimes: bool = False): + """ + Run the following passes over Numba IR: + - canonicalize the CFG + - emit explicit `del` instructions for variables + - compute lifetime of variables + - compute generator info (if function is a generator function) + """ + self.func_ir.blocks = transforms.canonicalize_cfg(self.func_ir.blocks) + vlt = VariableLifetime(self.func_ir.blocks) + self.func_ir.variable_lifetime = vlt + + bev = analysis.compute_live_variables(vlt.cfg, self.func_ir.blocks, + vlt.usedefs.defmap, + vlt.deadmaps.combined) + for offset, ir_block in self.func_ir.blocks.items(): + self.func_ir.block_entry_vars[ir_block] = bev[offset] + + if self.func_ir.is_generator: + self.func_ir.generator_info = GeneratorInfo() + self._compute_generator_info() + else: + self.func_ir.generator_info = None + + # Emit del nodes, do this last as the generator info parsing generates + # and then strips dels as part of its analysis. 
+ if emit_dels: + self._insert_var_dels(extend_lifetimes=extend_lifetimes) + + def _populate_generator_info(self): + """ + Fill `index` for the Yield instruction and create YieldPoints. + """ + dct = self.func_ir.generator_info.yield_points + assert not dct, 'rerunning _populate_generator_info' + for block in self.func_ir.blocks.values(): + for inst in block.body: + if isinstance(inst, ir.Assign): + yieldinst = inst.value + if isinstance(yieldinst, ir.Yield): + index = len(dct) + 1 + yieldinst.index = index + yp = YieldPoint(block, yieldinst) + dct[yieldinst.index] = yp + + def _compute_generator_info(self): + """ + Compute the generator's state variables as the union of live variables + at all yield points. + """ + # generate del info, it's used in analysis here, strip it out at the end + self._insert_var_dels() + self._populate_generator_info() + gi = self.func_ir.generator_info + for yp in gi.get_yield_points(): + live_vars = set(self.func_ir.get_block_entry_vars(yp.block)) + weak_live_vars = set() + stmts = iter(yp.block.body) + for stmt in stmts: + if isinstance(stmt, ir.Assign): + if stmt.value is yp.inst: + break + live_vars.add(stmt.target.name) + elif isinstance(stmt, ir.Del): + live_vars.remove(stmt.value) + else: + assert 0, "couldn't find yield point" + # Try to optimize out any live vars that are deleted immediately + # after the yield point. + for stmt in stmts: + if isinstance(stmt, ir.Del): + name = stmt.value + if name in live_vars: + live_vars.remove(name) + weak_live_vars.add(name) + else: + break + yp.live_vars = live_vars + yp.weak_live_vars = weak_live_vars + + st = set() + for yp in gi.get_yield_points(): + st |= yp.live_vars + st |= yp.weak_live_vars + gi.state_vars = sorted(st) + self.remove_dels() + + def _insert_var_dels(self, extend_lifetimes=False): + """ + Insert del statements for each variable. + Returns a 2-tuple of (variable definition map, variable deletion map) + which indicates variables defined and deleted in each block. 
+ + The algorithm avoids relying on explicit knowledge on loops and + distinguish between variables that are defined locally vs variables that + come from incoming blocks. + We start with simple usage (variable reference) and definition (variable + creation) maps on each block. Propagate the liveness info to predecessor + blocks until it stabilize, at which point we know which variables must + exist before entering each block. Then, we compute the end of variable + lives and insert del statements accordingly. Variables are deleted after + the last use. Variable referenced by terminators (e.g. conditional + branch and return) are deleted by the successors or the caller. + """ + vlt = self.func_ir.variable_lifetime + self._patch_var_dels(vlt.deadmaps.internal, vlt.deadmaps.escaping, + extend_lifetimes=extend_lifetimes) + + def _patch_var_dels(self, internal_dead_map, escaping_dead_map, + extend_lifetimes=False): + """ + Insert delete in each block + """ + for offset, ir_block in self.func_ir.blocks.items(): + # for each internal var, insert delete after the last use + internal_dead_set = internal_dead_map[offset].copy() + delete_pts = [] + # for each statement in reverse order + for stmt in reversed(ir_block.body[:-1]): + # internal vars that are used here + live_set = set(v.name for v in stmt.list_vars()) + dead_set = live_set & internal_dead_set + for T, def_func in ir_extension_insert_dels.items(): + if isinstance(stmt, T): + done_dels = def_func(stmt, dead_set) + dead_set -= done_dels + internal_dead_set -= done_dels + # used here but not afterwards + delete_pts.append((stmt, dead_set)) + internal_dead_set -= dead_set + + # rewrite body and insert dels + body = [] + lastloc = ir_block.loc + del_store = [] + for stmt, delete_set in reversed(delete_pts): + # If using extended lifetimes then the Dels are all put at the + # block end just ahead of the terminator, so associate their + # location with the terminator. 
+ if extend_lifetimes: + lastloc = ir_block.body[-1].loc + else: + lastloc = stmt.loc + # Ignore dels (assuming no user inserted deletes) + if not isinstance(stmt, ir.Del): + body.append(stmt) + # note: the reverse sort is not necessary for correctness + # it is just to minimize changes to test for now + for var_name in sorted(delete_set, reverse=True): + delnode = ir.Del(var_name, loc=lastloc) + if extend_lifetimes: + del_store.append(delnode) + else: + body.append(delnode) + if extend_lifetimes: + body.extend(del_store) + body.append(ir_block.body[-1]) # terminator + ir_block.body = body + + # vars to delete at the start + escape_dead_set = escaping_dead_map[offset] + for var_name in sorted(escape_dead_set): + ir_block.prepend(ir.Del(var_name, loc=ir_block.body[0].loc)) + + def remove_dels(self): + """ + Strips the IR of Del nodes + """ + ir_utils.remove_dels(self.func_ir.blocks) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/pylowering.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/pylowering.py new file mode 100644 index 0000000000000000000000000000000000000000..017124c632bf355cd397514b22295c9175568f7d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/pylowering.py @@ -0,0 +1,655 @@ +""" +Lowering implementation for object mode. +""" + + +import builtins +import operator +import inspect + +import llvmlite.ir + +from numba.core import types, utils, ir, generators, cgutils +from numba.core.errors import (ForbiddenConstruct, LoweringError, + NumbaNotImplementedError) +from numba.core.lowering import BaseLower + + +# Issue #475: locals() is unsupported as calling it naively would give +# out wrong results. 
+_unsupported_builtins = set([locals]) + +# Map operators to methods on the PythonAPI class +PYTHON_BINOPMAP = { + operator.add: ("number_add", False), + operator.sub: ("number_subtract", False), + operator.mul: ("number_multiply", False), + operator.truediv: ("number_truedivide", False), + operator.floordiv: ("number_floordivide", False), + operator.mod: ("number_remainder", False), + operator.pow: ("number_power", False), + operator.lshift: ("number_lshift", False), + operator.rshift: ("number_rshift", False), + operator.and_: ("number_and", False), + operator.or_: ("number_or", False), + operator.xor: ("number_xor", False), + # inplace operators + operator.iadd: ("number_add", True), + operator.isub: ("number_subtract", True), + operator.imul: ("number_multiply", True), + operator.itruediv: ("number_truedivide", True), + operator.ifloordiv: ("number_floordivide", True), + operator.imod: ("number_remainder", True), + operator.ipow: ("number_power", True), + operator.ilshift: ("number_lshift", True), + operator.irshift: ("number_rshift", True), + operator.iand: ("number_and", True), + operator.ior: ("number_or", True), + operator.ixor: ("number_xor", True), +} + +PYTHON_BINOPMAP[operator.matmul] = ("number_matrix_multiply", False) +PYTHON_BINOPMAP[operator.imatmul] = ("number_matrix_multiply", True) + +PYTHON_COMPAREOPMAP = { + operator.eq: '==', + operator.ne: '!=', + operator.lt: '<', + operator.le: '<=', + operator.gt: '>', + operator.ge: '>=', + operator.is_: 'is', + operator.is_not: 'is not', + operator.contains: 'in' +} + +class PyLower(BaseLower): + + GeneratorLower = generators.PyGeneratorLower + + def init(self): + # Strings to be frozen into the Environment object + self._frozen_strings = set() + + self._live_vars = set() + + def pre_lower(self): + super(PyLower, self).pre_lower() + self.init_pyapi() + + def post_lower(self): + pass + + def pre_block(self, block): + self.init_vars(block) + + def lower_inst(self, inst): + if isinstance(inst, ir.Assign): + 
value = self.lower_assign(inst) + self.storevar(value, inst.target.name) + + elif isinstance(inst, ir.SetItem): + target = self.loadvar(inst.target.name) + index = self.loadvar(inst.index.name) + value = self.loadvar(inst.value.name) + ok = self.pyapi.object_setitem(target, index, value) + self.check_int_status(ok) + + elif isinstance(inst, ir.DelItem): + target = self.loadvar(inst.target.name) + index = self.loadvar(inst.index.name) + ok = self.pyapi.object_delitem(target, index) + self.check_int_status(ok) + + elif isinstance(inst, ir.SetAttr): + target = self.loadvar(inst.target.name) + value = self.loadvar(inst.value.name) + ok = self.pyapi.object_setattr(target, + self._freeze_string(inst.attr), + value) + self.check_int_status(ok) + + elif isinstance(inst, ir.DelAttr): + target = self.loadvar(inst.target.name) + ok = self.pyapi.object_delattr(target, + self._freeze_string(inst.attr)) + self.check_int_status(ok) + + elif isinstance(inst, ir.StoreMap): + dct = self.loadvar(inst.dct.name) + key = self.loadvar(inst.key.name) + value = self.loadvar(inst.value.name) + ok = self.pyapi.dict_setitem(dct, key, value) + self.check_int_status(ok) + + elif isinstance(inst, ir.Return): + retval = self.loadvar(inst.value.name) + if self.generator_info: + # StopIteration + # We own a reference to the "return value", but we + # don't return it. + self.pyapi.decref(retval) + self.genlower.return_from_generator(self) + return + # No need to incref() as the reference is already owned. 
+ self.call_conv.return_value(self.builder, retval) + + elif isinstance(inst, ir.Branch): + cond = self.loadvar(inst.cond.name) + if cond.type == llvmlite.ir.IntType(1): + istrue = cond + else: + istrue = self.pyapi.object_istrue(cond) + zero = llvmlite.ir.Constant(istrue.type, None) + pred = self.builder.icmp_unsigned('!=', istrue, zero) + tr = self.blkmap[inst.truebr] + fl = self.blkmap[inst.falsebr] + self.builder.cbranch(pred, tr, fl) + + elif isinstance(inst, ir.Jump): + target = self.blkmap[inst.target] + self.builder.branch(target) + + elif isinstance(inst, ir.Del): + self.delvar(inst.value) + + elif isinstance(inst, ir.PopBlock): + pass # this is just a marker + + elif isinstance(inst, ir.Raise): + if inst.exception is not None: + exc = self.loadvar(inst.exception.name) + # A reference will be stolen by raise_object() and another + # by return_exception_raised(). + self.incref(exc) + else: + exc = None + self.pyapi.raise_object(exc) + self.return_exception_raised() + + else: + msg = f"{type(inst)}, {inst}" + raise NumbaNotImplementedError(msg) + + @utils.cached_property + def _omitted_typobj(self): + """Return a `OmittedArg` type instance as a LLVM value suitable for + testing at runtime. 
+ """ + from numba.core.dispatcher import OmittedArg + return self.pyapi.unserialize( + self.pyapi.serialize_object(OmittedArg)) + + def lower_assign(self, inst): + """ + The returned object must have a new reference + """ + value = inst.value + if isinstance(value, (ir.Const, ir.FreeVar)): + return self.lower_const(value.value) + elif isinstance(value, ir.Var): + val = self.loadvar(value.name) + self.incref(val) + return val + elif isinstance(value, ir.Expr): + return self.lower_expr(value) + elif isinstance(value, ir.Global): + return self.lower_global(value.name, value.value) + elif isinstance(value, ir.Yield): + return self.lower_yield(value) + elif isinstance(value, ir.Arg): + param = self.func_ir.func_id.pysig.parameters.get(value.name) + + obj = self.fnargs[value.index] + slot = cgutils.alloca_once_value(self.builder, obj) + # Don't check for OmittedArg unless the argument has a default + if param is not None and param.default is inspect.Parameter.empty: + self.incref(obj) + self.builder.store(obj, slot) + else: + # When an argument is omitted, the dispatcher hands it as + # _OmittedArg() + typobj = self.pyapi.get_type(obj) + is_omitted = self.builder.icmp_unsigned('==', typobj, + self._omitted_typobj) + with self.builder.if_else(is_omitted, likely=False) as (omitted, present): + with present: + self.incref(obj) + self.builder.store(obj, slot) + with omitted: + # The argument is omitted => get the default value + obj = self.pyapi.object_getattr_string(obj, 'value') + self.builder.store(obj, slot) + + return self.builder.load(slot) + else: + raise NotImplementedError(type(value), value) + + def lower_yield(self, inst): + yp = self.generator_info.yield_points[inst.index] + assert yp.inst is inst + self.genlower.init_generator_state(self) + + # Save live vars in state + # We also need to save live vars that are del'ed afterwards. 
+ y = generators.LowerYield(self, yp, yp.live_vars | yp.weak_live_vars) + y.lower_yield_suspend() + # Yield to caller + val = self.loadvar(inst.value.name) + # Let caller own the reference + self.pyapi.incref(val) + self.call_conv.return_value(self.builder, val) + + # Resumption point + y.lower_yield_resume() + # None is returned by the yield expression + return self.pyapi.make_none() + + def lower_binop(self, expr, op, inplace=False): + lhs = self.loadvar(expr.lhs.name) + rhs = self.loadvar(expr.rhs.name) + assert not isinstance(op, str) + if op in PYTHON_BINOPMAP: + fname, inplace = PYTHON_BINOPMAP[op] + fn = getattr(self.pyapi, fname) + res = fn(lhs, rhs, inplace=inplace) + else: + # Assumed to be rich comparison + fn = PYTHON_COMPAREOPMAP.get(expr.fn, expr.fn) + if fn == 'in': # 'in' and operator.contains have args reversed + lhs, rhs = rhs, lhs + res = self.pyapi.object_richcompare(lhs, rhs, fn) + self.check_error(res) + return res + + def lower_expr(self, expr): + if expr.op == 'binop': + return self.lower_binop(expr, expr.fn, inplace=False) + elif expr.op == 'inplace_binop': + return self.lower_binop(expr, expr.fn, inplace=True) + elif expr.op == 'unary': + value = self.loadvar(expr.value.name) + if expr.fn == operator.neg: + res = self.pyapi.number_negative(value) + elif expr.fn == operator.pos: + res = self.pyapi.number_positive(value) + elif expr.fn == operator.not_: + res = self.pyapi.object_not(value) + self.check_int_status(res) + res = self.pyapi.bool_from_bool(res) + elif expr.fn == operator.invert: + res = self.pyapi.number_invert(value) + else: + raise NotImplementedError(expr) + self.check_error(res) + return res + elif expr.op == 'call': + argvals = [self.loadvar(a.name) for a in expr.args] + fn = self.loadvar(expr.func.name) + args = self.pyapi.tuple_pack(argvals) + if expr.vararg: + # Expand *args + varargs = self.pyapi.sequence_tuple( + self.loadvar(expr.vararg.name)) + new_args = self.pyapi.sequence_concat(args, varargs) + 
self.decref(varargs) + self.decref(args) + args = new_args + if not expr.kws: + # No named arguments + ret = self.pyapi.call(fn, args, None) + else: + # Named arguments + keyvalues = [(k, self.loadvar(v.name)) for k, v in expr.kws] + kws = self.pyapi.dict_pack(keyvalues) + ret = self.pyapi.call(fn, args, kws) + self.decref(kws) + self.decref(args) + self.check_error(ret) + return ret + elif expr.op == 'getattr': + obj = self.loadvar(expr.value.name) + res = self.pyapi.object_getattr(obj, self._freeze_string(expr.attr)) + self.check_error(res) + return res + elif expr.op == 'build_tuple': + items = [self.loadvar(it.name) for it in expr.items] + res = self.pyapi.tuple_pack(items) + self.check_error(res) + return res + elif expr.op == 'build_list': + items = [self.loadvar(it.name) for it in expr.items] + res = self.pyapi.list_pack(items) + self.check_error(res) + return res + elif expr.op == 'build_map': + res = self.pyapi.dict_new(expr.size) + self.check_error(res) + for k, v in expr.items: + key = self.loadvar(k.name) + value = self.loadvar(v.name) + ok = self.pyapi.dict_setitem(res, key, value) + self.check_int_status(ok) + return res + elif expr.op == 'build_set': + items = [self.loadvar(it.name) for it in expr.items] + res = self.pyapi.set_new() + self.check_error(res) + for it in items: + ok = self.pyapi.set_add(res, it) + self.check_int_status(ok) + return res + elif expr.op == 'getiter': + obj = self.loadvar(expr.value.name) + res = self.pyapi.object_getiter(obj) + self.check_error(res) + return res + elif expr.op == 'iternext': + iterobj = self.loadvar(expr.value.name) + item = self.pyapi.iter_next(iterobj) + is_valid = cgutils.is_not_null(self.builder, item) + pair = self.pyapi.tuple_new(2) + with self.builder.if_else(is_valid) as (then, otherwise): + with then: + self.pyapi.tuple_setitem(pair, 0, item) + with otherwise: + self.check_occurred() + # Make the tuple valid by inserting None as dummy + # iteration "result" (it will be ignored). 
+ self.pyapi.tuple_setitem(pair, 0, self.pyapi.make_none()) + self.pyapi.tuple_setitem(pair, 1, self.pyapi.bool_from_bool(is_valid)) + return pair + elif expr.op == 'pair_first': + pair = self.loadvar(expr.value.name) + first = self.pyapi.tuple_getitem(pair, 0) + self.incref(first) + return first + elif expr.op == 'pair_second': + pair = self.loadvar(expr.value.name) + second = self.pyapi.tuple_getitem(pair, 1) + self.incref(second) + return second + elif expr.op == 'exhaust_iter': + iterobj = self.loadvar(expr.value.name) + tup = self.pyapi.sequence_tuple(iterobj) + self.check_error(tup) + # Check tuple size is as expected + tup_size = self.pyapi.tuple_size(tup) + expected_size = self.context.get_constant(types.intp, expr.count) + has_wrong_size = self.builder.icmp_unsigned('!=', + tup_size, expected_size) + with cgutils.if_unlikely(self.builder, has_wrong_size): + self.return_exception(ValueError) + return tup + elif expr.op == 'getitem': + value = self.loadvar(expr.value.name) + index = self.loadvar(expr.index.name) + res = self.pyapi.object_getitem(value, index) + self.check_error(res) + return res + elif expr.op == 'static_getitem': + value = self.loadvar(expr.value.name) + index = self.context.get_constant(types.intp, expr.index) + indexobj = self.pyapi.long_from_ssize_t(index) + self.check_error(indexobj) + res = self.pyapi.object_getitem(value, indexobj) + self.decref(indexobj) + self.check_error(res) + return res + elif expr.op == 'getslice': + target = self.loadvar(expr.target.name) + start = self.loadvar(expr.start.name) + stop = self.loadvar(expr.stop.name) + + slicefn = self.get_builtin_obj("slice") + sliceobj = self.pyapi.call_function_objargs(slicefn, (start, stop)) + self.decref(slicefn) + self.check_error(sliceobj) + + res = self.pyapi.object_getitem(target, sliceobj) + self.check_error(res) + + return res + + elif expr.op == 'cast': + val = self.loadvar(expr.value.name) + self.incref(val) + return val + elif expr.op == 'phi': + raise 
LoweringError("PHI not stripped") + + elif expr.op == 'null': + # Make null value + return cgutils.get_null_value(self.pyapi.pyobj) + + else: + raise NotImplementedError(expr) + + def lower_const(self, const): + # All constants are frozen inside the environment + index = self.env_manager.add_const(const) + ret = self.env_manager.read_const(index) + self.check_error(ret) + self.incref(ret) + return ret + + def lower_global(self, name, value): + """ + 1) Check global scope dictionary. + 2) Check __builtins__. + 2a) is it a dictionary (for non __main__ module) + 2b) is it a module (for __main__ module) + """ + moddict = self.get_module_dict() + obj = self.pyapi.dict_getitem(moddict, self._freeze_string(name)) + self.incref(obj) # obj is borrowed + + try: + if value in _unsupported_builtins: + raise ForbiddenConstruct("builtins %s() is not supported" + % name, loc=self.loc) + except TypeError: + # `value` is unhashable, ignore + pass + + if hasattr(builtins, name): + obj_is_null = self.is_null(obj) + bbelse = self.builder.basic_block + + with self.builder.if_then(obj_is_null): + mod = self.pyapi.dict_getitem(moddict, + self._freeze_string("__builtins__")) + builtin = self.builtin_lookup(mod, name) + bbif = self.builder.basic_block + + retval = self.builder.phi(self.pyapi.pyobj) + retval.add_incoming(obj, bbelse) + retval.add_incoming(builtin, bbif) + + else: + retval = obj + with cgutils.if_unlikely(self.builder, self.is_null(retval)): + self.pyapi.raise_missing_global_error(name) + self.return_exception_raised() + + return retval + + # ------------------------------------------------------------------------- + + def get_module_dict(self): + return self.env_body.globals + + def get_builtin_obj(self, name): + # XXX The builtins dict could be bound into the environment + moddict = self.get_module_dict() + mod = self.pyapi.dict_getitem(moddict, + self._freeze_string("__builtins__")) + return self.builtin_lookup(mod, name) + + def builtin_lookup(self, mod, name): + """ + 
Args + ---- + mod: + The __builtins__ dictionary or module, as looked up in + a module's globals. + name: str + The object to lookup + """ + fromdict = self.pyapi.dict_getitem(mod, self._freeze_string(name)) + self.incref(fromdict) # fromdict is borrowed + bbifdict = self.builder.basic_block + + with cgutils.if_unlikely(self.builder, self.is_null(fromdict)): + # This happen if we are using the __main__ module + frommod = self.pyapi.object_getattr(mod, self._freeze_string(name)) + + with cgutils.if_unlikely(self.builder, self.is_null(frommod)): + self.pyapi.raise_missing_global_error(name) + self.return_exception_raised() + + bbifmod = self.builder.basic_block + + builtin = self.builder.phi(self.pyapi.pyobj) + builtin.add_incoming(fromdict, bbifdict) + builtin.add_incoming(frommod, bbifmod) + + return builtin + + def check_occurred(self): + """ + Return if an exception occurred. + """ + err_occurred = cgutils.is_not_null(self.builder, + self.pyapi.err_occurred()) + + with cgutils.if_unlikely(self.builder, err_occurred): + self.return_exception_raised() + + def check_error(self, obj): + """ + Return if *obj* is NULL. + """ + with cgutils.if_unlikely(self.builder, self.is_null(obj)): + self.return_exception_raised() + + return obj + + def check_int_status(self, num, ok_value=0): + """ + Raise an exception if *num* is smaller than *ok_value*. + """ + ok = llvmlite.ir.Constant(num.type, ok_value) + pred = self.builder.icmp_signed('<', num, ok) + with cgutils.if_unlikely(self.builder, pred): + self.return_exception_raised() + + def is_null(self, obj): + return cgutils.is_null(self.builder, obj) + + def return_exception_raised(self): + """ + Return with the currently raised exception. + """ + self.cleanup_vars() + self.call_conv.return_exc(self.builder) + + def init_vars(self, block): + """ + Initialize live variables for *block*. 
+ """ + self._live_vars = set(self.func_ir.get_block_entry_vars(block)) + + def _getvar(self, name, ltype=None): + if name not in self.varmap: + self.varmap[name] = self.alloca(name, ltype=ltype) + return self.varmap[name] + + def loadvar(self, name): + """ + Load the llvm value of the variable named *name*. + """ + # If this raises then the live variables analysis is wrong + assert name in self._live_vars, name + ptr = self.varmap[name] + val = self.builder.load(ptr) + with cgutils.if_unlikely(self.builder, self.is_null(val)): + self.pyapi.raise_missing_name_error(name) + self.return_exception_raised() + return val + + def delvar(self, name): + """ + Delete the variable slot with the given name. This will decref + the corresponding Python object. + """ + # If this raises then the live variables analysis is wrong + self._live_vars.remove(name) + ptr = self._getvar(name) # initializes `name` if not already + self.decref(self.builder.load(ptr)) + # This is a safety guard against double decref's, but really + # the IR should be correct and have only one Del per variable + # and code path. + self.builder.store(cgutils.get_null_value(ptr.type.pointee), ptr) + + def storevar(self, value, name, clobber=False): + """ + Stores a llvm value and allocate stack slot if necessary. + The llvm value can be of arbitrary type. + """ + is_redefine = name in self._live_vars and not clobber + ptr = self._getvar(name, ltype=value.type) + if is_redefine: + old = self.builder.load(ptr) + else: + self._live_vars.add(name) + assert value.type == ptr.type.pointee, (str(value.type), + str(ptr.type.pointee)) + self.builder.store(value, ptr) + # Safe to call decref even on non python object + if is_redefine: + self.decref(old) + + def cleanup_vars(self): + """ + Cleanup live variables. + """ + for name in self._live_vars: + ptr = self._getvar(name) + self.decref(self.builder.load(ptr)) + + def alloca(self, name, ltype=None): + """ + Allocate a stack slot and initialize it to NULL. 
+ The default is to allocate a pyobject pointer. + Use ``ltype`` to override. + """ + if ltype is None: + ltype = self.context.get_value_type(types.pyobject) + with self.builder.goto_block(self.entry_block): + ptr = self.builder.alloca(ltype, name=name) + self.builder.store(cgutils.get_null_value(ltype), ptr) + return ptr + + def _alloca_var(self, name, fetype): + # This is here for API compatibility with lowering.py::Lower. + # NOTE: fetype is unused + return self.alloca(name) + + def incref(self, value): + self.pyapi.incref(value) + + def decref(self, value): + """ + This is allow to be called on non pyobject pointer, in which case + no code is inserted. + """ + lpyobj = self.context.get_value_type(types.pyobject) + if value.type == lpyobj: + self.pyapi.decref(value) + + def _freeze_string(self, string): + """ + Freeze a Python string object into the code. + """ + return self.lower_const(string) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/pythonapi.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/pythonapi.py new file mode 100644 index 0000000000000000000000000000000000000000..5ea6dc2220aea833c010f294befdd28ba31211e9 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/pythonapi.py @@ -0,0 +1,1686 @@ +from collections import namedtuple +import contextlib +import pickle +import hashlib +import sys + +from llvmlite import ir +from llvmlite.ir import Constant + +import ctypes +from numba import _helperlib +from numba.core import ( + types, utils, config, lowering, cgutils, imputils, serialize, +) + +PY_UNICODE_1BYTE_KIND = _helperlib.py_unicode_1byte_kind +PY_UNICODE_2BYTE_KIND = _helperlib.py_unicode_2byte_kind +PY_UNICODE_4BYTE_KIND = _helperlib.py_unicode_4byte_kind +PY_UNICODE_WCHAR_KIND = _helperlib.py_unicode_wchar_kind + + +class _Registry(object): + + def __init__(self): + self.functions = {} + + def register(self, typeclass): + assert issubclass(typeclass, types.Type) + def decorator(func): + if typeclass 
in self.functions: + raise KeyError("duplicate registration for %s" % (typeclass,)) + self.functions[typeclass] = func + return func + return decorator + + def lookup(self, typeclass, default=None): + assert issubclass(typeclass, types.Type) + for cls in typeclass.__mro__: + func = self.functions.get(cls) + if func is not None: + return func + return default + +# Registries of boxing / unboxing implementations +_boxers = _Registry() +_unboxers = _Registry() +_reflectors = _Registry() + +box = _boxers.register +unbox = _unboxers.register +reflect = _reflectors.register + +class _BoxContext(namedtuple("_BoxContext", + ("context", "builder", "pyapi", "env_manager"))): + """ + The facilities required by boxing implementations. + """ + __slots__ = () + + def box(self, typ, val): + return self.pyapi.from_native_value(typ, val, self.env_manager) + + +class _UnboxContext(namedtuple("_UnboxContext", + ("context", "builder", "pyapi"))): + """ + The facilities required by unboxing implementations. + """ + __slots__ = () + + def unbox(self, typ, obj): + return self.pyapi.to_native_value(typ, obj) + + +class _ReflectContext(namedtuple("_ReflectContext", + ("context", "builder", "pyapi", "env_manager", + "is_error"))): + """ + The facilities required by reflection implementations. + """ + __slots__ = () + + # XXX the error bit is currently unused by consumers (e.g. PyCallWrapper) + def set_error(self): + self.builder.store(self.is_error, cgutils.true_bit) + + def box(self, typ, val): + return self.pyapi.from_native_value(typ, val, self.env_manager) + + def reflect(self, typ, val): + return self.pyapi.reflect_native_value(typ, val, self.env_manager) + + +class NativeValue(object): + """ + Encapsulate the result of converting a Python object to a native value, + recording whether the conversion was successful and how to cleanup. 
+ """ + + def __init__(self, value, is_error=None, cleanup=None): + self.value = value + self.is_error = is_error if is_error is not None else cgutils.false_bit + self.cleanup = cleanup + + +class EnvironmentManager(object): + + def __init__(self, pyapi, env, env_body, env_ptr): + assert isinstance(env, lowering.Environment) + self.pyapi = pyapi + self.env = env + self.env_body = env_body + self.env_ptr = env_ptr + + def add_const(self, const): + """ + Add a constant to the environment, return its index. + """ + # All constants are frozen inside the environment + if isinstance(const, str): + const = sys.intern(const) + for index, val in enumerate(self.env.consts): + if val is const: + break + else: + index = len(self.env.consts) + self.env.consts.append(const) + return index + + def read_const(self, index): + """ + Look up constant number *index* inside the environment body. + A borrowed reference is returned. + + The returned LLVM value may have NULL value at runtime which indicates + an error at runtime. + """ + assert index < len(self.env.consts) + + builder = self.pyapi.builder + consts = self.env_body.consts + ret = cgutils.alloca_once(builder, self.pyapi.pyobj, zfill=True) + with builder.if_else(cgutils.is_not_null(builder, consts)) as \ + (br_not_null, br_null): + with br_not_null: + getitem = self.pyapi.list_getitem(consts, index) + builder.store(getitem, ret) + with br_null: + # This can happen when the Environment is accidentally released + # and has subsequently been garbage collected. + self.pyapi.err_set_string( + "PyExc_RuntimeError", + "`env.consts` is NULL in `read_const`", + ) + return builder.load(ret) + + +_IteratorLoop = namedtuple('_IteratorLoop', ('value', 'do_break')) + + +class PythonAPI(object): + """ + Code generation facilities to call into the CPython C API (and related + helpers). 
+ """ + + def __init__(self, context, builder): + """ + Note: Maybe called multiple times when lowering a function + """ + self.context = context + self.builder = builder + + self.module = builder.basic_block.function.module + # A unique mapping of serialized objects in this module + try: + self.module.__serialized + except AttributeError: + self.module.__serialized = {} + + # Initialize types + self.pyobj = self.context.get_argument_type(types.pyobject) + self.pyobjptr = self.pyobj.as_pointer() + self.voidptr = ir.PointerType(ir.IntType(8)) + self.long = ir.IntType(ctypes.sizeof(ctypes.c_long) * 8) + self.ulong = self.long + self.longlong = ir.IntType(ctypes.sizeof(ctypes.c_ulonglong) * 8) + self.ulonglong = self.longlong + self.double = ir.DoubleType() + self.py_ssize_t = self.context.get_value_type(types.intp) + self.cstring = ir.PointerType(ir.IntType(8)) + self.gil_state = ir.IntType(_helperlib.py_gil_state_size * 8) + self.py_buffer_t = ir.ArrayType(ir.IntType(8), _helperlib.py_buffer_size) + self.py_hash_t = self.py_ssize_t + self.py_unicode_1byte_kind = _helperlib.py_unicode_1byte_kind + self.py_unicode_2byte_kind = _helperlib.py_unicode_2byte_kind + self.py_unicode_4byte_kind = _helperlib.py_unicode_4byte_kind + self.py_unicode_wchar_kind = _helperlib.py_unicode_wchar_kind + + def get_env_manager(self, env, env_body, env_ptr): + return EnvironmentManager(self, env, env_body, env_ptr) + + def emit_environment_sentry(self, envptr, return_pyobject=False, + debug_msg=''): + """Emits LLVM code to ensure the `envptr` is not NULL + """ + is_null = cgutils.is_null(self.builder, envptr) + with cgutils.if_unlikely(self.builder, is_null): + if return_pyobject: + fnty = self.builder.function.type.pointee + assert fnty.return_type == self.pyobj + self.err_set_string( + "PyExc_RuntimeError", f"missing Environment: {debug_msg}", + ) + self.builder.ret(self.get_null_object()) + else: + self.context.call_conv.return_user_exc( + self.builder, RuntimeError, + (f"missing 
Environment: {debug_msg}",), + ) + + # ------ Python API ----- + + # + # Basic object API + # + + def incref(self, obj): + fnty = ir.FunctionType(ir.VoidType(), [self.pyobj]) + fn = self._get_function(fnty, name="Py_IncRef") + self.builder.call(fn, [obj]) + + def decref(self, obj): + fnty = ir.FunctionType(ir.VoidType(), [self.pyobj]) + fn = self._get_function(fnty, name="Py_DecRef") + self.builder.call(fn, [obj]) + + def get_type(self, obj): + fnty = ir.FunctionType(self.pyobj, [self.pyobj]) + fn = self._get_function(fnty, name="numba_py_type") + return self.builder.call(fn, [obj]) + + # + # Argument unpacking + # + + def parse_tuple_and_keywords(self, args, kws, fmt, keywords, *objs): + charptr = ir.PointerType(ir.IntType(8)) + charptrary = ir.PointerType(charptr) + argtypes = [self.pyobj, self.pyobj, charptr, charptrary] + fnty = ir.FunctionType(ir.IntType(32), argtypes, var_arg=True) + fn = self._get_function(fnty, name="PyArg_ParseTupleAndKeywords") + return self.builder.call(fn, [args, kws, fmt, keywords] + list(objs)) + + def parse_tuple(self, args, fmt, *objs): + charptr = ir.PointerType(ir.IntType(8)) + argtypes = [self.pyobj, charptr] + fnty = ir.FunctionType(ir.IntType(32), argtypes, var_arg=True) + fn = self._get_function(fnty, name="PyArg_ParseTuple") + return self.builder.call(fn, [args, fmt] + list(objs)) + + def unpack_tuple(self, args, name, n_min, n_max, *objs): + charptr = ir.PointerType(ir.IntType(8)) + argtypes = [self.pyobj, charptr, self.py_ssize_t, self.py_ssize_t] + fnty = ir.FunctionType(ir.IntType(32), argtypes, var_arg=True) + fn = self._get_function(fnty, name="PyArg_UnpackTuple") + n_min = Constant(self.py_ssize_t, int(n_min)) + n_max = Constant(self.py_ssize_t, int(n_max)) + if isinstance(name, str): + name = self.context.insert_const_string(self.builder.module, name) + return self.builder.call(fn, [args, name, n_min, n_max] + list(objs)) + + # + # Exception and errors + # + + def err_occurred(self): + fnty = 
ir.FunctionType(self.pyobj, ()) + fn = self._get_function(fnty, name="PyErr_Occurred") + return self.builder.call(fn, ()) + + def err_clear(self): + fnty = ir.FunctionType(ir.VoidType(), ()) + fn = self._get_function(fnty, name="PyErr_Clear") + return self.builder.call(fn, ()) + + def err_set_string(self, exctype, msg): + fnty = ir.FunctionType(ir.VoidType(), [self.pyobj, self.cstring]) + fn = self._get_function(fnty, name="PyErr_SetString") + if isinstance(exctype, str): + exctype = self.get_c_object(exctype) + if isinstance(msg, str): + msg = self.context.insert_const_string(self.module, msg) + return self.builder.call(fn, (exctype, msg)) + + def err_format(self, exctype, msg, *format_args): + fnty = ir.FunctionType(ir.VoidType(), [self.pyobj, self.cstring], var_arg=True) + fn = self._get_function(fnty, name="PyErr_Format") + if isinstance(exctype, str): + exctype = self.get_c_object(exctype) + if isinstance(msg, str): + msg = self.context.insert_const_string(self.module, msg) + return self.builder.call(fn, (exctype, msg) + tuple(format_args)) + + def raise_object(self, exc=None): + """ + Raise an arbitrary exception (type or value or (type, args) + or None - if reraising). A reference to the argument is consumed. 
+ """ + fnty = ir.FunctionType(ir.VoidType(), [self.pyobj]) + fn = self._get_function(fnty, name="numba_do_raise") + if exc is None: + exc = self.make_none() + return self.builder.call(fn, (exc,)) + + def err_set_object(self, exctype, excval): + fnty = ir.FunctionType(ir.VoidType(), [self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="PyErr_SetObject") + if isinstance(exctype, str): + exctype = self.get_c_object(exctype) + return self.builder.call(fn, (exctype, excval)) + + def err_set_none(self, exctype): + fnty = ir.FunctionType(ir.VoidType(), [self.pyobj]) + fn = self._get_function(fnty, name="PyErr_SetNone") + if isinstance(exctype, str): + exctype = self.get_c_object(exctype) + return self.builder.call(fn, (exctype,)) + + def err_write_unraisable(self, obj): + fnty = ir.FunctionType(ir.VoidType(), [self.pyobj]) + fn = self._get_function(fnty, name="PyErr_WriteUnraisable") + return self.builder.call(fn, (obj,)) + + def err_fetch(self, pty, pval, ptb): + fnty = ir.FunctionType(ir.VoidType(), [self.pyobjptr] * 3) + fn = self._get_function(fnty, name="PyErr_Fetch") + return self.builder.call(fn, (pty, pval, ptb)) + + def err_restore(self, ty, val, tb): + fnty = ir.FunctionType(ir.VoidType(), [self.pyobj] * 3) + fn = self._get_function(fnty, name="PyErr_Restore") + return self.builder.call(fn, (ty, val, tb)) + + @contextlib.contextmanager + def err_push(self, keep_new=False): + """ + Temporarily push the current error indicator while the code + block is executed. If *keep_new* is True and the code block + raises a new error, the new error is kept, otherwise the old + error indicator is restored at the end of the block. 
+ """ + pty, pval, ptb = [cgutils.alloca_once(self.builder, self.pyobj) + for i in range(3)] + self.err_fetch(pty, pval, ptb) + yield + ty = self.builder.load(pty) + val = self.builder.load(pval) + tb = self.builder.load(ptb) + if keep_new: + new_error = cgutils.is_not_null(self.builder, self.err_occurred()) + with self.builder.if_else(new_error, likely=False) as (if_error, if_ok): + with if_error: + # Code block raised an error, keep it + self.decref(ty) + self.decref(val) + self.decref(tb) + with if_ok: + # Restore previous error + self.err_restore(ty, val, tb) + else: + self.err_restore(ty, val, tb) + + def get_c_object(self, name): + """ + Get a Python object through its C-accessible *name* + (e.g. "PyExc_ValueError"). The underlying variable must be + a `PyObject *`, and the value of that pointer is returned. + """ + # A LLVM global variable is implicitly a pointer to the declared + # type, so fix up by using pyobj.pointee. + return self.context.get_c_value(self.builder, self.pyobj.pointee, name, + dllimport=True) + + def raise_missing_global_error(self, name): + msg = "global name '%s' is not defined" % name + cstr = self.context.insert_const_string(self.module, msg) + self.err_set_string("PyExc_NameError", cstr) + + def raise_missing_name_error(self, name): + msg = "name '%s' is not defined" % name + cstr = self.context.insert_const_string(self.module, msg) + self.err_set_string("PyExc_NameError", cstr) + + def fatal_error(self, msg): + fnty = ir.FunctionType(ir.VoidType(), [self.cstring]) + fn = self._get_function(fnty, name="Py_FatalError") + fn.attributes.add("noreturn") + cstr = self.context.insert_const_string(self.module, msg) + self.builder.call(fn, (cstr,)) + + # + # Concrete dict API + # + + def dict_getitem_string(self, dic, name): + """Lookup name inside dict + + Returns a borrowed reference + """ + fnty = ir.FunctionType(self.pyobj, [self.pyobj, self.cstring]) + fn = self._get_function(fnty, name="PyDict_GetItemString") + cstr = 
self.context.insert_const_string(self.module, name) + return self.builder.call(fn, [dic, cstr]) + + def dict_getitem(self, dic, name): + """Lookup name inside dict + + Returns a borrowed reference + """ + fnty = ir.FunctionType(self.pyobj, [self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="PyDict_GetItem") + return self.builder.call(fn, [dic, name]) + + def dict_new(self, presize=0): + if presize == 0: + fnty = ir.FunctionType(self.pyobj, ()) + fn = self._get_function(fnty, name="PyDict_New") + return self.builder.call(fn, ()) + else: + fnty = ir.FunctionType(self.pyobj, [self.py_ssize_t]) + fn = self._get_function(fnty, name="_PyDict_NewPresized") + return self.builder.call(fn, + [Constant(self.py_ssize_t, int(presize))]) + + def dict_setitem(self, dictobj, nameobj, valobj): + fnty = ir.FunctionType(ir.IntType(32), (self.pyobj, self.pyobj, + self.pyobj)) + fn = self._get_function(fnty, name="PyDict_SetItem") + return self.builder.call(fn, (dictobj, nameobj, valobj)) + + def dict_setitem_string(self, dictobj, name, valobj): + fnty = ir.FunctionType(ir.IntType(32), (self.pyobj, self.cstring, + self.pyobj)) + fn = self._get_function(fnty, name="PyDict_SetItemString") + cstr = self.context.insert_const_string(self.module, name) + return self.builder.call(fn, (dictobj, cstr, valobj)) + + def dict_pack(self, keyvalues): + """ + Args + ----- + keyvalues: iterable of (str, llvm.Value of PyObject*) + """ + dictobj = self.dict_new() + with self.if_object_ok(dictobj): + for k, v in keyvalues: + self.dict_setitem_string(dictobj, k, v) + return dictobj + + # + # Concrete number APIs + # + + def float_from_double(self, fval): + fnty = ir.FunctionType(self.pyobj, [self.double]) + fn = self._get_function(fnty, name="PyFloat_FromDouble") + return self.builder.call(fn, [fval]) + + def number_as_ssize_t(self, numobj): + fnty = ir.FunctionType(self.py_ssize_t, [self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="PyNumber_AsSsize_t") + # We don't want any 
clipping, so pass OverflowError as the 2nd arg + exc_class = self.get_c_object("PyExc_OverflowError") + return self.builder.call(fn, [numobj, exc_class]) + + def number_long(self, numobj): + fnty = ir.FunctionType(self.pyobj, [self.pyobj]) + fn = self._get_function(fnty, name="PyNumber_Long") + return self.builder.call(fn, [numobj]) + + def long_as_ulonglong(self, numobj): + fnty = ir.FunctionType(self.ulonglong, [self.pyobj]) + fn = self._get_function(fnty, name="PyLong_AsUnsignedLongLong") + return self.builder.call(fn, [numobj]) + + def long_as_longlong(self, numobj): + fnty = ir.FunctionType(self.ulonglong, [self.pyobj]) + fn = self._get_function(fnty, name="PyLong_AsLongLong") + return self.builder.call(fn, [numobj]) + + def long_as_voidptr(self, numobj): + """ + Convert the given Python integer to a void*. This is recommended + over number_as_ssize_t as it isn't affected by signedness. + """ + fnty = ir.FunctionType(self.voidptr, [self.pyobj]) + fn = self._get_function(fnty, name="PyLong_AsVoidPtr") + return self.builder.call(fn, [numobj]) + + def _long_from_native_int(self, ival, func_name, native_int_type, + signed): + fnty = ir.FunctionType(self.pyobj, [native_int_type]) + fn = self._get_function(fnty, name=func_name) + resptr = cgutils.alloca_once(self.builder, self.pyobj) + fn = self._get_function(fnty, name=func_name) + self.builder.store(self.builder.call(fn, [ival]), resptr) + + return self.builder.load(resptr) + + def long_from_long(self, ival): + func_name = "PyLong_FromLong" + fnty = ir.FunctionType(self.pyobj, [self.long]) + fn = self._get_function(fnty, name=func_name) + return self.builder.call(fn, [ival]) + + def long_from_ulong(self, ival): + return self._long_from_native_int(ival, "PyLong_FromUnsignedLong", + self.long, signed=False) + + def long_from_ssize_t(self, ival): + return self._long_from_native_int(ival, "PyLong_FromSsize_t", + self.py_ssize_t, signed=True) + + def long_from_longlong(self, ival): + return 
self._long_from_native_int(ival, "PyLong_FromLongLong", + self.longlong, signed=True) + + def long_from_ulonglong(self, ival): + return self._long_from_native_int(ival, "PyLong_FromUnsignedLongLong", + self.ulonglong, signed=False) + + def long_from_signed_int(self, ival): + """ + Return a Python integer from any native integer value. + """ + bits = ival.type.width + if bits <= self.long.width: + return self.long_from_long(self.builder.sext(ival, self.long)) + elif bits <= self.longlong.width: + return self.long_from_longlong(self.builder.sext(ival, self.longlong)) + else: + raise OverflowError("integer too big (%d bits)" % (bits)) + + def long_from_unsigned_int(self, ival): + """ + Same as long_from_signed_int, but for unsigned values. + """ + bits = ival.type.width + if bits <= self.ulong.width: + return self.long_from_ulong(self.builder.zext(ival, self.ulong)) + elif bits <= self.ulonglong.width: + return self.long_from_ulonglong(self.builder.zext(ival, self.ulonglong)) + else: + raise OverflowError("integer too big (%d bits)" % (bits)) + + def _get_number_operator(self, name): + fnty = ir.FunctionType(self.pyobj, [self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="PyNumber_%s" % name) + return fn + + def _call_number_operator(self, name, lhs, rhs, inplace=False): + if inplace: + name = "InPlace" + name + fn = self._get_number_operator(name) + return self.builder.call(fn, [lhs, rhs]) + + def number_add(self, lhs, rhs, inplace=False): + return self._call_number_operator("Add", lhs, rhs, inplace=inplace) + + def number_subtract(self, lhs, rhs, inplace=False): + return self._call_number_operator("Subtract", lhs, rhs, inplace=inplace) + + def number_multiply(self, lhs, rhs, inplace=False): + return self._call_number_operator("Multiply", lhs, rhs, inplace=inplace) + + def number_truedivide(self, lhs, rhs, inplace=False): + return self._call_number_operator("TrueDivide", lhs, rhs, inplace=inplace) + + def number_floordivide(self, lhs, rhs, inplace=False): 
+ return self._call_number_operator("FloorDivide", lhs, rhs, inplace=inplace) + + def number_remainder(self, lhs, rhs, inplace=False): + return self._call_number_operator("Remainder", lhs, rhs, inplace=inplace) + + def number_matrix_multiply(self, lhs, rhs, inplace=False): + return self._call_number_operator("MatrixMultiply", lhs, rhs, inplace=inplace) + + def number_lshift(self, lhs, rhs, inplace=False): + return self._call_number_operator("Lshift", lhs, rhs, inplace=inplace) + + def number_rshift(self, lhs, rhs, inplace=False): + return self._call_number_operator("Rshift", lhs, rhs, inplace=inplace) + + def number_and(self, lhs, rhs, inplace=False): + return self._call_number_operator("And", lhs, rhs, inplace=inplace) + + def number_or(self, lhs, rhs, inplace=False): + return self._call_number_operator("Or", lhs, rhs, inplace=inplace) + + def number_xor(self, lhs, rhs, inplace=False): + return self._call_number_operator("Xor", lhs, rhs, inplace=inplace) + + def number_power(self, lhs, rhs, inplace=False): + fnty = ir.FunctionType(self.pyobj, [self.pyobj] * 3) + fname = "PyNumber_InPlacePower" if inplace else "PyNumber_Power" + fn = self._get_function(fnty, fname) + return self.builder.call(fn, [lhs, rhs, self.borrow_none()]) + + def number_negative(self, obj): + fnty = ir.FunctionType(self.pyobj, [self.pyobj]) + fn = self._get_function(fnty, name="PyNumber_Negative") + return self.builder.call(fn, (obj,)) + + def number_positive(self, obj): + fnty = ir.FunctionType(self.pyobj, [self.pyobj]) + fn = self._get_function(fnty, name="PyNumber_Positive") + return self.builder.call(fn, (obj,)) + + def number_float(self, val): + fnty = ir.FunctionType(self.pyobj, [self.pyobj]) + fn = self._get_function(fnty, name="PyNumber_Float") + return self.builder.call(fn, [val]) + + def number_invert(self, obj): + fnty = ir.FunctionType(self.pyobj, [self.pyobj]) + fn = self._get_function(fnty, name="PyNumber_Invert") + return self.builder.call(fn, (obj,)) + + def 
float_as_double(self, fobj): + fnty = ir.FunctionType(self.double, [self.pyobj]) + fn = self._get_function(fnty, name="PyFloat_AsDouble") + return self.builder.call(fn, [fobj]) + + def bool_from_bool(self, bval): + """ + Get a Python bool from a LLVM boolean. + """ + longval = self.builder.zext(bval, self.long) + return self.bool_from_long(longval) + + def bool_from_long(self, ival): + fnty = ir.FunctionType(self.pyobj, [self.long]) + fn = self._get_function(fnty, name="PyBool_FromLong") + return self.builder.call(fn, [ival]) + + def complex_from_doubles(self, realval, imagval): + fnty = ir.FunctionType(self.pyobj, [ir.DoubleType(), ir.DoubleType()]) + fn = self._get_function(fnty, name="PyComplex_FromDoubles") + return self.builder.call(fn, [realval, imagval]) + + def complex_real_as_double(self, cobj): + fnty = ir.FunctionType(ir.DoubleType(), [self.pyobj]) + fn = self._get_function(fnty, name="PyComplex_RealAsDouble") + return self.builder.call(fn, [cobj]) + + def complex_imag_as_double(self, cobj): + fnty = ir.FunctionType(ir.DoubleType(), [self.pyobj]) + fn = self._get_function(fnty, name="PyComplex_ImagAsDouble") + return self.builder.call(fn, [cobj]) + + # + # Concrete slice API + # + def slice_as_ints(self, obj): + """ + Read the members of a slice of integers. + + Returns a (ok, start, stop, step) tuple where ok is a boolean and + the following members are pointer-sized ints. 
+ """ + pstart = cgutils.alloca_once(self.builder, self.py_ssize_t) + pstop = cgutils.alloca_once(self.builder, self.py_ssize_t) + pstep = cgutils.alloca_once(self.builder, self.py_ssize_t) + fnty = ir.FunctionType(ir.IntType(32), + [self.pyobj] + [self.py_ssize_t.as_pointer()] * 3) + fn = self._get_function(fnty, name="numba_unpack_slice") + res = self.builder.call(fn, (obj, pstart, pstop, pstep)) + start = self.builder.load(pstart) + stop = self.builder.load(pstop) + step = self.builder.load(pstep) + return cgutils.is_null(self.builder, res), start, stop, step + + # + # List and sequence APIs + # + + def sequence_getslice(self, obj, start, stop): + fnty = ir.FunctionType(self.pyobj, [self.pyobj, self.py_ssize_t, + self.py_ssize_t]) + fn = self._get_function(fnty, name="PySequence_GetSlice") + return self.builder.call(fn, (obj, start, stop)) + + def sequence_tuple(self, obj): + fnty = ir.FunctionType(self.pyobj, [self.pyobj]) + fn = self._get_function(fnty, name="PySequence_Tuple") + return self.builder.call(fn, [obj]) + + def sequence_concat(self, obj1, obj2): + fnty = ir.FunctionType(self.pyobj, [self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="PySequence_Concat") + return self.builder.call(fn, [obj1, obj2]) + + def list_new(self, szval): + fnty = ir.FunctionType(self.pyobj, [self.py_ssize_t]) + fn = self._get_function(fnty, name="PyList_New") + return self.builder.call(fn, [szval]) + + def list_size(self, lst): + fnty = ir.FunctionType(self.py_ssize_t, [self.pyobj]) + fn = self._get_function(fnty, name="PyList_Size") + return self.builder.call(fn, [lst]) + + def list_append(self, lst, val): + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="PyList_Append") + return self.builder.call(fn, [lst, val]) + + def list_setitem(self, lst, idx, val): + """ + Warning: Steals reference to ``val`` + """ + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, self.py_ssize_t, + self.pyobj]) + fn = 
self._get_function(fnty, name="PyList_SetItem") + return self.builder.call(fn, [lst, idx, val]) + + def list_getitem(self, lst, idx): + """ + Returns a borrowed reference. + """ + fnty = ir.FunctionType(self.pyobj, [self.pyobj, self.py_ssize_t]) + fn = self._get_function(fnty, name="PyList_GetItem") + if isinstance(idx, int): + idx = self.context.get_constant(types.intp, idx) + return self.builder.call(fn, [lst, idx]) + + def list_setslice(self, lst, start, stop, obj): + if obj is None: + obj = self.get_null_object() + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, self.py_ssize_t, + self.py_ssize_t, self.pyobj]) + fn = self._get_function(fnty, name="PyList_SetSlice") + return self.builder.call(fn, (lst, start, stop, obj)) + + + # + # Concrete tuple API + # + + def tuple_getitem(self, tup, idx): + """ + Borrow reference + """ + fnty = ir.FunctionType(self.pyobj, [self.pyobj, self.py_ssize_t]) + fn = self._get_function(fnty, name="PyTuple_GetItem") + idx = self.context.get_constant(types.intp, idx) + return self.builder.call(fn, [tup, idx]) + + def tuple_pack(self, items): + fnty = ir.FunctionType(self.pyobj, [self.py_ssize_t], var_arg=True) + fn = self._get_function(fnty, name="PyTuple_Pack") + n = self.context.get_constant(types.intp, len(items)) + args = [n] + args.extend(items) + return self.builder.call(fn, args) + + def tuple_size(self, tup): + fnty = ir.FunctionType(self.py_ssize_t, [self.pyobj]) + fn = self._get_function(fnty, name="PyTuple_Size") + return self.builder.call(fn, [tup]) + + def tuple_new(self, count): + fnty = ir.FunctionType(self.pyobj, [ir.IntType(32)]) + fn = self._get_function(fnty, name='PyTuple_New') + return self.builder.call(fn, [self.context.get_constant(types.int32, + count)]) + + def tuple_setitem(self, tuple_val, index, item): + """ + Steals a reference to `item`. 
+ """ + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, ir.IntType(32), self.pyobj]) + setitem_fn = self._get_function(fnty, name='PyTuple_SetItem') + index = self.context.get_constant(types.int32, index) + self.builder.call(setitem_fn, [tuple_val, index, item]) + + # + # Concrete set API + # + + def set_new(self, iterable=None): + if iterable is None: + iterable = self.get_null_object() + fnty = ir.FunctionType(self.pyobj, [self.pyobj]) + fn = self._get_function(fnty, name="PySet_New") + return self.builder.call(fn, [iterable]) + + def set_add(self, set, value): + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="PySet_Add") + return self.builder.call(fn, [set, value]) + + def set_clear(self, set): + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj]) + fn = self._get_function(fnty, name="PySet_Clear") + return self.builder.call(fn, [set]) + + def set_size(self, set): + fnty = ir.FunctionType(self.py_ssize_t, [self.pyobj]) + fn = self._get_function(fnty, name="PySet_Size") + return self.builder.call(fn, [set]) + + def set_update(self, set, iterable): + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="_PySet_Update") + return self.builder.call(fn, [set, iterable]) + + def set_next_entry(self, set, posptr, keyptr, hashptr): + fnty = ir.FunctionType(ir.IntType(32), + [self.pyobj, self.py_ssize_t.as_pointer(), + self.pyobj.as_pointer(), self.py_hash_t.as_pointer()]) + fn = self._get_function(fnty, name="_PySet_NextEntry") + return self.builder.call(fn, (set, posptr, keyptr, hashptr)) + + @contextlib.contextmanager + def set_iterate(self, set): + builder = self.builder + + hashptr = cgutils.alloca_once(builder, self.py_hash_t, name="hashptr") + keyptr = cgutils.alloca_once(builder, self.pyobj, name="keyptr") + posptr = cgutils.alloca_once_value(builder, + Constant(self.py_ssize_t, 0), + name="posptr") + + bb_body = builder.append_basic_block("bb_body") + 
bb_end = builder.append_basic_block("bb_end") + + builder.branch(bb_body) + def do_break(): + builder.branch(bb_end) + + with builder.goto_block(bb_body): + r = self.set_next_entry(set, posptr, keyptr, hashptr) + finished = cgutils.is_null(builder, r) + with builder.if_then(finished, likely=False): + builder.branch(bb_end) + yield _IteratorLoop(builder.load(keyptr), do_break) + builder.branch(bb_body) + + builder.position_at_end(bb_end) + + # + # GIL APIs + # + + def gil_ensure(self): + """ + Ensure the GIL is acquired. + The returned value must be consumed by gil_release(). + """ + gilptrty = ir.PointerType(self.gil_state) + fnty = ir.FunctionType(ir.VoidType(), [gilptrty]) + fn = self._get_function(fnty, "numba_gil_ensure") + gilptr = cgutils.alloca_once(self.builder, self.gil_state) + self.builder.call(fn, [gilptr]) + return gilptr + + def gil_release(self, gil): + """ + Release the acquired GIL by gil_ensure(). + Must be paired with a gil_ensure(). + """ + gilptrty = ir.PointerType(self.gil_state) + fnty = ir.FunctionType(ir.VoidType(), [gilptrty]) + fn = self._get_function(fnty, "numba_gil_release") + return self.builder.call(fn, [gil]) + + def save_thread(self): + """ + Release the GIL and return the former thread state + (an opaque non-NULL pointer). + """ + fnty = ir.FunctionType(self.voidptr, []) + fn = self._get_function(fnty, name="PyEval_SaveThread") + return self.builder.call(fn, []) + + def restore_thread(self, thread_state): + """ + Restore the given thread state by reacquiring the GIL. + """ + fnty = ir.FunctionType(ir.VoidType(), [self.voidptr]) + fn = self._get_function(fnty, name="PyEval_RestoreThread") + self.builder.call(fn, [thread_state]) + + # + # Generic object private data (a way of associating an arbitrary void * + # pointer to an arbitrary Python object). 
+ # + + def object_get_private_data(self, obj): + fnty = ir.FunctionType(self.voidptr, [self.pyobj]) + fn = self._get_function(fnty, name="numba_get_pyobject_private_data") + return self.builder.call(fn, (obj,)) + + def object_set_private_data(self, obj, ptr): + fnty = ir.FunctionType(ir.VoidType(), [self.pyobj, self.voidptr]) + fn = self._get_function(fnty, name="numba_set_pyobject_private_data") + return self.builder.call(fn, (obj, ptr)) + + def object_reset_private_data(self, obj): + fnty = ir.FunctionType(ir.VoidType(), [self.pyobj]) + fn = self._get_function(fnty, name="numba_reset_pyobject_private_data") + return self.builder.call(fn, (obj,)) + + + # + # Other APIs (organize them better!) + # + + def import_module_noblock(self, modname): + fnty = ir.FunctionType(self.pyobj, [self.cstring]) + fn = self._get_function(fnty, name="PyImport_ImportModuleNoBlock") + return self.builder.call(fn, [modname]) + + def call_function_objargs(self, callee, objargs): + fnty = ir.FunctionType(self.pyobj, [self.pyobj], var_arg=True) + fn = self._get_function(fnty, name="PyObject_CallFunctionObjArgs") + args = [callee] + list(objargs) + args.append(self.context.get_constant_null(types.pyobject)) + return self.builder.call(fn, args) + + def call_method(self, callee, method, objargs=()): + cname = self.context.insert_const_string(self.module, method) + fnty = ir.FunctionType(self.pyobj, [self.pyobj, self.cstring, self.cstring], + var_arg=True) + fn = self._get_function(fnty, name="PyObject_CallMethod") + fmt = 'O' * len(objargs) + cfmt = self.context.insert_const_string(self.module, fmt) + args = [callee, cname, cfmt] + if objargs: + args.extend(objargs) + args.append(self.context.get_constant_null(types.pyobject)) + return self.builder.call(fn, args) + + def call(self, callee, args=None, kws=None): + if args is None: + args = self.get_null_object() + if kws is None: + kws = self.get_null_object() + fnty = ir.FunctionType(self.pyobj, [self.pyobj] * 3) + fn = 
self._get_function(fnty, name="PyObject_Call") + return self.builder.call(fn, (callee, args, kws)) + + def object_type(self, obj): + """Emit a call to ``PyObject_Type(obj)`` to get the type of ``obj``. + """ + fnty = ir.FunctionType(self.pyobj, [self.pyobj]) + fn = self._get_function(fnty, name="PyObject_Type") + return self.builder.call(fn, (obj,)) + + def object_istrue(self, obj): + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj]) + fn = self._get_function(fnty, name="PyObject_IsTrue") + return self.builder.call(fn, [obj]) + + def object_not(self, obj): + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj]) + fn = self._get_function(fnty, name="PyObject_Not") + return self.builder.call(fn, [obj]) + + def object_richcompare(self, lhs, rhs, opstr): + """ + Refer to Python source Include/object.h for macros definition + of the opid. + """ + ops = ['<', '<=', '==', '!=', '>', '>='] + if opstr in ops: + opid = ops.index(opstr) + fnty = ir.FunctionType(self.pyobj, [self.pyobj, self.pyobj, ir.IntType(32)]) + fn = self._get_function(fnty, name="PyObject_RichCompare") + lopid = self.context.get_constant(types.int32, opid) + return self.builder.call(fn, (lhs, rhs, lopid)) + elif opstr == 'is': + bitflag = self.builder.icmp_unsigned('==', lhs, rhs) + return self.bool_from_bool(bitflag) + elif opstr == 'is not': + bitflag = self.builder.icmp_unsigned('!=', lhs, rhs) + return self.bool_from_bool(bitflag) + elif opstr in ('in', 'not in'): + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="PySequence_Contains") + status = self.builder.call(fn, (rhs, lhs)) + negone = self.context.get_constant(types.int32, -1) + is_good = self.builder.icmp_unsigned('!=', status, negone) + # Stack allocate output and initialize to Null + outptr = cgutils.alloca_once_value(self.builder, + Constant(self.pyobj, None)) + # If PySequence_Contains returns non-error value + with cgutils.if_likely(self.builder, is_good): + if opstr == 'not in': + 
status = self.builder.not_(status) + # Store the status as a boolean object + truncated = self.builder.trunc(status, ir.IntType(1)) + self.builder.store(self.bool_from_bool(truncated), + outptr) + + return self.builder.load(outptr) + else: + raise NotImplementedError("Unknown operator {op!r}".format( + op=opstr)) + + def iter_next(self, iterobj): + fnty = ir.FunctionType(self.pyobj, [self.pyobj]) + fn = self._get_function(fnty, name="PyIter_Next") + return self.builder.call(fn, [iterobj]) + + def object_getiter(self, obj): + fnty = ir.FunctionType(self.pyobj, [self.pyobj]) + fn = self._get_function(fnty, name="PyObject_GetIter") + return self.builder.call(fn, [obj]) + + def object_getattr_string(self, obj, attr): + cstr = self.context.insert_const_string(self.module, attr) + fnty = ir.FunctionType(self.pyobj, [self.pyobj, self.cstring]) + fn = self._get_function(fnty, name="PyObject_GetAttrString") + return self.builder.call(fn, [obj, cstr]) + + def object_getattr(self, obj, attr): + fnty = ir.FunctionType(self.pyobj, [self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="PyObject_GetAttr") + return self.builder.call(fn, [obj, attr]) + + def object_setattr_string(self, obj, attr, val): + cstr = self.context.insert_const_string(self.module, attr) + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, self.cstring, self.pyobj]) + fn = self._get_function(fnty, name="PyObject_SetAttrString") + return self.builder.call(fn, [obj, cstr, val]) + + def object_setattr(self, obj, attr, val): + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="PyObject_SetAttr") + return self.builder.call(fn, [obj, attr, val]) + + def object_delattr_string(self, obj, attr): + # PyObject_DelAttrString() is actually a C macro calling + # PyObject_SetAttrString() with value == NULL. 
+ return self.object_setattr_string(obj, attr, self.get_null_object()) + + def object_delattr(self, obj, attr): + # PyObject_DelAttr() is actually a C macro calling + # PyObject_SetAttr() with value == NULL. + return self.object_setattr(obj, attr, self.get_null_object()) + + def object_getitem(self, obj, key): + """ + Return obj[key] + """ + fnty = ir.FunctionType(self.pyobj, [self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="PyObject_GetItem") + return self.builder.call(fn, (obj, key)) + + def object_setitem(self, obj, key, val): + """ + obj[key] = val + """ + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="PyObject_SetItem") + return self.builder.call(fn, (obj, key, val)) + + def object_delitem(self, obj, key): + """ + del obj[key] + """ + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, self.pyobj]) + fn = self._get_function(fnty, name="PyObject_DelItem") + return self.builder.call(fn, (obj, key)) + + def string_as_string(self, strobj): + fnty = ir.FunctionType(self.cstring, [self.pyobj]) + fname = "PyUnicode_AsUTF8" + fn = self._get_function(fnty, name=fname) + return self.builder.call(fn, [strobj]) + + def string_as_string_and_size(self, strobj): + """ + Returns a tuple of ``(ok, buffer, length)``. + The ``ok`` is i1 value that is set if ok. + The ``buffer`` is a i8* of the output buffer. + The ``length`` is a i32/i64 (py_ssize_t) of the length of the buffer. 
+ """ + + p_length = cgutils.alloca_once(self.builder, self.py_ssize_t) + fnty = ir.FunctionType(self.cstring, [self.pyobj, + self.py_ssize_t.as_pointer()]) + fname = "PyUnicode_AsUTF8AndSize" + fn = self._get_function(fnty, name=fname) + + buffer = self.builder.call(fn, [strobj, p_length]) + ok = self.builder.icmp_unsigned('!=', + Constant(buffer.type, None), + buffer) + return (ok, buffer, self.builder.load(p_length)) + + def string_as_string_size_and_kind(self, strobj): + """ + Returns a tuple of ``(ok, buffer, length, kind, ascii, hash)``. + The ``ok`` is i1 value that is set if ok. + The ``buffer`` is a i8* of the output buffer. + The ``length`` is a i32/i64 (py_ssize_t) of the length of the buffer. + The ``kind`` is a i32 (int32) of the Unicode kind constant + The ``hash`` is a long/uint64_t (py_hash_t) of the Unicode constant hash + """ + p_length = cgutils.alloca_once(self.builder, self.py_ssize_t) + p_kind = cgutils.alloca_once(self.builder, ir.IntType(32)) + p_ascii = cgutils.alloca_once(self.builder, ir.IntType(32)) + p_hash = cgutils.alloca_once(self.builder, self.py_hash_t) + fnty = ir.FunctionType(self.cstring, [self.pyobj, + self.py_ssize_t.as_pointer(), + ir.IntType(32).as_pointer(), + ir.IntType(32).as_pointer(), + self.py_hash_t.as_pointer()]) + fname = "numba_extract_unicode" + fn = self._get_function(fnty, name=fname) + + buffer = self.builder.call( + fn, [strobj, p_length, p_kind, p_ascii, p_hash]) + ok = self.builder.icmp_unsigned('!=', + Constant(buffer.type, None), + buffer) + return (ok, buffer, self.builder.load(p_length), + self.builder.load(p_kind), self.builder.load(p_ascii), + self.builder.load(p_hash)) + + def string_from_string_and_size(self, string, size): + fnty = ir.FunctionType(self.pyobj, [self.cstring, self.py_ssize_t]) + fname = "PyString_FromStringAndSize" + fn = self._get_function(fnty, name=fname) + return self.builder.call(fn, [string, size]) + + def string_from_string(self, string): + fnty = ir.FunctionType(self.pyobj, [self.cstring]) 
+ fname = "PyUnicode_FromString" + fn = self._get_function(fnty, name=fname) + return self.builder.call(fn, [string]) + + def string_from_kind_and_data(self, kind, string, size): + fnty = ir.FunctionType(self.pyobj, [ir.IntType(32), self.cstring, self.py_ssize_t]) + fname = "PyUnicode_FromKindAndData" + fn = self._get_function(fnty, name=fname) + return self.builder.call(fn, [kind, string, size]) + + def bytes_from_string_and_size(self, string, size): + fnty = ir.FunctionType(self.pyobj, [self.cstring, self.py_ssize_t]) + fname = "PyBytes_FromStringAndSize" + fn = self._get_function(fnty, name=fname) + return self.builder.call(fn, [string, size]) + + def object_hash(self, obj): + fnty = ir.FunctionType(self.py_hash_t, [self.pyobj, ]) + fname = "PyObject_Hash" + fn = self._get_function(fnty, name=fname) + return self.builder.call(fn, [obj,]) + + def object_str(self, obj): + fnty = ir.FunctionType(self.pyobj, [self.pyobj]) + fn = self._get_function(fnty, name="PyObject_Str") + return self.builder.call(fn, [obj]) + + def make_none(self): + obj = self.borrow_none() + self.incref(obj) + return obj + + def borrow_none(self): + return self.get_c_object("_Py_NoneStruct") + + def sys_write_stdout(self, fmt, *args): + fnty = ir.FunctionType(ir.VoidType(), [self.cstring], var_arg=True) + fn = self._get_function(fnty, name="PySys_FormatStdout") + return self.builder.call(fn, (fmt,) + args) + + def object_dump(self, obj): + """ + Dump a Python object on C stderr. For debugging purposes. + """ + fnty = ir.FunctionType(ir.VoidType(), [self.pyobj]) + fn = self._get_function(fnty, name="_PyObject_Dump") + return self.builder.call(fn, (obj,)) + + # + # NRT (Numba runtime) APIs + # + + def nrt_adapt_ndarray_to_python(self, aryty, ary, dtypeptr): + assert self.context.enable_nrt, "NRT required" + + intty = ir.IntType(32) + # Embed the Python type of the array (maybe subclass) in the LLVM IR. 
+ serial_aryty_pytype = self.unserialize(self.serialize_object(aryty.box_type)) + + fnty = ir.FunctionType(self.pyobj, + [self.voidptr, self.pyobj, intty, intty, self.pyobj]) + fn = self._get_function(fnty, name="NRT_adapt_ndarray_to_python_acqref") + fn.args[0].add_attribute('nocapture') + + ndim = self.context.get_constant(types.int32, aryty.ndim) + writable = self.context.get_constant(types.int32, int(aryty.mutable)) + + aryptr = cgutils.alloca_once_value(self.builder, ary) + return self.builder.call(fn, [self.builder.bitcast(aryptr, + self.voidptr), + serial_aryty_pytype, + ndim, writable, dtypeptr]) + + def nrt_meminfo_new_from_pyobject(self, data, pyobj): + """ + Allocate a new MemInfo with data payload borrowed from a python + object. + """ + mod = self.builder.module + fnty = ir.FunctionType( + cgutils.voidptr_t, + [cgutils.voidptr_t, cgutils.voidptr_t], + ) + fn = cgutils.get_or_insert_function( + mod, + fnty, + "NRT_meminfo_new_from_pyobject", + ) + fn.args[0].add_attribute('nocapture') + fn.args[1].add_attribute('nocapture') + fn.return_value.add_attribute("noalias") + return self.builder.call(fn, [data, pyobj]) + + def nrt_meminfo_as_pyobject(self, miptr): + mod = self.builder.module + fnty = ir.FunctionType( + self.pyobj, + [cgutils.voidptr_t] + ) + fn = cgutils.get_or_insert_function( + mod, + fnty, + 'NRT_meminfo_as_pyobject', + ) + fn.return_value.add_attribute("noalias") + return self.builder.call(fn, [miptr]) + + def nrt_meminfo_from_pyobject(self, miobj): + mod = self.builder.module + fnty = ir.FunctionType( + cgutils.voidptr_t, + [self.pyobj] + ) + fn = cgutils.get_or_insert_function( + mod, + fnty, + 'NRT_meminfo_from_pyobject', + ) + fn.return_value.add_attribute("noalias") + return self.builder.call(fn, [miobj]) + + def nrt_adapt_ndarray_from_python(self, ary, ptr): + assert self.context.enable_nrt + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, self.voidptr]) + fn = self._get_function(fnty, name="NRT_adapt_ndarray_from_python") + 
fn.args[0].add_attribute('nocapture') + fn.args[1].add_attribute('nocapture') + return self.builder.call(fn, (ary, ptr)) + + def nrt_adapt_buffer_from_python(self, buf, ptr): + assert self.context.enable_nrt + fnty = ir.FunctionType(ir.VoidType(), [ir.PointerType(self.py_buffer_t), + self.voidptr]) + fn = self._get_function(fnty, name="NRT_adapt_buffer_from_python") + fn.args[0].add_attribute('nocapture') + fn.args[1].add_attribute('nocapture') + return self.builder.call(fn, (buf, ptr)) + + # ------ utils ----- + + def _get_function(self, fnty, name): + return cgutils.get_or_insert_function(self.module, fnty, name) + + def alloca_obj(self): + return self.builder.alloca(self.pyobj) + + def alloca_buffer(self): + """ + Return a pointer to a stack-allocated, zero-initialized Py_buffer. + """ + # Treat the buffer as an opaque array of bytes + ptr = cgutils.alloca_once_value(self.builder, + Constant(self.py_buffer_t, None)) + return ptr + + @contextlib.contextmanager + def if_object_ok(self, obj): + with cgutils.if_likely(self.builder, + cgutils.is_not_null(self.builder, obj)): + yield + + def print_object(self, obj): + strobj = self.object_str(obj) + cstr = self.string_as_string(strobj) + fmt = self.context.insert_const_string(self.module, "%s") + self.sys_write_stdout(fmt, cstr) + self.decref(strobj) + + def print_string(self, text): + fmt = self.context.insert_const_string(self.module, text) + self.sys_write_stdout(fmt) + + def get_null_object(self): + return Constant(self.pyobj, None) + + def return_none(self): + none = self.make_none() + self.builder.ret(none) + + def list_pack(self, items): + n = len(items) + seq = self.list_new(self.context.get_constant(types.intp, n)) + with self.if_object_ok(seq): + for i in range(n): + idx = self.context.get_constant(types.intp, i) + self.incref(items[i]) + self.list_setitem(seq, idx, items[i]) + return seq + + def unserialize(self, structptr): + """ + Unserialize some data. 
*structptr* should be a pointer to + a {i8* data, i32 length, i8* hashbuf} structure. + """ + fnty = ir.FunctionType(self.pyobj, + (self.voidptr, ir.IntType(32), self.voidptr)) + fn = self._get_function(fnty, name="numba_unpickle") + ptr = self.builder.extract_value(self.builder.load(structptr), 0) + n = self.builder.extract_value(self.builder.load(structptr), 1) + hashed = self.builder.extract_value(self.builder.load(structptr), 2) + return self.builder.call(fn, (ptr, n, hashed)) + + def serialize_uncached(self, obj): + """ + Same as serialize_object(), but don't create a global variable, + simply return a literal {i8* data, i32 length, i8* hashbuf} structure. + """ + # First make the array constant + data = serialize.dumps(obj) + assert len(data) < 2**31 + name = ".const.pickledata.%s" % (id(obj) if config.DIFF_IR == 0 else "DIFF_IR") + bdata = cgutils.make_bytearray(data) + # Make SHA1 hash on the pickled content + # NOTE: update buffer size in numba_unpickle() when changing the + # hash algorithm. + hashed = cgutils.make_bytearray(hashlib.sha1(data).digest()) + arr = self.context.insert_unique_const(self.module, name, bdata) + hasharr = self.context.insert_unique_const( + self.module, f"{name}.sha1", hashed, + ) + # Then populate the structure constant + struct = Constant.literal_struct([ + arr.bitcast(self.voidptr), + Constant(ir.IntType(32), arr.type.pointee.count), + hasharr.bitcast(self.voidptr), + ]) + return struct + + def serialize_object(self, obj): + """ + Serialize the given object in the bitcode, and return it + as a pointer to a {i8* data, i32 length, i8* hashbuf} structure constant + (suitable for passing to unserialize()). + """ + try: + gv = self.module.__serialized[obj] + except KeyError: + struct = self.serialize_uncached(obj) + name = ".const.picklebuf.%s" % (id(obj) if config.DIFF_IR == 0 else "DIFF_IR") + gv = self.context.insert_unique_const(self.module, name, struct) + # Make the id() (and hence the name) unique while populating the module. 
+ self.module.__serialized[obj] = gv + return gv + + def c_api_error(self): + return cgutils.is_not_null(self.builder, self.err_occurred()) + + def to_native_value(self, typ, obj): + """ + Unbox the Python object as the given Numba type. + A NativeValue instance is returned. + """ + from numba.core.boxing import unbox_unsupported + + impl = _unboxers.lookup(typ.__class__, unbox_unsupported) + c = _UnboxContext(self.context, self.builder, self) + return impl(typ, obj, c) + + def from_native_return(self, typ, val, env_manager): + assert not isinstance(typ, types.Optional), "callconv should have " \ + "prevented the return of " \ + "optional value" + out = self.from_native_value(typ, val, env_manager) + return out + + def from_native_value(self, typ, val, env_manager=None): + """ + Box the native value of the given Numba type. A Python object + pointer is returned (NULL if an error occurred). + This method steals any native (NRT) reference embedded in *val*. + """ + from numba.core.boxing import box_unsupported + + impl = _boxers.lookup(typ.__class__, box_unsupported) + + c = _BoxContext(self.context, self.builder, self, env_manager) + return impl(typ, val, c) + + def reflect_native_value(self, typ, val, env_manager=None): + """ + Reflect the native value onto its Python original, if any. + An error bit (as an LLVM value) is returned. + """ + impl = _reflectors.lookup(typ.__class__) + if impl is None: + # Reflection isn't needed for most types + return cgutils.false_bit + + is_error = cgutils.alloca_once_value(self.builder, cgutils.false_bit) + c = _ReflectContext(self.context, self.builder, self, env_manager, + is_error) + impl(typ, val, c) + return self.builder.load(c.is_error) + + def to_native_generator(self, obj, typ): + """ + Extract the generator structure pointer from a generator *obj* + (a _dynfunc.Generator instance). 
+ """ + gen_ptr_ty = ir.PointerType(self.context.get_data_type(typ)) + value = self.context.get_generator_state(self.builder, obj, gen_ptr_ty) + return NativeValue(value) + + def from_native_generator(self, val, typ, env=None): + """ + Make a Numba generator (a _dynfunc.Generator instance) from a + generator structure pointer *val*. + *env* is an optional _dynfunc.Environment instance to be wrapped + in the generator. + """ + llty = self.context.get_data_type(typ) + assert not llty.is_pointer + gen_struct_size = self.context.get_abi_sizeof(llty) + + gendesc = self.context.get_generator_desc(typ) + + # This is the PyCFunctionWithKeywords generated by PyCallWrapper + genfnty = ir.FunctionType(self.pyobj, [self.pyobj, self.pyobj, self.pyobj]) + genfn = self._get_function(genfnty, name=gendesc.llvm_cpython_wrapper_name) + + # This is the raw finalizer generated by _lower_generator_finalize_func() + finalizerty = ir.FunctionType(ir.VoidType(), [self.voidptr]) + if typ.has_finalizer: + finalizer = self._get_function(finalizerty, name=gendesc.llvm_finalizer_name) + else: + finalizer = Constant(ir.PointerType(finalizerty), None) + + # PyObject *numba_make_generator(state_size, initial_state, nextfunc, finalizer, env) + fnty = ir.FunctionType(self.pyobj, [self.py_ssize_t, + self.voidptr, + ir.PointerType(genfnty), + ir.PointerType(finalizerty), + self.voidptr]) + fn = self._get_function(fnty, name="numba_make_generator") + + state_size = Constant(self.py_ssize_t, gen_struct_size) + initial_state = self.builder.bitcast(val, self.voidptr) + if env is None: + env = self.get_null_object() + env = self.builder.bitcast(env, self.voidptr) + + return self.builder.call(fn, + (state_size, initial_state, genfn, finalizer, env)) + + def numba_array_adaptor(self, ary, ptr): + assert not self.context.enable_nrt + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, self.voidptr]) + fn = self._get_function(fnty, name="numba_adapt_ndarray") + fn.args[0].add_attribute('nocapture') + 
fn.args[1].add_attribute('nocapture') + return self.builder.call(fn, (ary, ptr)) + + def numba_buffer_adaptor(self, buf, ptr): + fnty = ir.FunctionType(ir.VoidType(), + [ir.PointerType(self.py_buffer_t), self.voidptr]) + fn = self._get_function(fnty, name="numba_adapt_buffer") + fn.args[0].add_attribute('nocapture') + fn.args[1].add_attribute('nocapture') + return self.builder.call(fn, (buf, ptr)) + + def complex_adaptor(self, cobj, cmplx): + fnty = ir.FunctionType(ir.IntType(32), [self.pyobj, cmplx.type]) + fn = self._get_function(fnty, name="numba_complex_adaptor") + return self.builder.call(fn, [cobj, cmplx]) + + def extract_record_data(self, obj, pbuf): + fnty = ir.FunctionType(self.voidptr, + [self.pyobj, ir.PointerType(self.py_buffer_t)]) + fn = self._get_function(fnty, name="numba_extract_record_data") + return self.builder.call(fn, [obj, pbuf]) + + def get_buffer(self, obj, pbuf): + fnty = ir.FunctionType(ir.IntType(32), + [self.pyobj, ir.PointerType(self.py_buffer_t)]) + fn = self._get_function(fnty, name="numba_get_buffer") + return self.builder.call(fn, [obj, pbuf]) + + def release_buffer(self, pbuf): + fnty = ir.FunctionType(ir.VoidType(), [ir.PointerType(self.py_buffer_t)]) + fn = self._get_function(fnty, name="numba_release_buffer") + return self.builder.call(fn, [pbuf]) + + def extract_np_datetime(self, obj): + fnty = ir.FunctionType(ir.IntType(64), [self.pyobj]) + fn = self._get_function(fnty, name="numba_extract_np_datetime") + return self.builder.call(fn, [obj]) + + def extract_np_timedelta(self, obj): + fnty = ir.FunctionType(ir.IntType(64), [self.pyobj]) + fn = self._get_function(fnty, name="numba_extract_np_timedelta") + return self.builder.call(fn, [obj]) + + def create_np_datetime(self, val, unit_code): + unit_code = Constant(ir.IntType(32), int(unit_code)) + fnty = ir.FunctionType(self.pyobj, [ir.IntType(64), ir.IntType(32)]) + fn = self._get_function(fnty, name="numba_create_np_datetime") + return self.builder.call(fn, [val, unit_code]) + + 
def create_np_timedelta(self, val, unit_code): + unit_code = Constant(ir.IntType(32), int(unit_code)) + fnty = ir.FunctionType(self.pyobj, [ir.IntType(64), ir.IntType(32)]) + fn = self._get_function(fnty, name="numba_create_np_timedelta") + return self.builder.call(fn, [val, unit_code]) + + def recreate_record(self, pdata, size, dtype, env_manager): + fnty = ir.FunctionType(self.pyobj, [ir.PointerType(ir.IntType(8)), + ir.IntType(32), self.pyobj]) + fn = self._get_function(fnty, name="numba_recreate_record") + dtypeaddr = env_manager.read_const(env_manager.add_const(dtype)) + return self.builder.call(fn, [pdata, size, dtypeaddr]) + + def string_from_constant_string(self, string): + cstr = self.context.insert_const_string(self.module, string) + sz = self.context.get_constant(types.intp, len(string)) + return self.string_from_string_and_size(cstr, sz) + + def call_jit_code(self, func, sig, args): + """Calls into Numba jitted code and propagate error using the Python + calling convention. + + Parameters + ---------- + func : function + The Python function to be compiled. This function is compiled + in nopython-mode. + sig : numba.typing.Signature + The function signature for *func*. + args : Sequence[llvmlite.binding.Value] + LLVM values to use as arguments. + + Returns + ------- + (is_error, res) : 2-tuple of llvmlite.binding.Value. + is_error : true iff *func* raised an exception. + res : Returned value from *func* iff *is_error* is false. + + If *is_error* is true, this method will adapt the nopython exception + into a Python exception. Caller should return NULL to Python to + indicate an error. + """ + # Compile *func* + builder = self.builder + cres = self.context.compile_subroutine(builder, func, sig) + got_retty = cres.signature.return_type + retty = sig.return_type + if got_retty != retty: + # This error indicates an error in *func* or the caller of this + # method. 
+ raise errors.LoweringError( + f'mismatching signature {got_retty} != {retty}.\n' + ) + # Call into *func* + status, res = self.context.call_internal_no_propagate( + builder, cres.fndesc, sig, args, + ) + # Post-call handling for *func* + is_error_ptr = cgutils.alloca_once(builder, cgutils.bool_t, zfill=True) + res_type = self.context.get_value_type(sig.return_type) + res_ptr = cgutils.alloca_once(builder, res_type, zfill=True) + + # Handle error and adapt the nopython exception into cpython exception + with builder.if_else(status.is_error) as (has_err, no_err): + with has_err: + builder.store(status.is_error, is_error_ptr) + # Set error state in the Python interpreter + self.context.call_conv.raise_error(builder, self, status) + with no_err: + # Handle returned value + res = imputils.fix_returning_optional( + self.context, builder, sig, status, res, + ) + builder.store(res, res_ptr) + + is_error = builder.load(is_error_ptr) + res = builder.load(res_ptr) + return is_error, res + + +class ObjModeUtils: + """Internal utils for calling objmode dispatcher from within NPM code. + """ + def __init__(self, pyapi): + self.pyapi = pyapi + + def load_dispatcher(self, fnty, argtypes): + builder = self.pyapi.builder + tyctx = self.pyapi.context + m = builder.module + + # Add a global variable to cache the objmode dispatcher + gv = ir.GlobalVariable( + m, self.pyapi.pyobj, + name=m.get_unique_name("cached_objmode_dispatcher"), + ) + gv.initializer = gv.type.pointee(None) + gv.linkage = 'internal' + + # Make a basic-block to common exit + bb_end = builder.append_basic_block("bb_end") + + if serialize.is_serialiable(fnty.dispatcher): + serialized_dispatcher = self.pyapi.serialize_object( + (fnty.dispatcher, tuple(argtypes)), + ) + compile_args = self.pyapi.unserialize(serialized_dispatcher) + # unserialize (unpickling) can fail + failed_unser = cgutils.is_null(builder, compile_args) + with builder.if_then(failed_unser): + # early exit. `gv` is still null. 
+ builder.branch(bb_end) + + cached = builder.load(gv) + with builder.if_then(cgutils.is_null(builder, cached)): + if serialize.is_serialiable(fnty.dispatcher): + cls = type(self) + compiler = self.pyapi.unserialize( + self.pyapi.serialize_object(cls._call_objmode_dispatcher) + ) + callee = self.pyapi.call_function_objargs( + compiler, [compile_args], + ) + # Clean up + self.pyapi.decref(compiler) + self.pyapi.decref(compile_args) + else: + entry_pt = fnty.dispatcher.compile(tuple(argtypes)) + callee = tyctx.add_dynamic_addr( + builder, id(entry_pt), info="with_objectmode", + ) + # Incref the dispatcher and cache it + self.pyapi.incref(callee) + builder.store(callee, gv) + # Jump to the exit block + builder.branch(bb_end) + # Define the exit block + builder.position_at_end(bb_end) + callee = builder.load(gv) + return callee + + @staticmethod + def _call_objmode_dispatcher(compile_args): + dispatcher, argtypes = compile_args + entrypt = dispatcher.compile(argtypes) + return entrypt diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/registry.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..eef492c40a4053f550bf04754347f795bb2662cb --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/registry.py @@ -0,0 +1,112 @@ +import contextlib + +from numba.core.descriptors import TargetDescriptor +from numba.core import utils, typing, dispatcher, cpu + +# ----------------------------------------------------------------------------- +# Default CPU target descriptors + +class _NestedContext(object): + _typing_context = None + _target_context = None + + @contextlib.contextmanager + def nested(self, typing_context, target_context): + old_nested = self._typing_context, self._target_context + try: + self._typing_context = typing_context + self._target_context = target_context + yield + finally: + self._typing_context, self._target_context = old_nested + + +class 
CPUTarget(TargetDescriptor): + options = cpu.CPUTargetOptions + _nested = _NestedContext() + + @utils.cached_property + def _toplevel_target_context(self): + # Lazily-initialized top-level target context, for all threads + return cpu.CPUContext(self.typing_context, self._target_name) + + @utils.cached_property + def _toplevel_typing_context(self): + # Lazily-initialized top-level typing context, for all threads + return typing.Context() + + @property + def target_context(self): + """ + The target context for CPU targets. + """ + nested = self._nested._target_context + if nested is not None: + return nested + else: + return self._toplevel_target_context + + @property + def typing_context(self): + """ + The typing context for CPU targets. + """ + nested = self._nested._typing_context + if nested is not None: + return nested + else: + return self._toplevel_typing_context + + def nested_context(self, typing_context, target_context): + """ + A context manager temporarily replacing the contexts with the + given ones, for the current thread of execution. + """ + return self._nested.nested(typing_context, target_context) + + +# The global CPU target +cpu_target = CPUTarget('cpu') + + +class CPUDispatcher(dispatcher.Dispatcher): + targetdescr = cpu_target + + +class DelayedRegistry(utils.UniqueDict): + """ + A unique dictionary but with deferred initialisation of the values. + + Attributes + ---------- + ondemand: + + A dictionary of key -> value, where value is executed + the first time it is used. It is used for part of a deferred + initialization strategy. 
+ """ + def __init__(self, *args, **kws): + self.ondemand = utils.UniqueDict() + self.key_type = kws.pop('key_type', None) + self.value_type = kws.pop('value_type', None) + self._type_check = self.key_type or self.value_type + super(DelayedRegistry, self).__init__(*args, **kws) + + def __getitem__(self, item): + if item in self.ondemand: + self[item] = self.ondemand[item]() + del self.ondemand[item] + return super(DelayedRegistry, self).__getitem__(item) + + def __setitem__(self, key, value): + if self._type_check: + def check(x, ty_x): + if isinstance(ty_x, type): + assert ty_x in x.__mro__, (x, ty_x) + else: + assert isinstance(x, ty_x), (x, ty_x) + if self.key_type is not None: + check(key, self.key_type) + if self.value_type is not None: + check(value, self.value_type) + return super(DelayedRegistry, self).__setitem__(key, value) diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/removerefctpass.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/removerefctpass.py new file mode 100644 index 0000000000000000000000000000000000000000..98c04869a0bc5988dc936b920340a43ea06b438c --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/removerefctpass.py @@ -0,0 +1,120 @@ +""" +Implement a rewrite pass on a LLVM module to remove unnecessary +refcount operations. +""" + +from llvmlite.ir.transforms import CallVisitor + +from numba.core import types + + +class _MarkNrtCallVisitor(CallVisitor): + """ + A pass to mark all NRT_incref and NRT_decref. 
+ """ + def __init__(self): + self.marked = set() + + def visit_Call(self, instr): + if getattr(instr.callee, 'name', '') in _accepted_nrtfns: + self.marked.add(instr) + + +def _rewrite_function(function): + # Mark NRT usage + markpass = _MarkNrtCallVisitor() + markpass.visit_Function(function) + # Remove NRT usage + for bb in function.basic_blocks: + for inst in list(bb.instructions): + if inst in markpass.marked: + bb.instructions.remove(inst) + + +_accepted_nrtfns = 'NRT_incref', 'NRT_decref' + + +def _legalize(module, dmm, fndesc): + """ + Legalize the code in the module. + Returns True if the module is legal for the rewrite pass that removes + unnecessary refcounts. + """ + + def valid_output(ty): + """ + Valid output are any type that does not need refcount + """ + model = dmm[ty] + return not model.contains_nrt_meminfo() + + def valid_input(ty): + """ + Valid input are any type that does not need refcount except Array. + """ + return valid_output(ty) or isinstance(ty, types.Array) + + + # Ensure no reference to function marked as + # "numba_args_may_always_need_nrt" + try: + nmd = module.get_named_metadata("numba_args_may_always_need_nrt") + except KeyError: + # Nothing marked + pass + else: + # Has functions marked as "numba_args_may_always_need_nrt" + if len(nmd.operands) > 0: + # The pass is illegal for this compilation unit. 
+ return False + + # More legalization based on function type + argtypes = fndesc.argtypes + restype = fndesc.restype + calltypes = fndesc.calltypes + + # Legalize function arguments + for argty in argtypes: + if not valid_input(argty): + return False + + # Legalize function return + if not valid_output(restype): + return False + + # Legalize all called functions + for callty in calltypes.values(): + if callty is not None and not valid_output(callty.return_type): + return False + + # Ensure no allocation + for fn in module.functions: + if fn.name.startswith("NRT_"): + if fn.name not in _accepted_nrtfns: + return False + + return True + + +def remove_unnecessary_nrt_usage(function, context, fndesc): + """ + Remove unnecessary NRT incref/decref in the given LLVM function. + It uses high-level type info to determine if the function does not need NRT. + Such a function does not: + + - return array object(s); + - take arguments that need refcounting except array; + - call function(s) that return refcounted object. + + In effect, the function will not capture or create references that extend + the lifetime of any refcounted objects beyond the lifetime of the function. + + The rewrite is performed in place. + If rewrite has happened, this function returns True, otherwise, it returns False. + """ + dmm = context.data_model_manager + if _legalize(function.module, dmm, fndesc): + _rewrite_function(function) + return True + else: + return False diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/retarget.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/retarget.py new file mode 100644 index 0000000000000000000000000000000000000000..8fa98c78bca31bd72c361e2d8c5316b93ce33044 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/retarget.py @@ -0,0 +1,135 @@ +""" +Implement utils for supporting retargeting of dispatchers. + +WARNING: Features defined in this file are experimental. The API may change + without notice. 
+""" +import abc +import weakref + +from numba.core import errors + + +class RetargetCache: + """Cache for retargeted dispatchers. + + The cache uses the original dispatcher as the key. + """ + container_type = weakref.WeakKeyDictionary + + def __init__(self): + self._cache = self.container_type() + self._stat_hit = 0 + self._stat_miss = 0 + + def save_cache(self, orig_disp, new_disp): + """Save a dispatcher associated with the given key. + """ + self._cache[orig_disp] = new_disp + + def load_cache(self, orig_disp): + """Load a dispatcher associated with the given key. + """ + out = self._cache.get(orig_disp) + if out is None: + self._stat_miss += 1 + else: + self._stat_hit += 1 + return out + + def items(self): + """Returns the contents of the cache. + """ + return self._cache.items() + + def stats(self): + """Returns stats regarding cache hit/miss. + """ + return {'hit': self._stat_hit, 'miss': self._stat_miss} + + +class BaseRetarget(abc.ABC): + """Abstract base class for retargeting logic. + """ + @abc.abstractmethod + def check_compatible(self, orig_disp): + """Check that the retarget is compatible. + + This method does not return anything meaningful (e.g. None). + Incompatibility is signalled via raising an exception. + """ + pass + + @abc.abstractmethod + def retarget(self, orig_disp): + """Retargets the given dispatcher and returns a new dispatcher-like + callable. Or, returns the original dispatcher if the target_backend + will not change. + """ + pass + + +class BasicRetarget(BaseRetarget): + """A basic retargeting implementation for a single output target. + + This class has two abstract methods/properties that subclasses must define. + + - `output_target` must return output target name. + - `compile_retarget` must define the logic to retarget the given dispatcher. + + By default, this class uses `RetargetCache` as the internal cache. This + can be modified by overriding the `.cache_type` class attribute. 
+ + """ + cache_type = RetargetCache + + def __init__(self): + self.cache = self.cache_type() + + @abc.abstractproperty + def output_target(self) -> str: + """Returns the output target name. + + See numba/tests/test_retargeting.py for example usage. + """ + pass + + @abc.abstractmethod + def compile_retarget(self, orig_disp): + """Returns the retargeted dispatcher. + + See numba/tests/test_retargeting.py for example usage. + """ + pass + + def check_compatible(self, orig_disp): + """ + This implementation checks that + `self.output_target == orig_disp._required_target_backend` + """ + required_target = orig_disp._required_target_backend + output_target = self.output_target + if required_target is not None: + if output_target != required_target: + m = ("The output target does match the required target: " + f"{output_target} != {required_target}.") + raise errors.CompilerError(m) + + def retarget(self, orig_disp): + """Apply retargeting to orig_disp. + + The retargeted dispatchers are cached for future use. + """ + cache = self.cache + opts = orig_disp.targetoptions + # Skip if the original dispatcher is targeting the same output target + if opts.get('target_backend') == self.output_target: + return orig_disp + cached = cache.load_cache(orig_disp) + # No cache? + if cached is None: + out = self.compile_retarget(orig_disp) + cache.save_cache(orig_disp, out) + else: + out = cached + return out diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/rewrites/__init__.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/rewrites/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..61f55a8ecde8df1f3f8fef4533a9fdff6762b45c --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/rewrites/__init__.py @@ -0,0 +1,8 @@ +""" +A subpackage hosting Numba IR rewrite passes. 
@register_rewrite('before-inference')
class RewritePrintCalls(Rewrite):
    """
    Replace calls to the global print() function with dedicated IR
    print() nodes.
    """

    def match(self, func_ir, block, typemap, calltypes):
        """
        Collect every assignment in *block* whose right-hand side is a call
        to the builtin print(); return True when at least one was found.

        Raises UnsupportedError if such a call uses keyword arguments.
        """
        found = {}
        self.prints = found
        self.block = block
        for assign in block.find_insts(ir.Assign):
            rhs = assign.value
            if not (isinstance(rhs, ir.Expr) and rhs.op == 'call'):
                continue
            try:
                callee = func_ir.infer_constant(rhs.func)
            except errors.ConstantInferenceError:
                # The callee is not a known constant, so it is not print().
                continue
            if callee is not print:
                continue
            if rhs.kws:
                # Only positional args are supported
                msg = ("Numba's print() function implementation does not "
                       "support keyword arguments.")
                raise errors.UnsupportedError(msg, assign.loc)
            found[assign] = rhs
        return bool(found)

    def apply(self):
        """
        Replace each matched `var = call (...)` with the pair
        `print(...)` and `var = const(None)`; all other statements are
        copied over unchanged.
        """
        rewritten = self.block.copy()
        rewritten.clear()
        for stmt in self.block.body:
            if stmt not in self.prints:
                rewritten.append(stmt)
                continue
            call = self.prints[stmt]
            rewritten.append(
                ir.Print(args=call.args, vararg=call.vararg, loc=call.loc))
            rewritten.append(
                ir.Assign(value=ir.Const(None, loc=call.loc),
                          target=stmt.target,
                          loc=stmt.loc))
        return rewritten


@register_rewrite('before-inference')
class DetectConstPrintArguments(Rewrite):
    """
    Find print() nodes with constant arguments and record those constants
    on the node.
    """

    def match(self, func_ir, block, typemap, calltypes):
        """
        For every print node in *block* that has no constants recorded yet,
        map argument index -> inferred constant.  Returns True when any
        constant was found.
        """
        detected = {}
        self.consts = detected
        self.block = block
        for node in block.find_insts(ir.Print):
            if node.consts:
                # Already rewritten
                continue
            for position, arg in enumerate(node.args):
                try:
                    value = func_ir.infer_constant(arg)
                except errors.ConstantInferenceError:
                    continue
                detected.setdefault(node, {})[position] = value

        return bool(detected)

    def apply(self):
        """
        Attach the constants found by match() to their print nodes and
        return the (same) block.
        """
        for stmt in self.block.body:
            if stmt not in self.consts:
                continue
            stmt.consts = self.consts[stmt]
        return self.block
class RewriteRegistry(object):
    '''Defines a registry for Numba rewrites.

    Rewrite classes are grouped by *kind*, one of the names in ``_kinds``.
    '''
    # The only kinds accepted by register() and apply().
    _kinds = frozenset(['before-inference', 'after-inference'])

    def __init__(self):
        '''Constructor for the rewrite registry.  Initializes the rewrites
        member to an empty mapping from kind to a list of rewrite classes.
        '''
        self.rewrites = defaultdict(list)

    def register(self, kind):
        """
        Decorator adding a subclass of Rewrite to the registry for
        the given *kind*.

        Raises KeyError if *kind* is not a known kind, and TypeError (when
        the decorator is applied) if the decorated class is not a subclass
        of Rewrite.  The decorated class is returned unchanged.
        """
        if kind not in self._kinds:
            raise KeyError("invalid kind %r" % (kind,))
        def do_register(rewrite_cls):
            if not issubclass(rewrite_cls, Rewrite):
                raise TypeError('{0} is not a subclass of Rewrite'.format(
                    rewrite_cls))
            self.rewrites[kind].append(rewrite_cls)
            return rewrite_cls
        return do_register

    def apply(self, kind, state):
        '''Exhaustively attempt to apply all rewrites registered for *kind*
        to all basic blocks of ``state.func_ir``.

        *state* must provide ``func_ir``, ``typemap`` and ``calltypes``.
        Blocks are replaced in ``state.func_ir.blocks`` in place; nothing is
        returned.
        '''
        assert kind in self._kinds
        blocks = state.func_ir.blocks
        # Snapshot of the mapping, used below to detect replaced blocks.
        old_blocks = blocks.copy()
        for rewrite_cls in self.rewrites[kind]:
            # Exhaustively apply a rewrite until it stops matching.
            rewrite = rewrite_cls(state)
            work_list = list(blocks.items())
            while work_list:
                key, block = work_list.pop()
                matches = rewrite.match(state.func_ir, block, state.typemap,
                                        state.calltypes)
                if matches:
                    if config.DEBUG or config.DUMP_IR:
                        print("_" * 70)
                        print("REWRITING (%s):" % rewrite_cls.__name__)
                        block.dump()
                        print("_" * 60)
                    new_block = rewrite.apply()
                    blocks[key] = new_block
                    # Re-queue the rewritten block so the same rewrite gets
                    # another chance to match on its output.
                    work_list.append((key, new_block))
                    if config.DEBUG or config.DUMP_IR:
                        new_block.dump()
                        print("_" * 70)
        # If any blocks were changed, perform a sanity check.
        for key, block in blocks.items():
            if block != old_blocks[key]:
                block.verify()

        # Some passes, e.g. _inline_const_arraycall are known to occasionally
        # do invalid things WRT ir.Del, others, e.g. RewriteArrayExprs do valid
        # things with ir.Del, but the placement is not optimal. The lines below
        # fix-up the IR so that ref counts are valid and optimally placed,
        # see #4093 for context. This has to be run here opposed to in
        # apply() as the CFG needs computing so full IR is needed.
        from numba.core import postproc
        post_proc = postproc.PostProcessor(state.func_ir)
        post_proc.run()
@register_rewrite('before-inference')
class RewriteConstGetitems(Rewrite):
    """
    Turn IR expressions of the form `getitem(value=arr, index=$constXX)`,
    where `$constXX` can be inferred as a constant, into
    `static_getitem(value=arr, index=<constant value>)`.
    """

    def match(self, func_ir, block, typemap, calltypes):
        """
        Record each getitem expression in *block* whose index is an
        inferable constant; return True when at least one was recorded.
        """
        constant_indices = {}
        self.getitems = constant_indices
        self.block = block
        for candidate in block.find_exprs(op='getitem'):
            if candidate.op != 'getitem':
                continue
            try:
                index_value = func_ir.infer_constant(candidate.index)
            except errors.ConstantInferenceError:
                # Index is not a compile-time constant; leave it alone.
                continue
            constant_indices[candidate] = index_value

        return bool(constant_indices)

    def apply(self):
        """
        Build a new block in which every matched getitem assignment is
        replaced by the equivalent static_getitem assignment.
        """
        rewritten = self.block.copy()
        rewritten.clear()
        for stmt in self.block.body:
            if isinstance(stmt, ir.Assign) and stmt.value in self.getitems:
                old_expr = stmt.value
                static_expr = ir.Expr.static_getitem(
                    value=old_expr.value,
                    index=self.getitems[old_expr],
                    index_var=old_expr.index,
                    loc=old_expr.loc)
                stmt = ir.Assign(value=static_expr, target=stmt.target,
                                 loc=stmt.loc)
            rewritten.append(stmt)
        return rewritten
@register_rewrite('after-inference')
class RewriteStringLiteralSetitems(Rewrite):
    """
    Turn setitem statements whose index `$XX` is typed as a StringLiteral
    into static_setitem statements that carry the literal string as the
    index.
    """

    def match(self, func_ir, block, typemap, calltypes):
        """
        Record every setitem in *block* whose index type is a
        StringLiteral; return True when at least one was recorded.
        """
        literal_setitems = {}
        self.setitems = literal_setitems
        self.block = block
        self.calltypes = calltypes
        for stmt in block.find_insts(ir.SetItem):
            index_type = typemap[stmt.index.name]
            if not isinstance(index_type, types.StringLiteral):
                continue
            literal_setitems[stmt] = (stmt.index, index_type.literal_value)

        return bool(literal_setitems)

    def apply(self):
        """
        Build a new block in which each matched setitem is replaced by a
        static_setitem indexed by the literal string value.  The call type
        recorded for the old statement is copied to the new one.
        """
        rewritten = ir.Block(self.block.scope, self.block.loc)
        for stmt in self.block.body:
            if isinstance(stmt, ir.SetItem) and stmt in self.setitems:
                _, literal = self.setitems[stmt]
                static_stmt = ir.StaticSetItem(target=stmt.target,
                                               index=literal,
                                               index_var=stmt.index,
                                               value=stmt.value,
                                               loc=stmt.loc)
                self.calltypes[static_stmt] = self.calltypes[stmt]
                stmt = static_stmt
            rewritten.append(stmt)
        return rewritten
@register_rewrite('before-inference')
class RewriteConstRaises(Rewrite):
    """
    Rewrite IR statements of the kind `raise(value)`
    where `value` is the result of instantiating an exception with
    constant arguments
    into `static_raise(exception_type, constant args)`.

    This allows lowering in nopython mode, where one can't instantiate
    exception instances from runtime data.
    """

    def _is_exception_type(self, const):
        """Return True if *const* is a class deriving from Exception."""
        if not isinstance(const, type):
            return False
        return issubclass(const, Exception)

    def _break_constant(self, const, loc):
        """
        Split a constant exception into ``(exception class, args)``.

        *const* is either an exception class (args is then None) or a tuple
        ``(exception class, args)``.  Anything else raises
        UnsupportedError located at *loc*.
        """
        if isinstance(const, tuple):  # it's a tuple(exception class, args)
            exc_cls = const[0]
            if not self._is_exception_type(exc_cls):
                msg = "Encountered unsupported exception constant %r"
                raise errors.UnsupportedError(msg % (exc_cls,), loc)
            return exc_cls, tuple(const[1])
        if self._is_exception_type(const):
            return const, None
        if isinstance(const, str):
            msg = ("Directly raising a string constant as an exception is "
                   "not supported.")
        else:
            msg = "Encountered unsupported constant type used for exception"
        raise errors.UnsupportedError(msg, loc)

    def match(self, func_ir, block, typemap, calltypes):
        """
        Record, for every raise / try-raise statement in *block*, the
        constant exception class and arguments it raises (both None for a
        bare re-raise).  Returns True when any statement was recorded.
        """
        raise_map = {}
        tryraise_map = {}
        self.raises = raise_map
        self.tryraises = tryraise_map
        self.block = block
        for inst in block.find_insts((ir.Raise, ir.TryRaise)):
            exc = inst.exception
            if exc is None:
                # bare re-raise: nothing to break down
                exc_type, exc_args = None, None
            else:
                # raise <something> => find the definition site for it
                const = func_ir.infer_constant(exc)
                exc_type, exc_args = self._break_constant(const, exc.loc)
            if isinstance(inst, ir.Raise):
                raise_map[inst] = exc_type, exc_args
            elif isinstance(inst, ir.TryRaise):
                tryraise_map[inst] = exc_type, exc_args
            else:
                raise ValueError('unexpected: {}'.format(type(inst)))
        return (len(raise_map) + len(tryraise_map)) > 0

    def apply(self):
        """
        Build a new block in which matched raise statements become
        static raises and matched try-raise statements become static
        try-raises; everything else is copied unchanged.
        """
        rewritten = self.block.copy()
        rewritten.clear()
        for inst in self.block.body:
            if inst in self.raises:
                exc_type, exc_args = self.raises[inst]
                inst = ir.StaticRaise(exc_type, exc_args, inst.loc)
            elif inst in self.tryraises:
                exc_type, exc_args = self.tryraises[inst]
                inst = ir.StaticTryRaise(exc_type, exc_args, inst.loc)
            rewritten.append(inst)
        return rewritten
/*
 * tp_init for MemInfoObject: wrap an existing NRT_MemInfo.
 *
 * Accepts a single argument "ptr" (positional or keyword): a Python integer
 * holding the address of the NRT_MemInfo.  The pointer is stored as-is; no
 * NRT reference is acquired here.
 *
 * Returns 0 on success, -1 with a Python exception set on failure.
 */
static
int MemInfo_init(MemInfoObject *self, PyObject *args, PyObject *kwds) {
    static char *keywords[] = {"ptr", NULL};
    PyObject *raw_ptr_obj;
    void *raw_ptr;
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O", keywords, &raw_ptr_obj)) {
        return -1;
    }
    raw_ptr = PyLong_AsVoidPtr(raw_ptr_obj);
    NRT_Debug(nrt_debug_print("MemInfo_init self=%p raw_ptr=%p\n", self, raw_ptr));

    /* PyLong_AsVoidPtr reports failure through the error indicator. */
    if(PyErr_Occurred()) return -1;
    self->meminfo = (NRT_MemInfo *)raw_ptr;
    /* Debug-build sanity check: the wrapped MemInfo must still be alive. */
    assert (NRT_MemInfo_refcount(self->meminfo) > 0 && "0 refcount");
    return 0;
}
/*
 * Getter for the "refcount" attribute.
 *
 * Returns the MemInfo reference count as a Python integer, or raises
 * ValueError when NRT_MemInfo_refcount() reports (size_t)-1.
 */
static
PyObject*
MemInfo_get_refcount(MemInfoObject *self, void *closure) {
    size_t refct = NRT_MemInfo_refcount(self->meminfo);
    /* (size_t)-1 is treated as the "invalid MemInfo" marker. */
    if ( refct == (size_t)-1 ) {
        PyErr_SetString(PyExc_ValueError, "invalid MemInfo");
        return NULL;
    }
    return PyLong_FromSize_t(refct);
}
0, /* tp_call*/ + 0, /* tp_str*/ + 0, /* tp_getattro*/ + 0, /* tp_setattro*/ + &MemInfo_bufferProcs, /* tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags*/ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + MemInfo_methods, /* tp_methods */ + 0, /* tp_members */ + MemInfo_getsets, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)MemInfo_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ +}; + + +/* +Return a MemInfo* as a MemInfoObject* +The NRT reference to the MemInfo is borrowed. +*/ +NUMBA_EXPORT_FUNC(MemInfoObject*) +NRT_meminfo_as_pyobject(NRT_MemInfo *meminfo) { + MemInfoObject *mi; + PyObject *addr; + + addr = PyLong_FromVoidPtr(meminfo); + if (!addr) return NULL; + mi = (MemInfoObject*)PyObject_CallFunctionObjArgs((PyObject *)&MemInfoType, addr, NULL); + Py_DECREF(addr); + if (!mi) return NULL; + return mi; +} + + +/* +Return a MemInfo* from a MemInfoObject* +A new reference is returned. 
/*
 * Return the array's parent object when it is still an exact description
 * of `arystruct`, so that boxing can hand back the original ndarray instead
 * of building a new one.
 *
 * The parent qualifies only if it is a NumPy array with the same data
 * pointer, the same number of dimensions, a dtype equal to `descr`, and
 * identical shape and strides.
 *
 * Returns a new reference to the parent on success, or NULL when the
 * caller has to construct a fresh array.
 */
static
PyObject* try_to_return_parent(arystruct_t *arystruct, int ndim,
                               PyArray_Descr *descr)
{
    int i;
    PyArrayObject *array = (PyArrayObject *)arystruct->parent;

    if (!PyArray_Check(arystruct->parent))
        /* Parent is a generic buffer-providing object */
        goto RETURN_ARRAY_COPY;

    if (PyArray_DATA(array) != arystruct->data)
        goto RETURN_ARRAY_COPY;

    if (PyArray_NDIM(array) != ndim)
        goto RETURN_ARRAY_COPY;

    /* NOTE(review): a -1 (error) result is handled like "not equal"; the
     * Python error indicator appears to be left set in that case - confirm
     * whether it should be cleared here. */
    if (PyObject_RichCompareBool((PyObject *) PyArray_DESCR(array),
                                 (PyObject *) descr, Py_EQ) <= 0)
        goto RETURN_ARRAY_COPY;

    /* shape_and_strides holds `ndim` extents followed by `ndim` strides. */
    for(i = 0; i < ndim; ++i) {
        if (PyArray_DIMS(array)[i] != arystruct->shape_and_strides[i])
            goto RETURN_ARRAY_COPY;
        if (PyArray_STRIDES(array)[i] != arystruct->shape_and_strides[ndim + i])
            goto RETURN_ARRAY_COPY;
    }

    /* Yes, it is the same array
       Return new reference */
    Py_INCREF((PyObject *)array);
    return (PyObject *)array;

RETURN_ARRAY_COPY:
    return NULL;
}
boxing of ndarray type. + * `arystruct` is a structure containing essential information from the + * unboxed array. + * `retty` is the subtype of the NumPy PyArray_Type this function should return. + * This is related to `numba.core.types.Array.box_type`. + * `ndim` is the number of dimension of the array. + * `writeable` corresponds to the "writable" flag in NumPy ndarray. + * `descr` is the NumPy data type description. + * + * This function was renamed in 0.52.0 to specify that it acquires references. + * It used to steal the reference of the arystruct. + * Refer to https://github.com/numba/numba/pull/6446 + */ +NUMBA_EXPORT_FUNC(PyObject *) +NRT_adapt_ndarray_to_python_acqref(arystruct_t* arystruct, PyTypeObject *retty, + int ndim, int writeable, PyArray_Descr *descr) +{ + PyArrayObject *array; + MemInfoObject *miobj = NULL; + PyObject *args; + npy_intp *shape, *strides; + int flags = 0; + + if (descr == NULL) { + PyErr_Format(PyExc_RuntimeError, + "In 'NRT_adapt_ndarray_to_python', 'descr' is NULL"); + return NULL; + } + + if (!NUMBA_PyArray_DescrCheck(descr)) { + PyErr_Format(PyExc_TypeError, + "expected dtype object, got '%.200s'", + Py_TYPE(descr)->tp_name); + return NULL; + } + + if (arystruct->parent) { + PyObject *obj = try_to_return_parent(arystruct, ndim, descr); + if (obj) { + return obj; + } + } + + if (arystruct->meminfo) { + /* wrap into MemInfoObject */ + miobj = PyObject_New(MemInfoObject, &MemInfoType); + args = PyTuple_New(1); + /* SETITEM steals reference */ + PyTuple_SET_ITEM(args, 0, PyLong_FromVoidPtr(arystruct->meminfo)); + NRT_Debug(nrt_debug_print("NRT_adapt_ndarray_to_python arystruct->meminfo=%p\n", arystruct->meminfo)); + /* Note: MemInfo_init() does not incref. This function steals the + * NRT reference, which we need to acquire. 
+ */ + NRT_Debug(nrt_debug_print("NRT_adapt_ndarray_to_python_acqref created MemInfo=%p\n", miobj)); + NRT_MemInfo_acquire(arystruct->meminfo); + if (MemInfo_init(miobj, args, NULL)) { + NRT_Debug(nrt_debug_print("MemInfo_init failed.\n")); + return NULL; + } + Py_DECREF(args); + } + + shape = arystruct->shape_and_strides; + strides = shape + ndim; + Py_INCREF((PyObject *) descr); + array = (PyArrayObject *) PyArray_NewFromDescr(retty, descr, ndim, + shape, strides, arystruct->data, + flags, (PyObject *) miobj); + + if (array == NULL) + return NULL; + + /* Set writable */ +#if NPY_API_VERSION >= 0x00000007 + if (writeable) { + PyArray_ENABLEFLAGS(array, NPY_ARRAY_WRITEABLE); + } + else { + PyArray_CLEARFLAGS(array, NPY_ARRAY_WRITEABLE); + } +#else + if (writeable) { + array->flags |= NPY_WRITEABLE; + } + else { + array->flags &= ~NPY_WRITEABLE; + } +#endif + + if (miobj) { + /* Set the MemInfoObject as the base object */ +#if NPY_API_VERSION >= 0x00000007 + if (-1 == PyArray_SetBaseObject(array, + (PyObject *) miobj)) + { + Py_DECREF(array); + Py_DECREF(miobj); + return NULL; + } +#else + PyArray_BASE(array) = (PyObject *) miobj; +#endif + + } + return (PyObject *) array; +} + +NUMBA_EXPORT_FUNC(void) +NRT_adapt_buffer_from_python(Py_buffer *buf, arystruct_t *arystruct) +{ + int i; + npy_intp *p; + + if (buf->obj) { + /* Allocate new MemInfo only if the buffer has a parent */ + arystruct->meminfo = NRT_meminfo_new_from_pyobject((void*)buf->buf, buf->obj); + } + arystruct->data = buf->buf; + arystruct->itemsize = buf->itemsize; + arystruct->parent = buf->obj; + arystruct->nitems = 1; + p = arystruct->shape_and_strides; + for (i = 0; i < buf->ndim; i++, p++) { + *p = buf->shape[i]; + arystruct->nitems *= buf->shape[i]; + } + for (i = 0; i < buf->ndim; i++, p++) { + *p = buf->strides[i]; + } +} + + +/* Initialization subroutines for modules including this source file */ + +static int +init_nrt_python_module(PyObject *module) +{ + MemInfoType.tp_new = 
PyType_GenericNew; + if (PyType_Ready(&MemInfoType)) + return -1; + return 0; +} diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/_nrt_pythonmod.c b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/_nrt_pythonmod.c new file mode 100644 index 0000000000000000000000000000000000000000..3552c7095770ef6db9050db8324caf4fe63f20c8 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/_nrt_pythonmod.c @@ -0,0 +1,207 @@ +#define NUMBA_EXPORT_FUNC(_rettype) static _rettype +#define NUMBA_EXPORT_DATA(_vartype) static _vartype + +#include "_nrt_python.c" + +static PyObject * +memsys_shutdown(PyObject *self, PyObject *args) { + NRT_MemSys_shutdown(); + Py_RETURN_NONE; +} + +static PyObject * +memsys_use_cpython_allocator(PyObject *self, PyObject *args) { + NRT_MemSys_set_allocator(PyMem_RawMalloc, + PyMem_RawRealloc, + PyMem_RawFree); + Py_RETURN_NONE; +} + +static PyObject * +memsys_set_atomic_inc_dec(PyObject *self, PyObject *args) { + PyObject *addr_inc_obj, *addr_dec_obj; + void *addr_inc, *addr_dec; + if (!PyArg_ParseTuple(args, "OO", &addr_inc_obj, &addr_dec_obj)) { + return NULL; + } + addr_inc = PyLong_AsVoidPtr(addr_inc_obj); + if(PyErr_Occurred()) return NULL; + addr_dec = PyLong_AsVoidPtr(addr_dec_obj); + if(PyErr_Occurred()) return NULL; + NRT_MemSys_set_atomic_inc_dec(addr_inc, addr_dec); + Py_RETURN_NONE; +} + +static PyObject * +memsys_set_atomic_cas(PyObject *self, PyObject *args) { + PyObject *addr_cas_obj; + void *addr_cas; + if (!PyArg_ParseTuple(args, "O", &addr_cas_obj)) { + return NULL; + } + addr_cas = PyLong_AsVoidPtr(addr_cas_obj); + if(PyErr_Occurred()) return NULL; + NRT_MemSys_set_atomic_cas(addr_cas); + Py_RETURN_NONE; +} + +static PyObject * +memsys_get_stats_alloc(PyObject *self, PyObject *args) { + return PyLong_FromSize_t(NRT_MemSys_get_stats_alloc()); +} + +static PyObject * +memsys_get_stats_free(PyObject *self, PyObject *args) { + return 
PyLong_FromSize_t(NRT_MemSys_get_stats_free()); +} + +static PyObject * +memsys_get_stats_mi_alloc(PyObject *self, PyObject *args) { + return PyLong_FromSize_t(NRT_MemSys_get_stats_mi_alloc()); +} + +static PyObject * +memsys_get_stats_mi_free(PyObject *self, PyObject *args) { + return PyLong_FromSize_t(NRT_MemSys_get_stats_mi_free()); +} + + +/* + * Create a new MemInfo with a owner PyObject + */ +static PyObject * +meminfo_new(PyObject *self, PyObject *args) { + PyObject *addr_data_obj; + void *addr_data; + PyObject *ownerobj; + NRT_MemInfo *mi; + if (!PyArg_ParseTuple(args, "OO", &addr_data_obj, &ownerobj)) { + return NULL; + } + addr_data = PyLong_AsVoidPtr(addr_data_obj); + if (PyErr_Occurred()) + return NULL; + mi = NRT_meminfo_new_from_pyobject(addr_data, ownerobj); + return PyLong_FromVoidPtr(mi); +} + +/* + * Create a new MemInfo with a new NRT allocation + */ +static PyObject * +meminfo_alloc(PyObject *self, PyObject *args) { + NRT_MemInfo *mi; + Py_ssize_t size; + if (!PyArg_ParseTuple(args, "n", &size)) { + return NULL; + } + mi = NRT_MemInfo_alloc(size); + return PyLong_FromVoidPtr(mi); +} + +/* + * Like meminfo_alloc but set memory to zero after allocation and before + * deallocation. 
+ */ +static PyObject * +meminfo_alloc_safe(PyObject *self, PyObject *args) { + NRT_MemInfo *mi; + Py_ssize_t size; + if (!PyArg_ParseTuple(args, "n", &size)) { + return NULL; + } + mi = NRT_MemInfo_alloc_safe(size); + return PyLong_FromVoidPtr(mi); +} + +static PyMethodDef ext_methods[] = { +#define declmethod(func) { #func , ( PyCFunction )func , METH_VARARGS , NULL } +#define declmethod_noargs(func) { #func , ( PyCFunction )func , METH_NOARGS, NULL } + declmethod_noargs(memsys_use_cpython_allocator), + declmethod_noargs(memsys_shutdown), + declmethod(memsys_set_atomic_inc_dec), + declmethod(memsys_set_atomic_cas), + declmethod_noargs(memsys_get_stats_alloc), + declmethod_noargs(memsys_get_stats_free), + declmethod_noargs(memsys_get_stats_mi_alloc), + declmethod_noargs(memsys_get_stats_mi_free), + declmethod(meminfo_new), + declmethod(meminfo_alloc), + declmethod(meminfo_alloc_safe), + { NULL }, +#undef declmethod +}; + + + +static PyObject * +build_c_helpers_dict(void) +{ + PyObject *dct = PyDict_New(); + if (dct == NULL) + goto error; + +#define _declpointer(name, value) do { \ + PyObject *o = PyLong_FromVoidPtr(value); \ + if (o == NULL) goto error; \ + if (PyDict_SetItemString(dct, name, o)) { \ + Py_DECREF(o); \ + goto error; \ + } \ + Py_DECREF(o); \ +} while (0) + +#define declmethod(func) _declpointer(#func, &NRT_##func) +#define declmethod_internal(func) _declpointer(#func, &func) + +declmethod(adapt_ndarray_from_python); +declmethod(adapt_ndarray_to_python_acqref); +declmethod(adapt_buffer_from_python); +declmethod(meminfo_new_from_pyobject); +declmethod(meminfo_as_pyobject); +declmethod(meminfo_from_pyobject); +declmethod(MemInfo_alloc); +declmethod(MemInfo_alloc_safe); +declmethod(MemInfo_alloc_aligned); +declmethod(MemInfo_alloc_safe_aligned); +declmethod(MemInfo_alloc_safe_aligned_external); +declmethod_internal(_nrt_get_sample_external_allocator); +declmethod(MemInfo_alloc_dtor_safe); +declmethod(MemInfo_call_dtor); 
+declmethod(MemInfo_new_varsize); +declmethod(MemInfo_new_varsize_dtor); +declmethod(MemInfo_varsize_alloc); +declmethod(MemInfo_data); +declmethod(MemInfo_varsize_free); +declmethod(MemInfo_varsize_realloc); +declmethod(MemInfo_release); +declmethod(Allocate); +declmethod(Free); +declmethod(get_api); + + +#undef declmethod +#undef declmethod_internal + return dct; +error: + Py_XDECREF(dct); + return NULL; +} + +MOD_INIT(_nrt_python) { + PyObject *m; + MOD_DEF(m, "_nrt_python", "No docs", ext_methods) + if (m == NULL) + return MOD_ERROR_VAL; + import_array(); + NRT_MemSys_init(); + if (init_nrt_python_module(m)) + return MOD_ERROR_VAL; + + Py_INCREF(&MemInfoType); + PyModule_AddObject(m, "_MemInfo", (PyObject *) (&MemInfoType)); + + PyModule_AddObject(m, "c_helpers", build_c_helpers_dict()); + + return MOD_SUCCESS_VAL(m); +} diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/context.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/context.py new file mode 100644 index 0000000000000000000000000000000000000000..9b73bbcf6d2a8a456183dcaf591ad4f8dfd0562d --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/context.py @@ -0,0 +1,401 @@ +import functools + +from llvmlite import ir + +from numba.core import types, cgutils, errors + + +class NRTContext(object): + """ + An object providing access to NRT APIs in the lowering pass. + """ + + def __init__(self, context, enabled): + self._context = context + self._enabled = enabled + + def _require_nrt(self): + if not self._enabled: + raise errors.NumbaRuntimeError("NRT required but not enabled") + + def _check_null_result(func): + @functools.wraps(func) + def wrap(self, builder, *args, **kwargs): + memptr = func(self, builder, *args, **kwargs) + msg = "Allocation failed (probably too large)." 
+ cgutils.guard_memory_error(self._context, builder, memptr, msg=msg) + return memptr + return wrap + + @_check_null_result + def allocate(self, builder, size): + """ + Low-level allocate a new memory area of `size` bytes. The result of the + call is checked and if it is NULL, i.e. allocation failed, then a + MemoryError is raised. + """ + return self.allocate_unchecked(builder, size) + + def allocate_unchecked(self, builder, size): + """ + Low-level allocate a new memory area of `size` bytes. Returns NULL to + indicate error/failure to allocate. + """ + self._require_nrt() + + mod = builder.module + fnty = ir.FunctionType(cgutils.voidptr_t, [cgutils.intp_t]) + fn = cgutils.get_or_insert_function(mod, fnty, "NRT_Allocate") + fn.return_value.add_attribute("noalias") + return builder.call(fn, [size]) + + def free(self, builder, ptr): + """ + Low-level free a memory area allocated with allocate(). + """ + self._require_nrt() + + mod = builder.module + fnty = ir.FunctionType(ir.VoidType(), [cgutils.voidptr_t]) + fn = cgutils.get_or_insert_function(mod, fnty, "NRT_Free") + return builder.call(fn, [ptr]) + + @_check_null_result + def meminfo_alloc(self, builder, size): + """ + Allocate a new MemInfo with a data payload of `size` bytes. + + A pointer to the MemInfo is returned. + + The result of the call is checked and if it is NULL, i.e. allocation + failed, then a MemoryError is raised. + """ + return self.meminfo_alloc_unchecked(builder, size) + + def meminfo_alloc_unchecked(self, builder, size): + """ + Allocate a new MemInfo with a data payload of `size` bytes. + + A pointer to the MemInfo is returned. + + Returns NULL to indicate error/failure to allocate. 
+ """ + self._require_nrt() + + mod = builder.module + fnty = ir.FunctionType(cgutils.voidptr_t, [cgutils.intp_t]) + fn = cgutils.get_or_insert_function(mod, fnty, "NRT_MemInfo_alloc_safe") + fn.return_value.add_attribute("noalias") + return builder.call(fn, [size]) + + @_check_null_result + def meminfo_alloc_dtor(self, builder, size, dtor): + """ + Allocate a new MemInfo with a data payload of `size` bytes and a + destructor `dtor`. + + A pointer to the MemInfo is returned. + + The result of the call is checked and if it is NULL, i.e. allocation + failed, then a MemoryError is raised. + """ + return self.meminfo_alloc_dtor_unchecked(builder, size, dtor) + + def meminfo_alloc_dtor_unchecked(self, builder, size, dtor): + """ + Allocate a new MemInfo with a data payload of `size` bytes and a + destructor `dtor`. + + A pointer to the MemInfo is returned. + + Returns NULL to indicate error/failure to allocate. + """ + self._require_nrt() + + mod = builder.module + fnty = ir.FunctionType(cgutils.voidptr_t, + [cgutils.intp_t, cgutils.voidptr_t]) + fn = cgutils.get_or_insert_function(mod, fnty, + "NRT_MemInfo_alloc_dtor_safe") + fn.return_value.add_attribute("noalias") + return builder.call(fn, [size, + builder.bitcast(dtor, cgutils.voidptr_t)]) + + @_check_null_result + def meminfo_alloc_aligned(self, builder, size, align): + """ + Allocate a new MemInfo with an aligned data payload of `size` bytes. + The data pointer is aligned to `align` bytes. `align` can be either + a Python int or a LLVM uint32 value. + + A pointer to the MemInfo is returned. + + The result of the call is checked and if it is NULL, i.e. allocation + failed, then a MemoryError is raised. + """ + return self.meminfo_alloc_aligned_unchecked(builder, size, align) + + def meminfo_alloc_aligned_unchecked(self, builder, size, align): + """ + Allocate a new MemInfo with an aligned data payload of `size` bytes. + The data pointer is aligned to `align` bytes. 
`align` can be either + a Python int or a LLVM uint32 value. + + A pointer to the MemInfo is returned. + + Returns NULL to indicate error/failure to allocate. + """ + self._require_nrt() + + mod = builder.module + u32 = ir.IntType(32) + fnty = ir.FunctionType(cgutils.voidptr_t, [cgutils.intp_t, u32]) + fn = cgutils.get_or_insert_function(mod, fnty, + "NRT_MemInfo_alloc_safe_aligned") + fn.return_value.add_attribute("noalias") + if isinstance(align, int): + align = self._context.get_constant(types.uint32, align) + else: + assert align.type == u32, "align must be a uint32" + return builder.call(fn, [size, align]) + + @_check_null_result + def meminfo_new_varsize(self, builder, size): + """ + Allocate a MemInfo pointing to a variable-sized data area. The area + is separately allocated (i.e. two allocations are made) so that + re-allocating it doesn't change the MemInfo's address. + + A pointer to the MemInfo is returned. + + The result of the call is checked and if it is NULL, i.e. allocation + failed, then a MemoryError is raised. + """ + return self.meminfo_new_varsize_unchecked(builder, size) + + def meminfo_new_varsize_unchecked(self, builder, size): + """ + Allocate a MemInfo pointing to a variable-sized data area. The area + is separately allocated (i.e. two allocations are made) so that + re-allocating it doesn't change the MemInfo's address. + + A pointer to the MemInfo is returned. + + Returns NULL to indicate error/failure to allocate. + """ + self._require_nrt() + + mod = builder.module + fnty = ir.FunctionType(cgutils.voidptr_t, [cgutils.intp_t]) + fn = cgutils.get_or_insert_function(mod, fnty, + "NRT_MemInfo_new_varsize") + fn.return_value.add_attribute("noalias") + return builder.call(fn, [size]) + + @_check_null_result + def meminfo_new_varsize_dtor(self, builder, size, dtor): + """ + Like meminfo_new_varsize() but also set the destructor for + cleaning up references to objects inside the allocation. + + A pointer to the MemInfo is returned. 
+ + The result of the call is checked and if it is NULL, i.e. allocation + failed, then a MemoryError is raised. + """ + return self.meminfo_new_varsize_dtor_unchecked(builder, size, dtor) + + def meminfo_new_varsize_dtor_unchecked(self, builder, size, dtor): + """ + Like meminfo_new_varsize() but also set the destructor for + cleaning up references to objects inside the allocation. + + A pointer to the MemInfo is returned. + + Returns NULL to indicate error/failure to allocate. + """ + self._require_nrt() + + mod = builder.module + fnty = ir.FunctionType(cgutils.voidptr_t, + [cgutils.intp_t, cgutils.voidptr_t]) + fn = cgutils.get_or_insert_function( + mod, fnty, "NRT_MemInfo_new_varsize_dtor") + return builder.call(fn, [size, dtor]) + + @_check_null_result + def meminfo_varsize_alloc(self, builder, meminfo, size): + """ + Allocate a new data area for a MemInfo created by meminfo_new_varsize(). + The new data pointer is returned, for convenience. + + Contrary to realloc(), this always allocates a new area and doesn't + copy the old data. This is useful if resizing a container needs + more than simply copying the data area (e.g. for hash tables). + + The old pointer will have to be freed with meminfo_varsize_free(). + + The result of the call is checked and if it is NULL, i.e. allocation + failed, then a MemoryError is raised. + """ + return self.meminfo_varsize_alloc_unchecked(builder, meminfo, size) + + def meminfo_varsize_alloc_unchecked(self, builder, meminfo, size): + """ + Allocate a new data area for a MemInfo created by meminfo_new_varsize(). + The new data pointer is returned, for convenience. + + Contrary to realloc(), this always allocates a new area and doesn't + copy the old data. This is useful if resizing a container needs + more than simply copying the data area (e.g. for hash tables). + + The old pointer will have to be freed with meminfo_varsize_free(). + + Returns NULL to indicate error/failure to allocate. 
+ """ + return self._call_varsize_alloc(builder, meminfo, size, + "NRT_MemInfo_varsize_alloc") + + @_check_null_result + def meminfo_varsize_realloc(self, builder, meminfo, size): + """ + Reallocate a data area allocated by meminfo_new_varsize(). + The new data pointer is returned, for convenience. + + The result of the call is checked and if it is NULL, i.e. allocation + failed, then a MemoryError is raised. + """ + return self.meminfo_varsize_realloc_unchecked(builder, meminfo, size) + + def meminfo_varsize_realloc_unchecked(self, builder, meminfo, size): + """ + Reallocate a data area allocated by meminfo_new_varsize(). + The new data pointer is returned, for convenience. + + Returns NULL to indicate error/failure to allocate. + """ + return self._call_varsize_alloc(builder, meminfo, size, + "NRT_MemInfo_varsize_realloc") + + def meminfo_varsize_free(self, builder, meminfo, ptr): + """ + Free a memory area allocated for a NRT varsize object. + Note this does *not* free the NRT object itself! + """ + self._require_nrt() + + mod = builder.module + fnty = ir.FunctionType(ir.VoidType(), + [cgutils.voidptr_t, cgutils.voidptr_t]) + fn = cgutils.get_or_insert_function(mod, fnty, + "NRT_MemInfo_varsize_free") + return builder.call(fn, (meminfo, ptr)) + + def _call_varsize_alloc(self, builder, meminfo, size, funcname): + self._require_nrt() + + mod = builder.module + fnty = ir.FunctionType(cgutils.voidptr_t, + [cgutils.voidptr_t, cgutils.intp_t]) + fn = cgutils.get_or_insert_function(mod, fnty, funcname) + fn.return_value.add_attribute("noalias") + return builder.call(fn, [meminfo, size]) + + def meminfo_data(self, builder, meminfo): + """ + Given a MemInfo pointer, return a pointer to the allocated data + managed by it. This works for MemInfos allocated with all the + above methods. 
+ """ + self._require_nrt() + + from numba.core.runtime.nrtdynmod import meminfo_data_ty + + mod = builder.module + fn = cgutils.get_or_insert_function(mod, meminfo_data_ty, + "NRT_MemInfo_data_fast") + return builder.call(fn, [meminfo]) + + def get_meminfos(self, builder, ty, val): + """Return a list of *(type, meminfo)* inside the given value. + """ + datamodel = self._context.data_model_manager[ty] + members = datamodel.traverse(builder) + + meminfos = [] + if datamodel.has_nrt_meminfo(): + mi = datamodel.get_nrt_meminfo(builder, val) + meminfos.append((ty, mi)) + + for mtyp, getter in members: + field = getter(val) + inner_meminfos = self.get_meminfos(builder, mtyp, field) + meminfos.extend(inner_meminfos) + return meminfos + + def _call_incref_decref(self, builder, typ, value, funcname): + """Call function of *funcname* on every meminfo found in *value*. + """ + self._require_nrt() + + from numba.core.runtime.nrtdynmod import incref_decref_ty + + meminfos = self.get_meminfos(builder, typ, value) + for _, mi in meminfos: + mod = builder.module + fn = cgutils.get_or_insert_function(mod, incref_decref_ty, + funcname) + # XXX "nonnull" causes a crash in test_dyn_array: can this + # function be called with a NULL pointer? + fn.args[0].add_attribute("noalias") + fn.args[0].add_attribute("nocapture") + builder.call(fn, [mi]) + + def incref(self, builder, typ, value): + """ + Recursively incref the given *value* and its members. + """ + self._call_incref_decref(builder, typ, value, "NRT_incref") + + def decref(self, builder, typ, value): + """ + Recursively decref the given *value* and its members. + """ + self._call_incref_decref(builder, typ, value, "NRT_decref") + + def get_nrt_api(self, builder): + """Calls NRT_get_api(), which returns the NRT API function table. 
/*
 * Smaller of two values.  The whole expansion is parenthesised so the
 * conditional cannot be split apart by operators around the macro call.
 * Note that each argument may be evaluated twice.
 */
#if !defined MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#endif
+ */ + +static void nrt_fatal_error(const char *msg) +{ + fprintf(stderr, "Fatal Numba error: %s\n", msg); + fflush(stderr); /* it helps in Windows debug build */ + +#if defined(MS_WINDOWS) && defined(_DEBUG) + DebugBreak(); +#endif + abort(); +} + +/* + * Global resources. + */ + +struct MemSys { + /* Atomic increment and decrement function */ + NRT_atomic_inc_dec_func atomic_inc, atomic_dec; + /* Atomic CAS */ + atomic_meminfo_cas_func atomic_cas; + /* Shutdown flag */ + int shutting; + /* Stats */ + size_t stats_alloc, stats_free, stats_mi_alloc, stats_mi_free; + /* System allocation functions */ + struct { + NRT_malloc_func malloc; + NRT_realloc_func realloc; + NRT_free_func free; + } allocator; +}; + +/* The Memory System object */ +static NRT_MemSys TheMSys; + + +void NRT_MemSys_init(void) { + memset(&TheMSys, 0, sizeof(NRT_MemSys)); + /* Bind to libc allocator */ + TheMSys.allocator.malloc = malloc; + TheMSys.allocator.realloc = realloc; + TheMSys.allocator.free = free; +} + +void NRT_MemSys_shutdown(void) { + TheMSys.shutting = 1; + /* Revert to use our non-atomic stub for all atomic operations + because the JIT-ed version will be removed. + Since we are at interpreter shutdown, + it cannot be running multiple threads anymore. 
/*
 * Non-atomic compare-and-swap stub.
 *
 * Stores the current value of *ptr into *oldptr.  If that value equals
 * `cmp`, writes `val` into *ptr and returns 1; otherwise leaves *ptr
 * untouched and returns 0.
 */
static
int nrt_testing_atomic_cas(void* volatile *ptr, void *cmp, void *val,
                           void * *oldptr){
    /* non atomic */
    void *observed = *ptr;
    int matched = (observed == cmp);

    *oldptr = observed;
    if (matched)
        *ptr = val;
    return matched;
}
structure. + */ + +void NRT_MemInfo_init(NRT_MemInfo *mi,void *data, size_t size, + NRT_dtor_function dtor, void *dtor_info, + NRT_ExternalAllocator *external_allocator) +{ + mi->refct = 1; /* starts with 1 refct */ + mi->dtor = dtor; + mi->dtor_info = dtor_info; + mi->data = data; + mi->size = size; + mi->external_allocator = external_allocator; + NRT_Debug(nrt_debug_print("NRT_MemInfo_init mi=%p external_allocator=%p\n", mi, external_allocator)); + /* Update stats */ + TheMSys.atomic_inc(&TheMSys.stats_mi_alloc); +} + +NRT_MemInfo *NRT_MemInfo_new(void *data, size_t size, + NRT_dtor_function dtor, void *dtor_info) +{ + NRT_MemInfo *mi = NRT_Allocate(sizeof(NRT_MemInfo)); + if (mi != NULL) { + NRT_Debug(nrt_debug_print("NRT_MemInfo_new mi=%p\n", mi)); + NRT_MemInfo_init(mi, data, size, dtor, dtor_info, NULL); + } + return mi; +} + +size_t NRT_MemInfo_refcount(NRT_MemInfo *mi) { + /* Should never returns 0 for a valid MemInfo */ + if (mi && mi->data) + return mi->refct; + else{ + return (size_t)-1; + } +} + +static +void nrt_internal_dtor_safe(void *ptr, size_t size, void *info) { + NRT_Debug(nrt_debug_print("nrt_internal_dtor_safe %p, %p\n", ptr, info)); + /* See NRT_MemInfo_alloc_safe() */ + memset(ptr, 0xDE, MIN(size, 256)); +} + +static +void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out, NRT_ExternalAllocator *allocator) { + NRT_MemInfo *mi = NULL; + NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data %p\n", allocator)); + char *base = NRT_Allocate_External(sizeof(NRT_MemInfo) + size, allocator); + if (base == NULL) { + *mi_out = NULL; /* set meminfo to NULL as allocation failed */ + return NULL; /* return early as allocation failed */ + } + mi = (NRT_MemInfo *) base; + *mi_out = mi; + return base + sizeof(NRT_MemInfo); +} + + +static +void nrt_internal_custom_dtor_safe(void *ptr, size_t size, void *info) { + NRT_dtor_function dtor = info; + NRT_Debug(nrt_debug_print("nrt_internal_custom_dtor_safe %p, %p\n", + ptr, info)); + if (dtor) { 
+ dtor(ptr, size, NULL); + } + + nrt_internal_dtor_safe(ptr, size, NULL); +} + + +NRT_MemInfo *NRT_MemInfo_alloc(size_t size) { + NRT_MemInfo *mi = NULL; + void *data = nrt_allocate_meminfo_and_data(size, &mi, NULL); + if (data == NULL) { + return NULL; /* return early as allocation failed */ + } + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc %p\n", data)); + NRT_MemInfo_init(mi, data, size, NULL, NULL, NULL); + return mi; +} + +NRT_MemInfo *NRT_MemInfo_alloc_external(size_t size, NRT_ExternalAllocator *allocator) { + NRT_MemInfo *mi = NULL; + void *data = nrt_allocate_meminfo_and_data(size, &mi, allocator); + if (data == NULL) { + return NULL; /* return early as allocation failed */ + } + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc %p\n", data)); + NRT_MemInfo_init(mi, data, size, NULL, NULL, allocator); + return mi; +} + +NRT_MemInfo *NRT_MemInfo_alloc_safe(size_t size) { + return NRT_MemInfo_alloc_dtor_safe(size, NULL); +} + +NRT_MemInfo* NRT_MemInfo_alloc_dtor_safe(size_t size, NRT_dtor_function dtor) { + NRT_MemInfo *mi = NULL; + void *data = nrt_allocate_meminfo_and_data(size, &mi, NULL); + if (data == NULL) { + return NULL; /* return early as allocation failed */ + } + /* Only fill up a couple cachelines with debug markers, to minimize + overhead. 
*/ + memset(data, 0xCB, MIN(size, 256)); + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_dtor_safe %p %zu\n", data, size)); + NRT_MemInfo_init(mi, data, size, nrt_internal_custom_dtor_safe, dtor, NULL); + return mi; +} + + +static +void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, + NRT_MemInfo **mi, NRT_ExternalAllocator *allocator) +{ + size_t offset = 0, intptr = 0, remainder = 0; + NRT_Debug(nrt_debug_print("nrt_allocate_meminfo_and_data_align %p\n", allocator)); + char *base = nrt_allocate_meminfo_and_data(size + 2 * align, mi, allocator); + if (base == NULL) { + return NULL; /* return early as allocation failed */ + } + intptr = (size_t) base; + /* See if we are aligned */ + remainder = intptr % align; + if (remainder == 0){ /* Yes */ + offset = 0; + } else { /* No, move forward `offset` bytes */ + offset = align - remainder; + } + return base + offset; +} + +NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align) { + NRT_MemInfo *mi = NULL; + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, NULL); + if (data == NULL) { + return NULL; /* return early as allocation failed */ + } + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_aligned %p\n", data)); + NRT_MemInfo_init(mi, data, size, NULL, NULL, NULL); + return mi; +} + +NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned(size_t size, unsigned align) { + NRT_MemInfo *mi = NULL; + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, NULL); + if (data == NULL) { + return NULL; /* return early as allocation failed */ + } + /* Only fill up a couple cachelines with debug markers, to minimize + overhead. 
*/ + memset(data, 0xCB, MIN(size, 256)); + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned %p %zu\n", + data, size)); + NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size, NULL); + return mi; +} + +NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned_external(size_t size, unsigned align, NRT_ExternalAllocator *allocator) { + NRT_MemInfo *mi = NULL; + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned_external %p\n", allocator)); + void *data = nrt_allocate_meminfo_and_data_align(size, align, &mi, allocator); + if (data == NULL) { + return NULL; /* return early as allocation failed */ + } + /* Only fill up a couple cachelines with debug markers, to minimize + overhead. */ + memset(data, 0xCB, MIN(size, 256)); + NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_safe_aligned %p %zu\n", + data, size)); + NRT_MemInfo_init(mi, data, size, nrt_internal_dtor_safe, (void*)size, allocator); + return mi; +} + +void NRT_dealloc(NRT_MemInfo *mi) { + NRT_Debug(nrt_debug_print("NRT_dealloc meminfo: %p external_allocator: %p\n", mi, mi->external_allocator)); + if (mi->external_allocator) { + mi->external_allocator->free(mi, mi->external_allocator->opaque_data); + TheMSys.atomic_inc(&TheMSys.stats_free); + } else { + NRT_Free(mi); + } +} + +void NRT_MemInfo_destroy(NRT_MemInfo *mi) { + NRT_dealloc(mi); + TheMSys.atomic_inc(&TheMSys.stats_mi_free); +} + +void NRT_MemInfo_acquire(NRT_MemInfo *mi) { + NRT_Debug(nrt_debug_print("NRT_MemInfo_acquire %p refct=%zu\n", mi, + mi->refct)); + assert(mi->refct > 0 && "RefCt cannot be zero"); + TheMSys.atomic_inc(&mi->refct); +} + +void NRT_MemInfo_call_dtor(NRT_MemInfo *mi) { + NRT_Debug(nrt_debug_print("NRT_MemInfo_call_dtor %p\n", mi)); + if (mi->dtor && !TheMSys.shutting) + /* We have a destructor and the system is not shutting down */ + mi->dtor(mi->data, mi->size, mi->dtor_info); + /* Clear and release MemInfo */ + NRT_MemInfo_destroy(mi); +} + +void NRT_MemInfo_release(NRT_MemInfo *mi) { + 
NRT_Debug(nrt_debug_print("NRT_MemInfo_release %p refct=%zu\n", mi, + mi->refct)); + assert (mi->refct > 0 && "RefCt cannot be 0"); + /* RefCt drop to zero */ + if (TheMSys.atomic_dec(&mi->refct) == 0) { + NRT_MemInfo_call_dtor(mi); + } +} + +void* NRT_MemInfo_data(NRT_MemInfo* mi) { + return mi->data; +} + +size_t NRT_MemInfo_size(NRT_MemInfo* mi) { + return mi->size; +} + +void * NRT_MemInfo_external_allocator(NRT_MemInfo *mi) { + NRT_Debug(nrt_debug_print("NRT_MemInfo_external_allocator meminfo: %p external_allocator: %p\n", mi, mi->external_allocator)); + return mi->external_allocator; +} + +void *NRT_MemInfo_parent(NRT_MemInfo *mi) { + return mi->dtor_info; +} + +void NRT_MemInfo_dump(NRT_MemInfo *mi, FILE *out) { + fprintf(out, "MemInfo %p refcount %zu\n", mi, mi->refct); +} + +/* + * Resizable buffer API. + */ + +static void +nrt_varsize_dtor(void *ptr, size_t size, void *info) { + NRT_Debug(nrt_debug_print("nrt_varsize_dtor %p\n", ptr)); + if (info) { + /* call element dtor */ + typedef void dtor_fn_t(void *ptr); + dtor_fn_t *dtor = info; + dtor(ptr); + } + NRT_Free(ptr); +} + +NRT_MemInfo *NRT_MemInfo_new_varsize(size_t size) +{ + NRT_MemInfo *mi = NULL; + void *data = NRT_Allocate(size); + if (data == NULL) { + return NULL; /* return early as allocation failed */ + } + + mi = NRT_MemInfo_new(data, size, nrt_varsize_dtor, NULL); + NRT_Debug(nrt_debug_print("NRT_MemInfo_new_varsize size=%zu " + "-> meminfo=%p, data=%p\n", size, mi, data)); + return mi; +} + +NRT_MemInfo *NRT_MemInfo_new_varsize_dtor(size_t size, NRT_dtor_function dtor) { + NRT_MemInfo *mi = NRT_MemInfo_new_varsize(size); + if (mi) { + mi->dtor_info = dtor; + } + return mi; +} + +void *NRT_MemInfo_varsize_alloc(NRT_MemInfo *mi, size_t size) +{ + if (mi->dtor != nrt_varsize_dtor) { + nrt_fatal_error("ERROR: NRT_MemInfo_varsize_alloc called " + "with a non varsize-allocated meminfo"); + return NULL; /* unreachable */ + } + mi->data = NRT_Allocate(size); + if (mi->data == NULL) + return NULL; + 
mi->size = size; + NRT_Debug(nrt_debug_print("NRT_MemInfo_varsize_alloc %p size=%zu " + "-> data=%p\n", mi, size, mi->data)); + return mi->data; +} + +void *NRT_MemInfo_varsize_realloc(NRT_MemInfo *mi, size_t size) +{ + if (mi->dtor != nrt_varsize_dtor) { + nrt_fatal_error("ERROR: NRT_MemInfo_varsize_realloc called " + "with a non varsize-allocated meminfo"); + return NULL; /* unreachable */ + } + mi->data = NRT_Reallocate(mi->data, size); + if (mi->data == NULL) + return NULL; + mi->size = size; + NRT_Debug(nrt_debug_print("NRT_MemInfo_varsize_realloc %p size=%zu " + "-> data=%p\n", mi, size, mi->data)); + return mi->data; +} + +void NRT_MemInfo_varsize_free(NRT_MemInfo *mi, void *ptr) +{ + NRT_Free(ptr); + if (ptr == mi->data) + mi->data = NULL; +} + +/* + * Low-level allocation wrappers. + */ + +void* NRT_Allocate(size_t size) { + return NRT_Allocate_External(size, NULL); +} + +void* NRT_Allocate_External(size_t size, NRT_ExternalAllocator *allocator) { + void *ptr = NULL; + if (allocator) { + ptr = allocator->malloc(size, allocator->opaque_data); + NRT_Debug(nrt_debug_print("NRT_Allocate_External custom bytes=%zu ptr=%p\n", size, ptr)); + } else { + ptr = TheMSys.allocator.malloc(size); + NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr)); + } + TheMSys.atomic_inc(&TheMSys.stats_alloc); + return ptr; +} + +void *NRT_Reallocate(void *ptr, size_t size) { + void *new_ptr = TheMSys.allocator.realloc(ptr, size); + NRT_Debug(nrt_debug_print("NRT_Reallocate bytes=%zu ptr=%p -> %p\n", + size, ptr, new_ptr)); + return new_ptr; +} + +void NRT_Free(void *ptr) { + NRT_Debug(nrt_debug_print("NRT_Free %p\n", ptr)); + TheMSys.allocator.free(ptr); + TheMSys.atomic_inc(&TheMSys.stats_free); +} + +/* + * Sample external allocator implementation for internal testing. 
+ */ + +static int sample_external_opaque_data = 0xabacad; + +static +void* sample_external_malloc(size_t size, void* opaque_data) { + if (opaque_data != &sample_external_opaque_data) return NULL; + return TheMSys.allocator.malloc(size); +} + +static +void* sample_external_realloc(void *ptr, size_t new_size, void *opaque_data) { + if (opaque_data != &sample_external_opaque_data) return NULL; + return TheMSys.allocator.realloc(ptr, new_size); +} + +static +void sample_external_free(void *ptr, void* opaque_data) { + TheMSys.allocator.free(ptr); +} + +static NRT_ExternalAllocator sample_external_allocator = { + // malloc + sample_external_malloc, + // realloc + sample_external_realloc, + // free + sample_external_free, + // opaque_data + &sample_external_opaque_data +}; + +NRT_ExternalAllocator* _nrt_get_sample_external_allocator() { + return &sample_external_allocator; +} + +/* + * Debugging printf function used internally + */ +void nrt_debug_print(char *fmt, ...) { + va_list args; + + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); +} + + +static +void nrt_manage_memory_dtor(void *data, size_t size, void *info) { + NRT_managed_dtor* dtor = (NRT_managed_dtor*)info; + dtor(data); +} + +static +NRT_MemInfo* nrt_manage_memory(void *data, NRT_managed_dtor dtor) { + return NRT_MemInfo_new(data, 0, nrt_manage_memory_dtor, dtor); +} + + +static const +NRT_api_functions nrt_functions_table = { + NRT_MemInfo_alloc, + NRT_MemInfo_alloc_external, + nrt_manage_memory, + NRT_MemInfo_acquire, + NRT_MemInfo_release, + NRT_MemInfo_data +}; + + +const NRT_api_functions* NRT_get_api(void) { + return &nrt_functions_table; +} diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrt.h b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrt.h new file mode 100644 index 0000000000000000000000000000000000000000..2bfc9033ecdf7f60821a4e06c64947fddcbf7849 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrt.h 
@@ -0,0 +1,272 @@ +/* +All functions described here are threadsafe. +*/ + +#ifndef NUMBA_NRT_H_ +#define NUMBA_NRT_H_ + + +#include +#include +#include "../../_numba_common.h" + +#include "nrt_external.h" + +/* Debugging facilities - enabled at compile-time */ +/* #undef NDEBUG */ +#if 0 +# define NRT_Debug(X) {X; fflush(stdout); } +#else +# define NRT_Debug(X) if (0) { X; } +#endif + +/* TypeDefs */ +typedef void (*NRT_dtor_function)(void *ptr, size_t size, void *info); +typedef void (*NRT_dealloc_func)(void *ptr, void *dealloc_info); +typedef size_t (*NRT_atomic_inc_dec_func)(size_t *ptr); +typedef int (*NRT_atomic_cas_func)(void * volatile *ptr, void *cmp, void *repl, + void **oldptr); + +typedef struct MemSys NRT_MemSys; + +typedef void *(*NRT_malloc_func)(size_t size); +typedef void *(*NRT_realloc_func)(void *ptr, size_t new_size); +typedef void (*NRT_free_func)(void *ptr); + +/* Memory System API */ + +/* Initialize the memory system */ +VISIBILITY_HIDDEN +void NRT_MemSys_init(void); + +/* Shutdown the memory system */ +VISIBILITY_HIDDEN +void NRT_MemSys_shutdown(void); + +/* + * Register the system allocation functions + */ +VISIBILITY_HIDDEN +void NRT_MemSys_set_allocator(NRT_malloc_func, NRT_realloc_func, NRT_free_func); + +/* + * Register the atomic increment and decrement functions + */ +VISIBILITY_HIDDEN +void NRT_MemSys_set_atomic_inc_dec(NRT_atomic_inc_dec_func inc, + NRT_atomic_inc_dec_func dec); + + +/* + * Register the atomic compare and swap function + */ +VISIBILITY_HIDDEN +void NRT_MemSys_set_atomic_cas(NRT_atomic_cas_func cas); + +/* + * Register a non-atomic STUB for increment and decrement + */ +VISIBILITY_HIDDEN +void NRT_MemSys_set_atomic_inc_dec_stub(void); + +/* + * Register a non-atomic STUB for compare and swap + */ +VISIBILITY_HIDDEN +void NRT_MemSys_set_atomic_cas_stub(void); + +/* + * The following functions get internal statistics of the memory subsystem. 
+ */ +VISIBILITY_HIDDEN +size_t NRT_MemSys_get_stats_alloc(void); +VISIBILITY_HIDDEN +size_t NRT_MemSys_get_stats_free(void); +VISIBILITY_HIDDEN +size_t NRT_MemSys_get_stats_mi_alloc(void); +VISIBILITY_HIDDEN +size_t NRT_MemSys_get_stats_mi_free(void); + +/* Memory Info API */ + +/* Create a new MemInfo for external memory + * + * data: data pointer being tracked + * dtor: destructor to execute + * dtor_info: additional information to pass to the destructor + */ +VISIBILITY_HIDDEN +NRT_MemInfo* NRT_MemInfo_new(void *data, size_t size, + NRT_dtor_function dtor, void *dtor_info); + +/* + * The `external_allocator` is for experimental API to customize the allocator. + * Set to NULL to use the default builtin allocator. + */ +VISIBILITY_HIDDEN +void NRT_MemInfo_init(NRT_MemInfo *mi, void *data, size_t size, + NRT_dtor_function dtor, void *dtor_info, + NRT_ExternalAllocator *external_allocator); + +/* + * Returns the refcount of a MemInfo or (size_t)-1 if error. + */ +VISIBILITY_HIDDEN +size_t NRT_MemInfo_refcount(NRT_MemInfo *mi); + +/* + * Allocate memory of `size` bytes and return a pointer to a MemInfo structure + * that describes the allocation + */ +VISIBILITY_HIDDEN +NRT_MemInfo *NRT_MemInfo_alloc(size_t size); + +NRT_MemInfo *NRT_MemInfo_alloc_external(size_t size, NRT_ExternalAllocator *allocator); + +/* + * The "safe" NRT_MemInfo_alloc performs additional steps to help debug + * memory errors. + * It is guaranteed to: + * - zero-fill to the memory region after allocation and before deallocation. + * - may do more in the future + */ +VISIBILITY_HIDDEN +NRT_MemInfo *NRT_MemInfo_alloc_safe(size_t size); + +/* + * Similar to NRT_MemInfo_alloc_safe but with a custom dtor. + */ +VISIBILITY_HIDDEN +NRT_MemInfo* NRT_MemInfo_alloc_dtor_safe(size_t size, NRT_dtor_function dtor); + +/* + * Aligned versions of the NRT_MemInfo_alloc and NRT_MemInfo_alloc_safe. + * These take an additional argument `align` for number of bytes to align to. 
+ */ +VISIBILITY_HIDDEN +NRT_MemInfo *NRT_MemInfo_alloc_aligned(size_t size, unsigned align); +VISIBILITY_HIDDEN +NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned(size_t size, unsigned align); + +/* + * Experimental. + * A variation to use an external allocator. + */ +NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned_external(size_t size, unsigned align, NRT_ExternalAllocator *allocator); + +/* + * Internal API. + * Release a MemInfo. Calls NRT_MemSys_insert_meminfo. + */ +VISIBILITY_HIDDEN +void NRT_MemInfo_destroy(NRT_MemInfo *mi); + +/* + * Acquire a reference to a MemInfo + */ +VISIBILITY_HIDDEN +void NRT_MemInfo_acquire(NRT_MemInfo* mi); + +/* + * Release a reference to a MemInfo + */ +VISIBILITY_HIDDEN +void NRT_MemInfo_release(NRT_MemInfo* mi); + +/* + * Internal/Compiler API. + * Invoke the registered destructor of a MemInfo. + */ +VISIBILITY_HIDDEN +void NRT_MemInfo_call_dtor(NRT_MemInfo *mi); + +/* + * Returns the data pointer + */ +VISIBILITY_HIDDEN +void* NRT_MemInfo_data(NRT_MemInfo* mi); + +/* + * Returns the allocated size + */ +VISIBILITY_HIDDEN +size_t NRT_MemInfo_size(NRT_MemInfo* mi); + + +/* + * Experimental. + * Returns the external allocator + */ +VISIBILITY_HIDDEN +void* NRT_MemInfo_external_allocator(NRT_MemInfo* mi); + +/* + * Returns the parent MemInfo + */ +VISIBILITY_HIDDEN +void* NRT_MemInfo_parent(NRT_MemInfo* mi); + + +/* + * NRT API for resizable buffers. + */ +VISIBILITY_HIDDEN +NRT_MemInfo *NRT_MemInfo_new_varsize(size_t size); +VISIBILITY_HIDDEN +NRT_MemInfo *NRT_MemInfo_new_varsize_dtor(size_t size, NRT_dtor_function dtor); +VISIBILITY_HIDDEN +void *NRT_MemInfo_varsize_alloc(NRT_MemInfo *mi, size_t size); +VISIBILITY_HIDDEN +void *NRT_MemInfo_varsize_realloc(NRT_MemInfo *mi, size_t size); +VISIBILITY_HIDDEN +void NRT_MemInfo_varsize_free(NRT_MemInfo *mi, void *ptr); + +/* + * Print debug info to FILE + */ +VISIBILITY_HIDDEN +void NRT_MemInfo_dump(NRT_MemInfo *mi, FILE *out); + + +/* Low-level allocation wrappers. 
*/ + +/* + * Allocate memory of `size` bytes. + */ +VISIBILITY_HIDDEN void* NRT_Allocate(size_t size); + +/* + * Experimental + * + * An alternative allocator that allows using an external allocator. + */ +VISIBILITY_HIDDEN void* NRT_Allocate_External(size_t size, NRT_ExternalAllocator *allocator); + +/* + * Deallocate memory pointed by `ptr`. + */ +VISIBILITY_HIDDEN void NRT_Free(void *ptr); + +/* + * Reallocate memory at `ptr`. + */ +VISIBILITY_HIDDEN void *NRT_Reallocate(void *ptr, size_t size); + +/* + * Debugging printf function used internally + */ +VISIBILITY_HIDDEN void nrt_debug_print(char *fmt, ...); + +/* + * Get API function table. + */ +VISIBILITY_HIDDEN const NRT_api_functions* NRT_get_api(void); + + +/* + * FOR INTERNAL USE ONLY. + * Get a sample external allocator for testing + */ +VISIBILITY_HIDDEN NRT_ExternalAllocator* _nrt_get_sample_external_allocator(void); + +#endif /* NUMBA_NRT_H_ */ diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrt.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrt.py new file mode 100644 index 0000000000000000000000000000000000000000..fef29e3c8c25e60b436a8d3d58708fe5ecbf0305 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrt.py @@ -0,0 +1,136 @@ +from collections import namedtuple +from weakref import finalize as _finalize + +from numba.core.runtime import nrtdynmod +from llvmlite import binding as ll + +from numba.core.compiler_lock import global_compiler_lock +from numba.core.typing.typeof import typeof_impl +from numba.core import types +from numba.core.runtime import _nrt_python as _nrt + +_nrt_mstats = namedtuple("nrt_mstats", ["alloc", "free", "mi_alloc", "mi_free"]) + + +class _Runtime(object): + def __init__(self): + self._init = False + + @global_compiler_lock + def initialize(self, ctx): + """Initializes the NRT + + Must be called before any actual call to the NRT API. + Safe to be called multiple times. 
+ """ + if self._init: + # Already initialized + return + + # Register globals into the system + for py_name in _nrt.c_helpers: + if py_name.startswith("_"): + # internal API + c_name = py_name + else: + c_name = "NRT_" + py_name + c_address = _nrt.c_helpers[py_name] + ll.add_symbol(c_name, c_address) + + # Compile atomic operations + self._library = nrtdynmod.compile_nrt_functions(ctx) + + self._ptr_inc = self._library.get_pointer_to_function("nrt_atomic_add") + self._ptr_dec = self._library.get_pointer_to_function("nrt_atomic_sub") + self._ptr_cas = self._library.get_pointer_to_function("nrt_atomic_cas") + + # Install atomic ops to NRT + _nrt.memsys_set_atomic_inc_dec(self._ptr_inc, self._ptr_dec) + _nrt.memsys_set_atomic_cas(self._ptr_cas) + + self._init = True + + def _init_guard(self): + if not self._init: + msg = "Runtime must be initialized before use." + raise RuntimeError(msg) + + @staticmethod + def shutdown(): + """ + Shutdown the NRT + Safe to be called without calling Runtime.initialize first + """ + _nrt.memsys_shutdown() + + @property + def library(self): + """ + Return the Library object containing the various NRT functions. + """ + self._init_guard() + return self._library + + def meminfo_new(self, data, pyobj): + """ + Returns a MemInfo object that tracks memory at `data` owned by `pyobj`. + MemInfo will acquire a reference on `pyobj`. + The release of MemInfo will release a reference on `pyobj`. + """ + self._init_guard() + mi = _nrt.meminfo_new(data, pyobj) + return MemInfo(mi) + + def meminfo_alloc(self, size, safe=False): + """ + Allocate a new memory of `size` bytes and returns a MemInfo object + that tracks the allocation. When there is no more reference to the + MemInfo object, the underlying memory will be deallocated. + + If `safe` flag is True, the memory is allocated using the `safe` scheme. + This is used for debugging and testing purposes. + See `NRT_MemInfo_alloc_safe()` in "nrt.h" for details. 
+ """ + self._init_guard() + if size < 0: + msg = f"Cannot allocate a negative number of bytes: {size}." + raise ValueError(msg) + if safe: + mi = _nrt.meminfo_alloc_safe(size) + else: + mi = _nrt.meminfo_alloc(size) + if mi == 0: # alloc failed or size was 0 and alloc returned NULL. + msg = f"Requested allocation of {size} bytes failed." + raise MemoryError(msg) + return MemInfo(mi) + + def get_allocation_stats(self): + """ + Returns a namedtuple of (alloc, free, mi_alloc, mi_free) for count of + each memory operations. + """ + # No init guard needed to access stats members + return _nrt_mstats(alloc=_nrt.memsys_get_stats_alloc(), + free=_nrt.memsys_get_stats_free(), + mi_alloc=_nrt.memsys_get_stats_mi_alloc(), + mi_free=_nrt.memsys_get_stats_mi_free()) + + +# Alias to _nrt_python._MemInfo +MemInfo = _nrt._MemInfo + + +@typeof_impl.register(MemInfo) +def typeof_meminfo(val, c): + return types.MemInfoPointer(types.voidptr) + + +# Create runtime +_nrt.memsys_use_cpython_allocator() +rtsys = _Runtime() + +# Install finalizer +_finalize(rtsys, _Runtime.shutdown) + +# Avoid future use of the class +del _Runtime diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrt_external.h b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrt_external.h new file mode 100644 index 0000000000000000000000000000000000000000..8689550157b66bb8280db0056ece315d014b6209 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrt_external.h @@ -0,0 +1,65 @@ +#ifndef NUMBA_NRT_EXTERNAL_H_ +#define NUMBA_NRT_EXTERNAL_H_ + +#include + +typedef struct MemInfo NRT_MemInfo; + +typedef void NRT_managed_dtor(void *data); + +typedef void *(*NRT_external_malloc_func)(size_t size, void *opaque_data); +typedef void *(*NRT_external_realloc_func)(void *ptr, size_t new_size, void *opaque_data); +typedef void (*NRT_external_free_func)(void *ptr, void *opaque_data); + +struct ExternalMemAllocator { + NRT_external_malloc_func malloc; + 
NRT_external_realloc_func realloc; + NRT_external_free_func free; + void *opaque_data; +}; + +typedef struct ExternalMemAllocator NRT_ExternalAllocator; + +typedef struct { + /* Methods to create MemInfos. + + MemInfos are like smart pointers for objects that are managed by the Numba. + */ + + /* Allocate memory + + *nbytes* is the number of bytes to be allocated + + Returning a new reference. + */ + NRT_MemInfo* (*allocate)(size_t nbytes); + /* Allocates memory using an external allocator but still using Numba's MemInfo. + * + * NOTE: An externally provided allocator must behave the same way as C99 + * stdlib.h's "malloc" function with respect to return value + * (including the behaviour that occurs when requesting an allocation + * of size 0 bytes). + */ + NRT_MemInfo* (*allocate_external)(size_t nbytes, NRT_ExternalAllocator *allocator); + + /* Convert externally allocated memory into a MemInfo. + + *data* is the memory pointer + *dtor* is the deallocator of the memory + */ + NRT_MemInfo* (*manage_memory)(void *data, NRT_managed_dtor dtor); + + /* Acquire a reference */ + void (*acquire)(NRT_MemInfo* mi); + + /* Release a reference */ + void (*release)(NRT_MemInfo* mi); + + /* Get MemInfo data pointer */ + void* (*get_data)(NRT_MemInfo* mi); + +} NRT_api_functions; + + + +#endif /* NUMBA_NRT_EXTERNAL_H_ */ diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrtdynmod.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrtdynmod.py new file mode 100644 index 0000000000000000000000000000000000000000..c8cc1973d3f791ab398dec1b1d474d0fd4e13cf9 --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrtdynmod.py @@ -0,0 +1,215 @@ +""" +Dynamically generate the NRT module +""" + + +from numba.core import config +from numba.core import types, cgutils +from llvmlite import ir, binding + + +_word_type = ir.IntType(config.MACHINE_BITS) +_pointer_type = ir.PointerType(ir.IntType(8)) + +_meminfo_struct_type = 
ir.LiteralStructType([ + _word_type, # size_t refct + _pointer_type, # dtor_function dtor + _pointer_type, # void *dtor_info + _pointer_type, # void *data + _word_type, # size_t size + ]) + + +incref_decref_ty = ir.FunctionType(ir.VoidType(), [_pointer_type]) +meminfo_data_ty = ir.FunctionType(_pointer_type, [_pointer_type]) + + +def _define_nrt_meminfo_data(module): + """ + Implement NRT_MemInfo_data_fast in the module. This allows LLVM + to inline lookup of the data pointer. + """ + fn = cgutils.get_or_insert_function(module, meminfo_data_ty, + "NRT_MemInfo_data_fast") + builder = ir.IRBuilder(fn.append_basic_block()) + [ptr] = fn.args + struct_ptr = builder.bitcast(ptr, _meminfo_struct_type.as_pointer()) + data_ptr = builder.load(cgutils.gep(builder, struct_ptr, 0, 3)) + builder.ret(data_ptr) + + +def _define_nrt_incref(module, atomic_incr): + """ + Implement NRT_incref in the module + """ + fn_incref = cgutils.get_or_insert_function(module, incref_decref_ty, + "NRT_incref") + # Cannot inline this for refcount pruning to work + fn_incref.attributes.add('noinline') + builder = ir.IRBuilder(fn_incref.append_basic_block()) + [ptr] = fn_incref.args + is_null = builder.icmp_unsigned("==", ptr, cgutils.get_null_value(ptr.type)) + with cgutils.if_unlikely(builder, is_null): + builder.ret_void() + + word_ptr = builder.bitcast(ptr, atomic_incr.args[0].type) + if config.DEBUG_NRT: + cgutils.printf(builder, "*** NRT_Incref %zu [%p]\n", builder.load(word_ptr), + ptr) + builder.call(atomic_incr, [word_ptr]) + builder.ret_void() + + +def _define_nrt_decref(module, atomic_decr): + """ + Implement NRT_decref in the module + """ + fn_decref = cgutils.get_or_insert_function(module, incref_decref_ty, + "NRT_decref") + # Cannot inline this for refcount pruning to work + fn_decref.attributes.add('noinline') + calldtor = ir.Function(module, + ir.FunctionType(ir.VoidType(), [_pointer_type]), + name="NRT_MemInfo_call_dtor") + + builder = ir.IRBuilder(fn_decref.append_basic_block()) + 
[ptr] = fn_decref.args + is_null = builder.icmp_unsigned("==", ptr, cgutils.get_null_value(ptr.type)) + with cgutils.if_unlikely(builder, is_null): + builder.ret_void() + + + # For memory fence usage, see https://llvm.org/docs/Atomics.html + + # A release fence is used before the relevant write operation. + # No-op on x86. On POWER, it lowers to lwsync. + builder.fence("release") + + word_ptr = builder.bitcast(ptr, atomic_decr.args[0].type) + + if config.DEBUG_NRT: + cgutils.printf(builder, "*** NRT_Decref %zu [%p]\n", builder.load(word_ptr), + ptr) + newrefct = builder.call(atomic_decr, + [word_ptr]) + + refct_eq_0 = builder.icmp_unsigned("==", newrefct, + ir.Constant(newrefct.type, 0)) + with cgutils.if_unlikely(builder, refct_eq_0): + # An acquire fence is used after the relevant read operation. + # No-op on x86. On POWER, it lowers to lwsync. + builder.fence("acquire") + builder.call(calldtor, [ptr]) + builder.ret_void() + + +# Set this to True to measure the overhead of atomic refcounts compared +# to non-atomic. +_disable_atomicity = 0 + + +def _define_atomic_inc_dec(module, op, ordering): + """Define a llvm function for atomic increment/decrement to the given module + Argument ``op`` is the operation "add"/"sub". Argument ``ordering`` is + the memory ordering. The generated function returns the new value. + """ + ftype = ir.FunctionType(_word_type, [_word_type.as_pointer()]) + fn_atomic = ir.Function(module, ftype, name="nrt_atomic_{0}".format(op)) + + [ptr] = fn_atomic.args + bb = fn_atomic.append_basic_block() + builder = ir.IRBuilder(bb) + ONE = ir.Constant(_word_type, 1) + if not _disable_atomicity: + oldval = builder.atomic_rmw(op, ptr, ONE, ordering=ordering) + # Perform the operation on the old value so that we can pretend returning + # the "new" value. 
+ res = getattr(builder, op)(oldval, ONE) + builder.ret(res) + else: + oldval = builder.load(ptr) + newval = getattr(builder, op)(oldval, ONE) + builder.store(newval, ptr) + builder.ret(oldval) + + return fn_atomic + + +def _define_atomic_cas(module, ordering): + """Define a llvm function for atomic compare-and-swap. + The generated function is a direct wrapper of the LLVM cmpxchg with the + difference that the a int indicate success (1) or failure (0) is returned + and the last argument is a output pointer for storing the old value. + + Note + ---- + On failure, the generated function behaves like an atomic load. The loaded + value is stored to the last argument. + """ + ftype = ir.FunctionType(ir.IntType(32), [_word_type.as_pointer(), + _word_type, _word_type, + _word_type.as_pointer()]) + fn_cas = ir.Function(module, ftype, name="nrt_atomic_cas") + + [ptr, cmp, repl, oldptr] = fn_cas.args + bb = fn_cas.append_basic_block() + builder = ir.IRBuilder(bb) + outtup = builder.cmpxchg(ptr, cmp, repl, ordering=ordering) + old, ok = cgutils.unpack_tuple(builder, outtup, 2) + builder.store(old, oldptr) + builder.ret(builder.zext(ok, ftype.return_type)) + + return fn_cas + + +def _define_nrt_unresolved_abort(ctx, module): + """ + Defines an abort function due to unresolved symbol. + + The function takes no args and will always raise an exception. + It should be safe to call this function with incorrect number of arguments. + """ + fnty = ctx.call_conv.get_function_type(types.none, ()) + fn = ir.Function(module, fnty, name="nrt_unresolved_abort") + bb = fn.append_basic_block() + builder = ir.IRBuilder(bb) + msg = "numba jitted function aborted due to unresolved symbol" + ctx.call_conv.return_user_exc(builder, RuntimeError, (msg,)) + return fn + + +def create_nrt_module(ctx): + """ + Create an IR module defining the LLVM NRT functions. + A (IR module, library) tuple is returned. 
+ """ + codegen = ctx.codegen() + library = codegen.create_library("nrt") + + # Implement LLVM module with atomic ops + ir_mod = library.create_ir_module("nrt_module") + + atomic_inc = _define_atomic_inc_dec(ir_mod, "add", ordering='monotonic') + atomic_dec = _define_atomic_inc_dec(ir_mod, "sub", ordering='monotonic') + _define_atomic_cas(ir_mod, ordering='monotonic') + + _define_nrt_meminfo_data(ir_mod) + _define_nrt_incref(ir_mod, atomic_inc) + _define_nrt_decref(ir_mod, atomic_dec) + + _define_nrt_unresolved_abort(ctx, ir_mod) + + return ir_mod, library + + +def compile_nrt_functions(ctx): + """ + Compile all LLVM NRT functions and return a library containing them. + The library is created using the given target context. + """ + ir_mod, library = create_nrt_module(ctx) + + library.add_ir_module(ir_mod) + library.finalize() + + return library diff --git a/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrtopt.py b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrtopt.py new file mode 100644 index 0000000000000000000000000000000000000000..2a6f56b09108d06ed4522ff259a9ee49c279fcfd --- /dev/null +++ b/cv/3d_detection/centerpoint/pytorch/numba/numba/core/runtime/nrtopt.py @@ -0,0 +1,182 @@ +""" +NRT specific optimizations +""" +import re +from collections import defaultdict, deque +from llvmlite import binding as ll +from numba.core import cgutils + +_regex_incref = re.compile(r'\s*(?:tail)?\s*call void @NRT_incref\((.*)\)') +_regex_decref = re.compile(r'\s*(?:tail)?\s*call void @NRT_decref\((.*)\)') +_regex_bb = re.compile( + r'|'.join([ + # unnamed BB is just a plain number + r'[0-9]+:', + # with a proper identifier (see llvm langref) + r'[\'"]?[-a-zA-Z$._0-9][-a-zA-Z$._0-9]*[\'"]?:', + # is a start of a function definition + r'^define', + # no name + r'^;\s*