Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
pascal1129 committed Nov 8, 2018
0 parents commit dfb0eca
Show file tree
Hide file tree
Showing 8 changed files with 945 additions and 0 deletions.
62 changes: 62 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# for airbus_rle_to_coco

# Data files and directories common in repo root
datasets/
dataset/
logs/
*.h5
results/
temp/
tmp/
test/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Visual Studio Code
.vscode

# PyCharm
.idea/

# Dropbox
.dropbox.attr

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# dotenv
.env

# virtualenv
.venv
venv/
ENV/
33 changes: 33 additions & 0 deletions 0_airbus_delete_empty_im.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
import pandas as pd
import numpy as np
from PIL import Image

# Paths to the training images and the RLE annotation csv (relative to this script).
dataset_train = '../datasets/ships_train2018'
csv_train = '../datasets/train_ship_segmentations_v2.csv'

if __name__ == '__main__':
    # Read the RLE annotation csv file.
    df = pd.read_csv(csv_train)
    print("Dataframe lines : ", df.shape[0])

    # Rows with no EncodedPixels (NaN) are images without any ship; drop them.
    df = df.dropna(axis=0)
    num_of_ships = df.shape[0]
    print("Instances : ", num_of_ships)

    # Collect the unique image ids that have at least one ship annotation.
    images = set(df.iloc[:, 0])
    print("Images with ship: ", len(images))

    # Delete every image file on disk that has no ship annotation.
    # Fix: the original called os.remove(os.path.join(im_path, im)) with an
    # undefined name `im_path`, which raised NameError; use dataset_train.
    count = 0
    for im in os.listdir(dataset_train):
        if im not in images:
            os.remove(os.path.join(dataset_train, im))
            count += 1
    print('%d images are deleted.' % count)
52 changes: 52 additions & 0 deletions 0_csv_show_RLE.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
from skimage.data import imread
import matplotlib.pyplot as plt
import os
np.set_printoptions(threshold=np.inf) # print full numpy arrays without truncation

# ref: https://www.kaggle.com/paulorzp/run-length-encode-and-decode
# mask_rle(string) --> rle_decode() --> np.ndarry(np.unit8)
# shape: (height,width) , 1 - mask, 0 - background
def rle_decode(mask_rle, shape=(768, 768)):
    """Decode a run-length-encoded mask string into a binary mask array.

    `mask_rle` holds space-separated "start length" pairs, 1-indexed and
    laid out in column-major order (the Airbus Kaggle convention).
    Returns a uint8 array of the given (height, width) shape with
    1 for mask pixels and 0 for background.
    """
    tokens = mask_rle.split()
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1  # csv values are 1-indexed
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat[begin:end] = 1
    # The RLE runs down the columns, so transpose after reshaping.
    return flat.reshape(shape).T

def csv_show_rle(ImageId, dataset_dir, df):
    """Show an image, its combined ship mask, and their overlay side by side.

    Looks up every RLE mask for `ImageId` in `df`, merges them into a
    single (768, 768) array and displays three panels with matplotlib.
    """
    image = imread(os.path.join(dataset_dir, ImageId))
    encoded = df.loc[df['ImageId'] == ImageId, 'EncodedPixels'].tolist()

    # Merge the individual per-ship masks into one array covering all ships.
    combined = np.zeros((768, 768))
    for rle in encoded:
        combined += rle_decode(rle)

    fig, axes = plt.subplots(1, 3)
    for ax in axes:
        ax.axis('off')
    axes[0].imshow(image)
    axes[1].imshow(combined)
    axes[2].imshow(image)
    axes[2].imshow(combined, alpha=0.4)
    plt.tight_layout(h_pad=0.1, w_pad=0.1)
    # plt.savefig( os.path.join(ROOT_DIR, '../tmp', 'tmp.png') )
    plt.show()

if __name__ == "__main__":
    # Dataset locations (relative to this script).
    dataset_train = '../datasets/ships_train2018'
    dataset_test = '../datasets/ships_test2018'
    csv_train = '../datasets/train_ship_segmentations_v2.csv'
    csv_test = '../submit/rle_submit.csv'

    # Visualize the annotations of one training image.
    annotations = pd.read_csv(csv_train)
    ImageId = '0ba29cbcf.jpg'
    csv_show_rle(ImageId, dataset_train, annotations)
143 changes: 143 additions & 0 deletions 1_ships_to_coco.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import datetime
import json
import os
import re
import fnmatch
from PIL import Image
import numpy as np
from pycococreatortools import pycococreatortools
import pandas as pd

from skimage.data import imread
import matplotlib.pyplot as plt

# Paths to the training images and the RLE annotation csv (relative to this script).
dataset_train = '../datasets/ships_train2018'
csv_train = '../datasets/train_ship_segmentations_v2.csv'
IMAGE_DIR = dataset_train

# Annotation table, loaded once at import time; indexed by ImageId below.
df = pd.read_csv(csv_train ) # read the RLE annotation csv file

# COCO json "info" section.
INFO = {
    "description": "Kaggle Dataset",
    "url": "/~https://github.com/pascal1129",
    "version": "0.1.0",
    "year": 2018,
    "contributor": "pascal1129",
    "date_created": datetime.datetime.utcnow().isoformat(' ')
}

# COCO json "licenses" section.
LICENSES = [
    {
        "id": 1,
        "name": "Attribution-NonCommercial-ShareAlike License",
        "url": "http://creativecommons.org/licenses/by-nc-sa/2.0/"
    }
]

# COCO json "categories" section: this dataset has the single class "ship".
CATEGORIES = [
    {
        'id': 1,
        'name': 'ship',
        'supercategory': 'ship',
    },
]

# ref: https://www.kaggle.com/paulorzp/run-length-encode-and-decode
# mask_rle(string) --> rle_decode() --> np.ndarry(np.unit8)
# shape: (height,width) , 1 - mask, 0 - background
def rle_decode(mask_rle, shape=(768, 768)):
    """Decode an Airbus RLE string into a (height, width) uint8 mask.

    The RLE pairs are 1-indexed "start length" values in column-major
    order. Returns an array with 1 for ship pixels and 0 for background.
    """
    values = mask_rle.split()
    start_px = np.asarray(values[0::2], dtype=int) - 1  # csv values are 1-indexed
    run_len = np.asarray(values[1::2], dtype=int)

    mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for start, length in zip(start_px, run_len):
        mask[start:start + length] = 1
    return mask.reshape(shape).T  # align to the RLE (column-major) direction


def filter_for_jpeg(root, files):
    """Return the full paths of the JPEG files (*.jpg / *.jpeg) in `files`."""
    patterns = ['*.jpeg', '*.jpg']
    jpeg_re = r'|'.join(fnmatch.translate(p) for p in patterns)
    paths = (os.path.join(root, f) for f in files)
    return [path for path in paths if re.match(jpeg_re, path)]

def save_bad_ann(image_name, mask, segmentation_id):
    """Save a visualization (image / mask / overlay) of a rejected annotation to ./tmp."""
    image = imread(os.path.join(IMAGE_DIR, image_name))
    fig, axes = plt.subplots(1, 3)
    for ax in axes:
        ax.axis('off')
    axes[0].imshow(image)
    axes[1].imshow(mask)
    axes[2].imshow(image)
    axes[2].imshow(mask, alpha=0.4)
    plt.tight_layout(h_pad=0.1, w_pad=0.1)
    if not os.path.exists('tmp'):
        os.makedirs('tmp')
    out_name = image_name.split('.')[0] + '_' + str(segmentation_id) + '.png'
    plt.savefig(os.path.join('./tmp', out_name))
    plt.close()

def main():
    """Convert the Airbus ship RLE annotations into a COCO-format json file.

    Walks IMAGE_DIR for jpeg images, decodes every RLE mask found for each
    image in the global dataframe `df`, and writes the accumulated COCO
    dataset to annotations/instances_ships_train2018.json.
    """
    # The dict that is eventually dumped to the json file.
    coco_output = {
        "info": INFO,
        "licenses": LICENSES,
        "categories": CATEGORIES,
        "images": [],       # filled below, one entry per image
        "annotations": []   # filled below, one entry per ship mask
    }

    image_id = 1
    segmentation_id = 1

    # Outer loop is over images, because the per-image info is shared by
    # all of that image's annotations; walk IMAGE_DIR to find every image.
    for root, _, files in os.walk(IMAGE_DIR):
        image_paths = filter_for_jpeg(root, files)   # image file paths
        num_of_image_files = len(image_paths)        # number of images

        for image_path in image_paths:
            # Basic image info, shared by all masks of this image.
            image = Image.open(image_path)
            image_name = os.path.basename(image_path)  # csv indexes by file name only
            image_info = pycococreatortools.create_image_info(
                image_id, image_name, image.size)
            coco_output["images"].append(image_info)

            # Inner loop: every RLE mask belonging to this image.
            rle_masks = df.loc[df['ImageId'] == image_name, 'EncodedPixels'].tolist()
            num_of_rle_masks = len(rle_masks)

            for index in range(num_of_rle_masks):
                binary_mask = rle_decode(rle_masks[index])
                class_id = 1  # every object in this dataset is a ship
                category_info = {'id': class_id, 'is_crowd': 0}
                annotation_info = pycococreatortools.create_annotation_info(
                    segmentation_id, image_id, category_info, binary_mask,
                    image.size, tolerance=2)

                # Not every mask converts: low-quality annotations come back
                # as None. Keep the good ones; save the bad ones for review.
                if annotation_info is not None:
                    coco_output["annotations"].append(annotation_info)
                else:
                    save_bad_ann(image_name, binary_mask, segmentation_id)

                # An id is consumed whether or not the annotation was kept.
                segmentation_id = segmentation_id + 1

            print("%d of %d is done."%(image_id,num_of_image_files))
            image_id = image_id + 1

    # Fix: the original formatted the output path with an undefined name
    # ROOT_DIR, so the script crashed with NameError after all the work was
    # done. Write into a local annotations/ directory, creating it if needed.
    annotations_dir = 'annotations'
    os.makedirs(annotations_dir, exist_ok=True)
    out_path = os.path.join(annotations_dir, 'instances_ships_train2018.json')
    with open(out_path, 'w') as output_json_file:
        # json.dump(coco_output, output_json_file)
        json.dump(coco_output, output_json_file, indent=4)

if __name__ == "__main__":
    main()
Loading

0 comments on commit dfb0eca

Please sign in to comment.