Hello 👋
This repository contains implementations of several models that I've built from scratch, mostly using PyTorch. Feel free to contribute or add any other popular architecture. Make sure to install einops (Einstein operations), as it is used in a few places to handle higher-dimensional tensors: `pip install einops`.
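As a minimal sketch of the kind of reshaping einops helps with (the shapes and patch size below are illustrative, not taken from any specific model in this repo), here is a ViT-style patchify step written with `rearrange`:

```python
import torch
from einops import rearrange

# Hypothetical batch of 2 RGB images at 224x224 (illustrative, not from this repo).
imgs = torch.randn(2, 3, 224, 224)

# Split each image into non-overlapping 16x16 patches and flatten each patch
# into a single vector: (batch, channels, H, W) -> (batch, num_patches, patch_dim).
patches = rearrange(imgs, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=16, p2=16)

print(patches.shape)  # torch.Size([2, 196, 768]) -- 14*14 patches, each 16*16*3 values
```

The same pattern replaces chains of `.view()` and `.permute()` calls and keeps the intended tensor layout readable in the code itself.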
If you want to learn more about implementing ViTs from scratch, you can follow me on Medium, where I have an ongoing series that comprehensively explains various architectures from scratch.

Citations for the papers behind the implemented models:
@article{ronneberger2015unet,
title={U-Net: Convolutional Networks for Biomedical Image Segmentation},
author={Ronneberger, Olaf and Fischer, Philipp and Brox, Thomas},
journal={arXiv preprint arXiv:1505.04597},
year={2015}
}
@article{he2016deep,
title={Deep Residual Learning for Image Recognition},
author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
journal={arXiv preprint arXiv:1512.03385},
year={2016}
}
@article{dosovitskiy2020image,
title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and others},
journal={arXiv preprint arXiv:2010.11929},
year={2020}
}
@article{he2021masked,
title={Masked Autoencoders Are Scalable Vision Learners},
author={He, Kaiming and Chen, Xinlei and Xie, Saining and Li, Yanghao and Doll{\'a}r, Piotr and Girshick, Ross},
journal={arXiv preprint arXiv:2111.06377},
year={2021}
}
@article{caron2021emerging,
title={Emerging Properties in Self-Supervised Vision Transformers},
author={Caron, Mathilde and Touvron, Hugo and Misra, Ishan and J{\'e}gou, Herv{\'e} and Mairal, Julien and Bojanowski, Piotr and Joulin, Armand},
journal={arXiv preprint arXiv:2104.14294},
year={2021}
}
@article{liu2021swin,
title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows},
author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},
journal={arXiv preprint arXiv:2103.14030},
year={2021}
}
@article{touvron2021going,
title={Going Deeper with Image Transformers},
author={Touvron, Hugo and Cord, Matthieu and Douze, Matthijs and Massa, Francesco and Sablayrolles, Alexandre and J{\'e}gou, Herv{\'e}},
journal={arXiv preprint arXiv:2103.17239},
year={2021}
}
@article{assael2016lipnet,
title={LipNet: End-to-End Sentence-level Lipreading},
author={Assael, Yannis M. and Shillingford, Brendan and Whiteson, Shimon and de Freitas, Nando},
journal={arXiv preprint arXiv:1611.01599},
year={2016}
}
@article{radford2021learning,
title={Learning Transferable Visual Models From Natural Language Supervision},
author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pam and Clark, Jack and others},
journal={arXiv preprint arXiv:2103.00020},
year={2021}
}