diff --git a/PyTorch/built-in/mm/OpenSora1.1/.isort.cfg b/PyTorch/built-in/mm/OpenSora1.1/.isort.cfg
deleted file mode 100644
index ccbf575fdbfacd185cf880431ad81462e0ae8fdf..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/.isort.cfg
+++ /dev/null
@@ -1,7 +0,0 @@
-[settings]
-line_length = 120
-multi_line_output = 3
-include_trailing_comma = true
-ignore_comments = true
-profile = black
-honor_noqa = true
diff --git a/PyTorch/built-in/mm/OpenSora1.1/LICENSE b/PyTorch/built-in/mm/OpenSora1.1/LICENSE
deleted file mode 100644
index 7327c123dd164dc24fc361a8eaf37c62125c3aa2..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/LICENSE
+++ /dev/null
@@ -1,681 +0,0 @@
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
- =========================================================================
- This project is inspired by the listed projects and is subject to the following licenses:
-
- 1. Latte (https://github.com/Vchitect/Latte/blob/main/LICENSE)
-
- Copyright 2024 Latte
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
- 2. PixArt-alpha (https://github.com/PixArt-alpha/PixArt-alpha/blob/master/LICENSE)
-
- Copyright (C) 2024 PixArt-alpha/PixArt-alpha
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published
- by the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
-    along with this program. If not, see <https://www.gnu.org/licenses/>.
-
- 3. dpm-solver (https://github.com/LuChengTHU/dpm-solver/blob/main/LICENSE)
-
- MIT License
-
- Copyright (c) 2022 Cheng Lu
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-
- 4. DiT (https://github.com/facebookresearch/DiT/blob/main/LICENSE.txt)
-
- Attribution-NonCommercial 4.0 International
-
- =======================================================================
-
- Creative Commons Corporation ("Creative Commons") is not a law firm and
- does not provide legal services or legal advice. Distribution of
- Creative Commons public licenses does not create a lawyer-client or
- other relationship. Creative Commons makes its licenses and related
- information available on an "as-is" basis. Creative Commons gives no
- warranties regarding its licenses, any material licensed under their
- terms and conditions, or any related information. Creative Commons
- disclaims all liability for damages resulting from their use to the
- fullest extent possible.
-
- Using Creative Commons Public Licenses
-
- Creative Commons public licenses provide a standard set of terms and
- conditions that creators and other rights holders may use to share
- original works of authorship and other material subject to copyright
- and certain other rights specified in the public license below. The
- following considerations are for informational purposes only, are not
- exhaustive, and do not form part of our licenses.
-
- Considerations for licensors: Our public licenses are
- intended for use by those authorized to give the public
- permission to use material in ways otherwise restricted by
- copyright and certain other rights. Our licenses are
- irrevocable. Licensors should read and understand the terms
- and conditions of the license they choose before applying it.
- Licensors should also secure all rights necessary before
- applying our licenses so that the public can reuse the
- material as expected. Licensors should clearly mark any
- material not subject to the license. This includes other CC-
- licensed material, or material used under an exception or
- limitation to copyright. More considerations for licensors:
- wiki.creativecommons.org/Considerations_for_licensors
-
- Considerations for the public: By using one of our public
- licenses, a licensor grants the public permission to use the
- licensed material under specified terms and conditions. If
- the licensor's permission is not necessary for any reason--for
- example, because of any applicable exception or limitation to
- copyright--then that use is not regulated by the license. Our
- licenses grant only permissions under copyright and certain
- other rights that a licensor has authority to grant. Use of
- the licensed material may still be restricted for other
- reasons, including because others have copyright or other
- rights in the material. A licensor may make special requests,
- such as asking that all changes be marked or described.
- Although not required by our licenses, you are encouraged to
- respect those requests where reasonable. More_considerations
- for the public:
- wiki.creativecommons.org/Considerations_for_licensees
-
- =======================================================================
-
- Creative Commons Attribution-NonCommercial 4.0 International Public
- License
-
- By exercising the Licensed Rights (defined below), You accept and agree
- to be bound by the terms and conditions of this Creative Commons
- Attribution-NonCommercial 4.0 International Public License ("Public
- License"). To the extent this Public License may be interpreted as a
- contract, You are granted the Licensed Rights in consideration of Your
- acceptance of these terms and conditions, and the Licensor grants You
- such rights in consideration of benefits the Licensor receives from
- making the Licensed Material available under these terms and
- conditions.
-
- Section 1 -- Definitions.
-
- a. Adapted Material means material subject to Copyright and Similar
- Rights that is derived from or based upon the Licensed Material
- and in which the Licensed Material is translated, altered,
- arranged, transformed, or otherwise modified in a manner requiring
- permission under the Copyright and Similar Rights held by the
- Licensor. For purposes of this Public License, where the Licensed
- Material is a musical work, performance, or sound recording,
- Adapted Material is always produced where the Licensed Material is
- synched in timed relation with a moving image.
-
- b. Adapter's License means the license You apply to Your Copyright
- and Similar Rights in Your contributions to Adapted Material in
- accordance with the terms and conditions of this Public License.
-
- c. Copyright and Similar Rights means copyright and/or similar rights
- closely related to copyright including, without limitation,
- performance, broadcast, sound recording, and Sui Generis Database
- Rights, without regard to how the rights are labeled or
- categorized. For purposes of this Public License, the rights
- specified in Section 2(b)(1)-(2) are not Copyright and Similar
- Rights.
- d. Effective Technological Measures means those measures that, in the
- absence of proper authority, may not be circumvented under laws
- fulfilling obligations under Article 11 of the WIPO Copyright
- Treaty adopted on December 20, 1996, and/or similar international
- agreements.
-
- e. Exceptions and Limitations means fair use, fair dealing, and/or
- any other exception or limitation to Copyright and Similar Rights
- that applies to Your use of the Licensed Material.
-
- f. Licensed Material means the artistic or literary work, database,
- or other material to which the Licensor applied this Public
- License.
-
- g. Licensed Rights means the rights granted to You subject to the
- terms and conditions of this Public License, which are limited to
- all Copyright and Similar Rights that apply to Your use of the
- Licensed Material and that the Licensor has authority to license.
-
- h. Licensor means the individual(s) or entity(ies) granting rights
- under this Public License.
-
- i. NonCommercial means not primarily intended for or directed towards
- commercial advantage or monetary compensation. For purposes of
- this Public License, the exchange of the Licensed Material for
- other material subject to Copyright and Similar Rights by digital
- file-sharing or similar means is NonCommercial provided there is
- no payment of monetary compensation in connection with the
- exchange.
-
- j. Share means to provide material to the public by any means or
- process that requires permission under the Licensed Rights, such
- as reproduction, public display, public performance, distribution,
- dissemination, communication, or importation, and to make material
- available to the public including in ways that members of the
- public may access the material from a place and at a time
- individually chosen by them.
-
- k. Sui Generis Database Rights means rights other than copyright
- resulting from Directive 96/9/EC of the European Parliament and of
- the Council of 11 March 1996 on the legal protection of databases,
- as amended and/or succeeded, as well as other essentially
- equivalent rights anywhere in the world.
-
- l. You means the individual or entity exercising the Licensed Rights
- under this Public License. Your has a corresponding meaning.
-
- Section 2 -- Scope.
-
- a. License grant.
-
- 1. Subject to the terms and conditions of this Public License,
- the Licensor hereby grants You a worldwide, royalty-free,
- non-sublicensable, non-exclusive, irrevocable license to
- exercise the Licensed Rights in the Licensed Material to:
-
- a. reproduce and Share the Licensed Material, in whole or
- in part, for NonCommercial purposes only; and
-
- b. produce, reproduce, and Share Adapted Material for
- NonCommercial purposes only.
-
- 2. Exceptions and Limitations. For the avoidance of doubt, where
- Exceptions and Limitations apply to Your use, this Public
- License does not apply, and You do not need to comply with
- its terms and conditions.
-
- 3. Term. The term of this Public License is specified in Section
- 6(a).
-
- 4. Media and formats; technical modifications allowed. The
- Licensor authorizes You to exercise the Licensed Rights in
- all media and formats whether now known or hereafter created,
- and to make technical modifications necessary to do so. The
- Licensor waives and/or agrees not to assert any right or
- authority to forbid You from making technical modifications
- necessary to exercise the Licensed Rights, including
- technical modifications necessary to circumvent Effective
- Technological Measures. For purposes of this Public License,
- simply making modifications authorized by this Section 2(a)
- (4) never produces Adapted Material.
-
- 5. Downstream recipients.
-
- a. Offer from the Licensor -- Licensed Material. Every
- recipient of the Licensed Material automatically
- receives an offer from the Licensor to exercise the
- Licensed Rights under the terms and conditions of this
- Public License.
-
- b. No downstream restrictions. You may not offer or impose
- any additional or different terms or conditions on, or
- apply any Effective Technological Measures to, the
- Licensed Material if doing so restricts exercise of the
- Licensed Rights by any recipient of the Licensed
- Material.
-
- 6. No endorsement. Nothing in this Public License constitutes or
- may be construed as permission to assert or imply that You
- are, or that Your use of the Licensed Material is, connected
- with, or sponsored, endorsed, or granted official status by,
- the Licensor or others designated to receive attribution as
- provided in Section 3(a)(1)(A)(i).
-
- b. Other rights.
-
- 1. Moral rights, such as the right of integrity, are not
- licensed under this Public License, nor are publicity,
- privacy, and/or other similar personality rights; however, to
- the extent possible, the Licensor waives and/or agrees not to
- assert any such rights held by the Licensor to the limited
- extent necessary to allow You to exercise the Licensed
- Rights, but not otherwise.
-
- 2. Patent and trademark rights are not licensed under this
- Public License.
-
- 3. To the extent possible, the Licensor waives any right to
- collect royalties from You for the exercise of the Licensed
- Rights, whether directly or through a collecting society
- under any voluntary or waivable statutory or compulsory
- licensing scheme. In all other cases the Licensor expressly
- reserves any right to collect such royalties, including when
- the Licensed Material is used other than for NonCommercial
- purposes.
-
- Section 3 -- License Conditions.
-
- Your exercise of the Licensed Rights is expressly made subject to the
- following conditions.
-
- a. Attribution.
-
- 1. If You Share the Licensed Material (including in modified
- form), You must:
-
- a. retain the following if it is supplied by the Licensor
- with the Licensed Material:
-
- i. identification of the creator(s) of the Licensed
- Material and any others designated to receive
- attribution, in any reasonable manner requested by
- the Licensor (including by pseudonym if
- designated);
-
- ii. a copyright notice;
-
- iii. a notice that refers to this Public License;
-
- iv. a notice that refers to the disclaimer of
- warranties;
-
- v. a URI or hyperlink to the Licensed Material to the
- extent reasonably practicable;
-
- b. indicate if You modified the Licensed Material and
- retain an indication of any previous modifications; and
-
- c. indicate the Licensed Material is licensed under this
- Public License, and include the text of, or the URI or
- hyperlink to, this Public License.
-
- 2. You may satisfy the conditions in Section 3(a)(1) in any
- reasonable manner based on the medium, means, and context in
- which You Share the Licensed Material. For example, it may be
- reasonable to satisfy the conditions by providing a URI or
- hyperlink to a resource that includes the required
- information.
-
- 3. If requested by the Licensor, You must remove any of the
- information required by Section 3(a)(1)(A) to the extent
- reasonably practicable.
-
- 4. If You Share Adapted Material You produce, the Adapter's
- License You apply must not prevent recipients of the Adapted
- Material from complying with this Public License.
-
- Section 4 -- Sui Generis Database Rights.
-
- Where the Licensed Rights include Sui Generis Database Rights that
- apply to Your use of the Licensed Material:
-
- a. for the avoidance of doubt, Section 2(a)(1) grants You the right
- to extract, reuse, reproduce, and Share all or a substantial
- portion of the contents of the database for NonCommercial purposes
- only;
-
- b. if You include all or a substantial portion of the database
- contents in a database in which You have Sui Generis Database
- Rights, then the database in which You have Sui Generis Database
- Rights (but not its individual contents) is Adapted Material; and
-
- c. You must comply with the conditions in Section 3(a) if You Share
- all or a substantial portion of the contents of the database.
-
- For the avoidance of doubt, this Section 4 supplements and does not
- replace Your obligations under this Public License where the Licensed
- Rights include other Copyright and Similar Rights.
-
- Section 5 -- Disclaimer of Warranties and Limitation of Liability.
-
- a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
- EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
- AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
- ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
- IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
- WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
- PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
- ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
- KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
- ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
-
- b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
- TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
- NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
- INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
- COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
- USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
- ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
- DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
- IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
-
- c. The disclaimer of warranties and limitation of liability provided
- above shall be interpreted in a manner that, to the extent
- possible, most closely approximates an absolute disclaimer and
- waiver of all liability.
-
- Section 6 -- Term and Termination.
-
- a. This Public License applies for the term of the Copyright and
- Similar Rights licensed here. However, if You fail to comply with
- this Public License, then Your rights under this Public License
- terminate automatically.
-
- b. Where Your right to use the Licensed Material has terminated under
- Section 6(a), it reinstates:
-
- 1. automatically as of the date the violation is cured, provided
- it is cured within 30 days of Your discovery of the
- violation; or
-
- 2. upon express reinstatement by the Licensor.
-
- For the avoidance of doubt, this Section 6(b) does not affect any
- right the Licensor may have to seek remedies for Your violations
- of this Public License.
-
- c. For the avoidance of doubt, the Licensor may also offer the
- Licensed Material under separate terms or conditions or stop
- distributing the Licensed Material at any time; however, doing so
- will not terminate this Public License.
-
- d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
- License.
-
- Section 7 -- Other Terms and Conditions.
-
- a. The Licensor shall not be bound by any additional or different
- terms or conditions communicated by You unless expressly agreed.
-
- b. Any arrangements, understandings, or agreements regarding the
- Licensed Material not stated herein are separate from and
- independent of the terms and conditions of this Public License.
-
- Section 8 -- Interpretation.
-
- a. For the avoidance of doubt, this Public License does not, and
- shall not be interpreted to, reduce, limit, restrict, or impose
- conditions on any use of the Licensed Material that could lawfully
- be made without permission under this Public License.
-
- b. To the extent possible, if any provision of this Public License is
- deemed unenforceable, it shall be automatically reformed to the
- minimum extent necessary to make it enforceable. If the provision
- cannot be reformed, it shall be severed from this Public License
- without affecting the enforceability of the remaining terms and
- conditions.
-
- c. No term or condition of this Public License will be waived and no
- failure to comply consented to unless expressly agreed to by the
- Licensor.
-
- d. Nothing in this Public License constitutes or may be interpreted
- as a limitation upon, or waiver of, any privileges and immunities
- that apply to the Licensor or You, including from the legal
- processes of any jurisdiction or authority.
-
- =======================================================================
-
- Creative Commons is not a party to its public
- licenses. Notwithstanding, Creative Commons may elect to apply one of
- its public licenses to material it publishes and in those instances
- will be considered the “Licensor.” The text of the Creative Commons
- public licenses is dedicated to the public domain under the CC0 Public
- Domain Dedication. Except for the limited purpose of indicating that
- material is shared under a Creative Commons public license or as
- otherwise permitted by the Creative Commons policies published at
- creativecommons.org/policies, Creative Commons does not authorize the
- use of the trademark "Creative Commons" or any other trademark or logo
- of Creative Commons without its prior written consent including,
- without limitation, in connection with any unauthorized modifications
- to any of its public licenses or any other arrangements,
- understandings, or agreements concerning use of licensed material. For
- the avoidance of doubt, this paragraph does not form part of the
- public licenses.
-
- Creative Commons may be contacted at creativecommons.org.
-
- 5. OpenDiT (https://github.com/NUS-HPC-AI-Lab/OpenDiT/blob/master/LICENSE)
-
- Copyright OpenDiT
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-
diff --git a/PyTorch/built-in/mm/OpenSora1.1/README.md b/PyTorch/built-in/mm/OpenSora1.1/README.md
deleted file mode 100644
index 50a728a4fecbbf92f491619abc589412c9686e28..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/README.md
+++ /dev/null
@@ -1,254 +0,0 @@
-
-# OpenSora1.1 for PyTorch
-# Table of Contents
-
-- [OpenSora1.1 for PyTorch](#opensora11-for-pytorch)
-- [Table of Contents](#table-of-contents)
-- [Introduction](#introduction)
-  - [Model Description](#model-description)
-  - [Supported Tasks](#supported-tasks)
-  - [Reference Implementation](#reference-implementation)
-- [STDiT2 (in-development version)](#stdit2-in-development-version)
-  - [Preparing the Training Environment](#preparing-the-training-environment)
-    - [Installing the Model Environment](#installing-the-model-environment)
-    - [Installing the Ascend Environment](#installing-the-ascend-environment)
-  - [Preparing the Dataset](#preparing-the-dataset)
-    - [Training Dataset Preparation](#training-dataset-preparation)
-  - [Obtaining Pretrained Models](#obtaining-pretrained-models)
-  - [Quick Start](#quick-start)
-    - [Training](#training)
-      - [Starting Training](#starting-training)
-    - [Inference](#inference)
-      - [Starting Inference](#starting-inference)
-- [Public Network Address Statement](#public-network-address-statement)
-- [Change Log](#change-log)
-  - [Changes](#changes)
-- [FAQ](#faq)
-
-# Introduction
-## Model Description
-
-OpenSora is an open-source, efficient Sora-like video generation solution developed by HPC AI Tech. It not only makes advanced video generation technology accessible at low cost, but also provides a streamlined, user-friendly workflow that reduces the complexity of video production.
-This repository ports the STDiT2 model tasks of OpenSora1.1 to Ascend NPUs and applies extensive performance optimizations.
-
-## Supported Tasks
-
-This repository currently supports the following model tasks.
-
-|    Model    |       Task       | Supported |
-|:-----------:|:----------------:|:---------:|
-| STDiT2-XL/2 | Online training  |     ✔     |
-| STDiT2-XL/2 | Online inference |     ✔     |
-
-
-## Reference Implementation
-
-- Reference implementation:
-
- ```
- url=https://github.com/hpcaitech/Open-Sora
- commit_id=74b645350b0f7a0ed802f87243c23edd1504c26d
- ```
-
-- Implementation adapted for Ascend AI processors:
-
- ```
- url=https://gitee.com/ascend/ModelZoo-PyTorch.git
- code_path=PyTorch/built-in/mm/
- ```
-
-
-# STDiT2 (in-development version)
-
-## Preparing the Training Environment
-
-### Installing the Model Environment
-
-
- **Table 3** Supported third-party library versions
-
-  | Third-party library | Supported version |
-  |:-------------------:|:-----------------:|
-  |       PyTorch       |       2.1.0       |
-  |     TorchVision     |      0.16.0       |
-
-
- Run the following commands in the model root directory to install the dependencies required for the model's PyTorch version.
-
-
-  ```shell
-  source ${cann_install_path}/ascend-toolkit/set_env.sh  # activate the CANN environment
-  cd OpenSora1.1
-  pip install -v -e .  # install this repository locally; dependencies are installed automatically
-  ```
-  Install MindSpeed:
- ```
- git clone https://gitee.com/ascend/MindSpeed.git
- pip install -e MindSpeed
- ```
-  Obtain Megatron-LM and check out the specified commit:
- ```
- git clone https://github.com/NVIDIA/Megatron-LM.git
- cd Megatron-LM
- git checkout core_r0.6.0
- ```
-
-### Installing the Ascend Environment
-
-  Set up the Ascend environment by following the [PyTorch Framework Training Environment Preparation](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes) guide on the Ascend community site. This repository supports the software versions listed in Table 4.
-
-
- **Table 4** Supported Ascend software versions
-
-  |       Software       |   Supported version    |
-  |:--------------------:|:----------------------:|
-  |  FrameworkPTAdapter  | in-development version |
-  |         CANN         | in-development version |
-  | Ascend NPU firmware  | in-development version |
-  |  Ascend NPU driver   | in-development version |
-
-
-
-### Preparing the Dataset
-#### Training Dataset Preparation
-For dataset preparation, refer to the official documentation:
-https://github.com/hpcaitech/Open-Sora?tab=readme-ov-file#data-processing
-
-### Obtaining Pretrained Models
-
-1. With network access, the pretrained models are downloaded automatically.
-
-2. Without network access, download them manually from the Hugging Face website; the repository namespaces are listed below (a scripted download sketch follows the list):
-
- ```
-   PixArt-alpha/PixArt-alpha             # PixArt-XL-2-512x512 model (for training)
-   stabilityai/sd-vae-ft-ema             # VAE model
-   DeepFloyd/t5-v1_1-xxl                 # T5 model
-   hpcai-tech/OpenSora-STDiT-v2-stage2   # pretrained weights (for inference)
-   hpcai-tech/OpenSora-STDiT-v2-stage3   # pretrained weights (for inference)
- ```
-
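-   For reference only, a minimal download sketch assuming the `huggingface_hub` Python package is available on a machine with network access (the target directories simply mirror the namespaces above; adjust paths as needed):
-   ```python
-   # Hedged example: pre-download the Hugging Face repositories listed above.
-   from huggingface_hub import snapshot_download
-
-   repos = [
-       "PixArt-alpha/PixArt-alpha",
-       "stabilityai/sd-vae-ft-ema",
-       "DeepFloyd/t5-v1_1-xxl",
-       "hpcai-tech/OpenSora-STDiT-v2-stage2",
-       "hpcai-tech/OpenSora-STDiT-v2-stage3",
-   ]
-   for repo_id in repos:
-       # Each snapshot is stored in a local directory named after its repo id.
-       snapshot_download(repo_id=repo_id, local_dir=repo_id)
-   ```
-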
-   > **Note:**
-   > For online inference, the `hpcai-tech/OpenSora-STDiT-v2-stage2` and `hpcai-tech/OpenSora-STDiT-v2-stage3` checkpoints must first be converted offline to the .pth format. A reference example:
-   > ```python
-   > import os
-   > import torch
-   > import safetensors.torch  # the torch submodule must be imported explicitly
-   > data = safetensors.torch.load_file('./hpcai-tech/OpenSora-STDiT-v2-stage2/model.safetensors')
-   > data["state_dict"] = data  # also expose the weights under a "state_dict" key
-   > torch.save(data, os.path.splitext('./hpcai-tech/OpenSora-STDiT-v2-stage2/model.safetensors')[0] + '.pth')
-   > ```
-
-
-3. After obtaining the pretrained models, set the `from_pretrained` parameters of `model` and `vae` in the following configuration files to the absolute paths of the local pretrained models (an illustrative fragment follows the list).
- ```shell
- configs/opensora-v1-1/inference/sample.py
- configs/opensora-v1-1/train/stage1.py
- configs/opensora-v1-1/train/stage2.py
- configs/opensora-v1-1/train/stage3.py
- ```
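-   For illustration only, a sketch of how those fields might look in one of the configs; the exact keys, model types, and other entries in the repository's config files may differ:
-   ```python
-   # Illustrative fragment of configs/opensora-v1-1/train/stage1.py (field values assumed).
-   model = dict(
-       type="STDiT2-XL/2",
-       from_pretrained="/abs/path/to/PixArt-XL-2-512x512.pth",  # local pretrained weights
-   )
-   vae = dict(
-       type="VideoAutoencoderKL",
-       from_pretrained="/abs/path/to/sd-vae-ft-ema",  # local VAE weights
-   )
-   ```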
-
-4. Place the downloaded T5 model in the `DeepFloyd` directory of this project, organized as follows:
- ```
- $OpenSora1.1
- ├── DeepFloyd
- ├── ├── t5-v1_1-xxl
- ├── ├── ├── config.json
- ├── ├── ├── pytorch_model-00001-of-00002.bin
- ├── ├── ├── ...
- └── ...
- ```
-
-## Quick Start
-### Training
-This section demonstrates training, mainly from the pretrained models, covering single-node single-card and single-node multi-card training.
-#### Starting Training
-1. Enter the root directory of the extracted source package.
-
-   ```
-   cd /${model_folder_name}
-   ```
-2. Prepare the training data.
-Following the official workflow, prepare the dataset, process the data, and produce a CSV file containing the data information; place it under the model directory as shown below:
- ```
- $OpenSora1.1
- ├── train_data.csv
- └── ...
- ```
-
-3. Run the training script.
-
-   Configure the parameters according to your training needs. One single-card and one multi-card training example are given below.
-   ```shell
-   bash test/train_full_1p_opensorav1_1.sh --data_path=train_data.csv
-   # BF16 mixed precision, single-card training, stage1
-   ```
-
-   ```shell
-   bash test/train_full_8p_opensorav1_1.sh --data_path=train_data.csv
-   # BF16 mixed precision, 8-card training, stage1
-   ```
-   For this model, core-binding optimization can be applied by launching with core binding.
-   For the binding method, see: https://gitee.com/ascend/att/tree/master/profiler/affinity_cpu_bind
-   An example for this model:
-   ```
-   python3 bind_core.py \
-   -app="bash test/train_full_8p_opensorav1_1.sh --data_path=train_data.csv"
-   ```
-
-### Inference
-This section demonstrates inference, mainly with the pretrained models, covering single-card online inference.
-#### Starting Inference
-1. Enter the root directory of the extracted source package.
-
-   ```
-   cd /${model_folder_name}
-   ```
-
-
-2. Run the inference script.
-
-- Single-node single-card inference
-  ```shell
-  bash test/infer_full_1p_opensorav1_1.sh --ckpt_path=/path/to/OpenSora-STDiT-v2-stage3/model.pth  # BF16 mixed precision online inference
-  ```
-- The inference script parameters are described below.
- ```shell
-  test/infer_full_1p_opensorav1_1.sh
-  --batch_size              // batch size
-  --ckpt_path               // path of the checkpoint loaded for inference
-  --prompt                  // prompt used for testing
-  --num_frames              // total number of frames in the generated video
-  --img_h                   // height of the generated video
-  --img_w                   // width of the generated video
-
-  scripts/inference.py
-  config                    // path of the config file
-  --seed                    // random seed
-  --ckpt-path               // path of the checkpoint file loaded for inference
-  --batch-size              // batch size
-  --prompt-path             // path of the prompt file used for inference
-  --prompt                  // prompt used for testing
-  --num-frames              // total number of frames in the generated video
-  --image-size              // resolution of the generated video
-  --fps                     // frame rate of the generated video
-  --save-dir                // output directory for the generated videos
-  --num-sampling-steps      // number of sampling steps for inference
-  --cfg-scale               // classifier-free guidance scale
- ```
-
-
-
-# Public Network Address Statement
-For public network addresses involved in the code, see public_address_statement.md.
-
-# Change Log
-
-## Changes
-
-2024.04.29: Initial release of OpenSora1.1 STDiT2 BF16 training and inference tasks.
-
-# FAQ
-
-
-
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/all_category.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/all_category.txt
deleted file mode 100644
index fd797edb2107daadc90636e3830eb9387380ff6f..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/all_category.txt
+++ /dev/null
@@ -1,800 +0,0 @@
-a black dog wearing halloween costume
-spider making a web
-bat eating fruits while hanging
-a snake crawling on a wooden flooring
-a close up video of a dragonfly
-macro shot of ladybug on green leaf plant
-chameleon eating ant
-a bee feeding on nectars
-bird nests on a tree captured with moving camera
-a squirrel eating nuts
-close up video of snail
-top view of a hermit crab crawling on a wooden surface
-cat licking another cat
-red dragonfly perched on green leaf
-close up view of a brown caterpillar crawling on green leaf
-ants eating dead spider
-an eagle on a tree branch
-a frog eating an ant
-white rabbit near the fence
-a gorilla eating a carrot
-close up of wolf
-a meerkat looking around
-a hyena in a zoo
-lemur eating grass leaves
-an owl being trained by a man
-a lizard on a bamboo
-brown chicken hunting for its food
-video of parrots perched on bird stand
-underwater footage of an octopus in a coral reef
-a cute pomeranian dog playing with a soccer ball
-white fox on rock
-close up footage of a horse figurine
-giraffe feeding on a tree in a savannah
-curious cat sitting and looking around
-hummingbird hawk moth flying near pink flowers
-close up of a scorpion on a rock
-close up on fish in net
-koala eating leaves from a branch
-a pod of dolphins swirling in the sea catching forage fish
-low angle view of a hawk perched on a tree branch
-a lion standing on wild grass
-deer grazing in the field
-elephant herd in a savanna
-close up on lobster under water
-hedgehog crossing road in forest
-a sheep eating yellow flowers from behind a wire fence
-twin sisters and a turtle
-a pig wallowing in mud
-flock of goose eating on the lake water
-cow in a field irritated with flies
-a close up shot of a fly
-cheetah lying on the grass
-close up of a lemur
-close up shot of a kangaroo itching in the sand
-a tortoise covered with algae
-turkey in cage
-a great blue heron bird in the lakeside
-crab with shell in aquarium
-a seagull walking on shore
-an american crocodile
-a tiger walking inside a cage
-alligator in the nature
-a raccoon climbing a tree
-wild rabbit in a green meadow
-group of ring tailed lemurs
-a clouded leopard on a tree branch
-duck grooming its feathers
-an african penguin walking on a beach
-a video of a peacock
-close up shot of a wild bear
-baby rhino plays with mom
-porcupine climbs tree branches
-close up of a natterjack toad on a rock
-a sleeping orangutan
-mother whale swimming with babies
-a bear wearing red jersey
-pink jellyfish swimming underwater in a blue sea
-beautiful clown fish swimming
-animation of disposable objects shaped as a whale
-paper cut out of a pair of hands a whale and a heart
-vertical video of camel roaming in the field during daytime
-a still video of mosquito biting human
-a curious sloth hanging from a tree branch
-a plastic flamingo bird stumbles from the wind
-a wolf in its natural habitat
-a monkey sitting in the stone and scratching his head
-bat hanging upside down
-a red panda eating leaves
-snake on ground
-a harbour seal swimming near the shore
-shark swimming in the sea
-otter on branch while eating
-goat standing over a rock
-a troop of monkey on top of a mountain
-a zebra eating grass on the field
-a colorful butterfly perching on a bud
-a snail crawling on a leaf
-zookeeper showering a baby elephant
-a beetle emerging from the sand
-a nine banded armadillo searching for food
-an apartment building with balcony
-asian garden and medieval castle
-illuminated tower in berlin
-a wooden house overseeing the lake
-a crowd of people in a plaza in front of a government building
-a church interior
-jewish friends posing with hanukkah menorah in a cabin house
-a destroyed building after a missile attack in ukraine
-abandoned building in the woods
-drone video of an abandoned school building in pripyat ukraine
-elegant university building
-architecture and designs of buildings in central london
-a pancake tower with chocolate syrup and strawberries on top
-an ancient white building
-friends hanging out at a coffee house
-house front door with christmas decorations
-city night dark building
-a bird house hanging on a tree branch
-sacred sculpture in a temple
-high angle shot of a clock tower
-modern wooden house interior
-the interior of an abandoned building
-opera house overlooking sea
-a concrete structure near the green trees
-dome like building in scotland
-low angle shot of a building
-tower on hill
-a miniature house
-eiffel tower from the seine river
-low angle footage of an apartment building
-island with pier and antique building
-asian historic architecture
-drone footage of a beautiful mansion
-mosque in the middle east
-building a tent and hammock in the forest camping site
-top view of a high rise building
-house covered in snow
-skyscraper at night
-house in village
-a casino with people outside the building
-silhouette of a building
-a woman climbing a tree house
-drone view of house near lake during golden hour
-an under construction concrete house
-a watch tower by the sea
-exterior view of arabic style building
-video of a hotel building
-red paper lantern decorations hanging outside a building
-house on seashore
-aerial footage of the palace of culture and science building in warsaw poland
-aerial video of stuttgart tv tower in germany
-aerial view of the highway and building in a city
-drone shot of a skyscraper san francisco california usa
-waterfall and house
-view of the sky through a building
-drone footage of a house on top of the mountain
-abandoned house in the nature
-clouds hovering over a mansion
-light house on the ocean
-buddhist temple at sunrise
-people walking by a graveyard near a mosque at sunset
-view of lifeguard tower on the beach
-scenic view of a house in the mountains
-the landscape in front of a government building
-aerial footage of a building and its surrounding landscape in winter
-time lapse of a cloudy sky behind a transmission tower
-blue ocean near the brown castle
-fog over temple
-house in countryside top view
-building under construction
-turkish flag waving on old tower
-the georgian building
-close up shot of a steel structure
-the atrium and interior design of a multi floor building
-city view reflected on a glass building
-aerial view of a luxurious house with pool
-an unpaved road leading to the house
-drone footage of a lookout tower in mountain landscape
-wind turbines on hill behind building
-time lapse footage of the sun light in front of a small house porch
-a building built with lots of stairways
-overcast over house on seashore
-the view of the sydney opera house from the other side of the harbor
-candle on a jar and a house figurine on a surface
-video of a farm and house
-a dilapidated building made of bricks
-a view of a unique building from a moving vehicle
-aerial footage of a tall building in cambodia
-push in shot of a huge house
-a beach house built over a seawall protected from the sea waves
-exotic house surrounded by trees
-drone video of a house surrounded by tropical vegetation
-drone footage of a building beside a pond
-observation tower on hill in forest
-a tree house in the woods
-a video of vessel structure during daytime
-fire in front of illuminated building at night
-a footage of a wooden house on a wheat field
-tilt shot of a solar panel below a light tower
-water tower on the desert
-freshly baked finger looking cookies
-video of fake blood in wine glass
-halloween food art
-a person slicing a vegetable
-a serving of pumpkin dish in a plate
-close up view of green leafy vegetable
-a birthday cake in the plate
-video of a slice papaya fruit
-a muffin with a burning candle and a love sign by a ceramic mug
-a jack o lantern designed cookie
-baked bread with chocolate
-a broccoli soup on wooden table
-a freshly brewed coffee on a pink mug
-grabbing sourdough neapolitan style pizza slices
-person cooking mushrooms in frying pan
-rice grains placed on a reusable cloth bag
-slices of kiwi fruit
-grilling a steak on a pan grill
-close up of bread popping out of a toaster
-man eating noodle
-preparing a cocktail drink
-close up pasta with bacon on plate
-milk and cinnamon rolls
-boy getting a dumpling using chopsticks
-a mother preparing food with her kids
-man using his phone while eating
-fresh salmon salad on a plate
-cutting cucumbers into long thin slices as ingredient for sushi roll
-a steaming cup of tea by the window
-a glass filled with beer
-a kid eating popcorn while watching tv
-close up shot of fried fish on the plate
-a man eating a donut
-person making a vegetarian dish
-spreading cheese on bagel
-close up view of a man drinking red wine
-a couple having breakfast in a restaurant
-a student eating her sandwich
-girl peeling a banana
-red rice in a small bowl
-pancake with blueberry on the top
-green apple fruit on white wooden table
-a man eating a taco by the bar
-making of a burrito
-squeezing lemon into salad
-a chef cutting sushi rolls
-video of a delicious dessert
-deep frying a crab on a wok in high fire
-close up video of a orange juice
-video of a cooked chicken breast
-woman holding a pineapple
-a woman eating a bar of chocolate
-decorating christmas cookie
-squeezing a slice of fruit
-tuna sashimi on a plate
-a strawberry fruit mixed in an alcoholic drink
-preparing hot dogs in a grill
-a woman cutting a tomato
-an orange fruit cut in half
-a coconut fruit with drinking straw
-woman holding a dragon fruit
-a woman pouring hot beverage on a cup
-waffles with whipped cream and fruit
-focus shot of an insect at the bottom of a fruit
-preparing a healthy broccoli dish
-man eating snack at picnic
-close up video of a grilled shrimp skewer
-a woman mixing a smoothie drinks
-close up video of woman having a bite of jelly
-businessman drinking whiskey at the bar counter of a hotel lounge
-cutting an onion with a knife over a wooden chopping board
-fresh lemonade in bottles
-grilling a meat on a charcoal grill
-people enjoying asian cuisine
-close up footage of a hot dish on a clay pot
-pork ribs dish
-waffle with strawberry and syrup for breakfast
-tofu dish with rose garnish
-uncooked pork meat
-egg yolk being dumped over gourmet dish
-tasty brunch dish close up
-little boy pretending to eat the watermelon
-slicing roasted beef
-close up of a chef adding teriyaki sauce to a dish
-flat lay mexican dish
-a person placing an octopus dish on a marble surface
-close up of tea leaves brewing in a glass kettle
-adding fresh herbs to soup dish
-a scoop of roasted coffee beans
-fresh dim sum set up on a bamboo steam tray for cooking
-a girl putting ketchup on food at the kitchen
-cooking on electric stove
-a woman with a slice of a pie
-grapes and wine on a wooden board
-man taking picture of his food
-hamburger and fries on restaurant table
-close up video of japanese food
-a cracker sandwich with cheese filling for snack
-barista preparing matcha tea
-close up of onion rings being deep fried
-people carving a pumpkin
-people sitting on a sofa
-a man with a muertos face painting
-man walking in the dark
-men in front of their computer editing photos
-men loading christmas tree on tow truck
-woman washing the dishes
-woman adding honey to the cinnamon rolls
-two women kissing and smiling
-three women looking at watercolor paintings
-a family wearing paper bag masks
-a family posing for the camera
-a boy covering a rose flower with a dome glass
-boy sitting on grass petting a dog
-a girl in her tennis sportswear
-a girl coloring the cardboard
-silhouette of the couple during sunset
-couple dancing with body paint
-a child playing with water
-a woman with her child sitting on a couch in the living room
-a group of friend place doing hand gestures of agreement
-friends having a group selfie
-friends talking while on the basketball court
-group of people protesting
-a group of campers with a cute dog
-a group of photographers taking pictures at the north western gardens in llandudno north wales
-a group of students laughing and talking
-a group of martial artist warming up
-a person playing golf
-a person walking on a wet wooden bridge
-person doing a leg exercise
-ice hockey athlete on rink
-a young athlete training in swimming
-chess player dusting a chessboard
-baseball player holding his bat
-a bearded man putting a vinyl record on a vinyl player
-an orchestra finishes a performance
-people applauding the performance of the kids
-band performance at the recording studio
-father and his children playing jenga game
-people playing a board game
-man playing a video game
-a man video recording the movie in theater
-man and a woman eating while watching a movie
-movie crew talking together
-a director explaining the movie scene
-man and woman listening to music on car
-man playing music
-couple dancing slow dance with sun glare
-a ballerina practicing in the dance studio
-father and son holding hands
-father and daughter talking together
-a mother and her kids engaged in a video call
-mother and daughter reading a book together
-a mother teaching her daughter playing a violin
-kid in a halloween costume
-a happy kid playing the ukulele
-a chef slicing a cucumber
-chef wearing his gloves properly
-brother and sister using hammock
-girl applying sunblock to her brother
-a girl pushing the chair while her sister is on the chair
-colleagues talking in office building
-fighter practice kicking
-a woman fighter in her cosplay costume
-an engineer holding blueprints while talking with her colleague
-a young woman looking at vr controllers with her friend
-workmates teasing a colleague in the work
-a male police officer talking on the radio
-teacher holding a marker while talking
-teacher writing on her notebook
-a young student attending her online classes
-a student showing his classmates his wand
-a male vendor selling fruits
-a shirtless male climber
-a sound engineer listening to music
-female talking to a psychiatrist in a therapy session
-young female activist posing with flag
-a man in a hoodie and woman with a red bandana talking to each other and smiling
-a medium close up of women wearing kimonos
-a male interviewer listening to a person talking
-a social worker having a conversation with the foster parents
-a farm worker harvesting onions
-worker packing street food
-worker and client at barber shop
-elderly man lifting kettlebell
-mom assisting son in riding a bicycle
-dad watching her daughter eat
-young guy with vr headset
-pregnant woman exercising with trainer
-a fortune teller talking to a client
-wizard doing a ritual on a woman
-a footage of an actor on a movie scene
-a man holding a best actor trophy
-a singer of a music band
-a young singer performing on stage
-young dancer practicing at home
-seller showing room to a couple
-cab driver talking to passenger
-a policeman talking to the car driver
-kids celebrating halloween at home
-little boy helping mother in kitchen
-video of a indoor green plant
-a girl arranges a christmas garland hanging by the kitchen cabinet
-candle burning in dark room
-couple having fun and goofing around the bedroom
-girls jumping up and down in the bedroom
-woman and man in pajamas working from home
-a muslim family sitting and talking in the living room
-family enjoying snack time while sitting in the living room
-woman holding an animal puppet and a little girl playing together at the living room
-kids playing in the indoor tent
-young people celebrating new year at the office
-a woman writing on the sticky note in the office
-a woman exercising at home over a yoga mat
-girls preparing easter decorations at home
-dog on floor in room
-turning on a fluorescent light inside a room
-colleagues talking to each other near the office windows
-a woman recording herself while exercising at home
-music room
-different kind of tools kept in a utility room
-sofa beds and other furniture
-a girl finding her brother reading a book in the bedroom
-an elegant ceramic plant pot and hanging plant on indoor
-furniture inside a bedroom
-interior design of the bar section
-living room with party decoration
-firewood burning in dark room
-a young woman playing the ukulele at home
-woman painting at home
-a woman in a locker room
-video of a bathroom interior
-the interior design of a jewish synagogue
-a woman in protective suit disinfecting the kitchen
-modern minimalist home interior
-modern interior design of a coffee shop
-person arranging minimalist furniture
-aerial shot of interior of the warehouse
-a room of a manufacturing facility
-interior of catholic
-interior design of a restaurant
-a female model in a changing room looking herself in mirror
-men walking in the office hallway
-people sitting in a conference room
-the interior design of a shopping mall
-chandeliers in room
-lucerne railway station interior
-a female fencer posing in a foggy room
-a toolbox and a paint roller beside a huge package in a room
-bedroom in hotel
-a woman lying in the operating room
-a chef holding and checking kitchen utensils
-a couple singing in the shower room together
-a woman cleaning mess in the living room
-an empty meeting room with natural light
-person dancing in a dark room
-close up on blood in hospital room
-a couple resting on their home floor
-a young female staff at courier office
-a man entering the gym locker room
-a bored man sitting by the tv at home
-woman dancing in indoor garden
-rubble in the interior of an abandoned house
-indoor farm in a greenhouse
-man doing handstand in indoor garden
-an abandoned indoor swimming pool
-home decorations on top of a cabinet
-graffiti art on the interior walls of an abandoned mansion
-indoor wall climbing activity
-sunlight inside a room
-teenage girl roller skating at indoor rink
-home deco with lighted
-baby in the shower room
-men enjoying office christmas party
-a bedroom with a brick wall
-actors prepping in the dressing room
-kids playing at an indoor playground
-a person sanitizing an office space using smoke machine
-mother and daughter choosing clothes at home
-a woman sitting by the indoor fire pit
-man standing on the corner of the room while looking around
-person assembling furniture
-a family stacking cardboard boxes in a room
-family having fun in the dining room
-person disinfecting a room
-a woman washing strawberries in the kitchen sink
-modern office waiting room
-close up view of a person slicing with a kitchen knife
-boiling coffee on a stove in the kitchen
-modern equipment used in a home studio
-interior of a recording studio
-people working in a call center office
-band performing at a home concert
-a group of people watching a concert in a room
-people packing their furniture
-young employees in office holding a certificate
-a criminal inside a dark room handcuffed in a table
-couple browsing and looking for furniture in the store
-workspace at home
-video of a indoor green plant
-close up view of a plant
-close up shot of a burning plant
-plucking leaves from plant
-a plant on gold pot with glass lid
-a branch of a tree and a plant
-a leafless tree
-close up shot of fern leaf
-close up video of strawberry plant
-plant with blooming flowers
-close up video of flower petals
-watering yellow plant
-beautiful flower decoration
-cannabis flower in a jar
-a footage of the tree leaves
-a red leaf plant
-close up view of a white christmas tree
-snow pouring on a tree
-close up shot of white flowers on the tree
-leaves in the trees daytime
-a dead tree lying on a grass field
-tree branches in a flowing river
-purple flowers with leaves
-a coconut tree by the house
-close up on flower in winter
-bamboo leaves backlit by the sun
-close up video of a wet flower
-a man putting a flower in a box
-dropping flower petals on a wooden bowl
-a close up shot of gypsophila flower
-variety of succulent plants on a garden
-variety of trees and plants in a botanical garden
-forest of deciduous trees
-a stack of dried leaves burning in a forest
-tall forest trees on a misty morning
-close up view of dewdrops on a leaf
-close up view of white petaled flower
-removing a pineapple leaf
-a dragonfly perched on a leaf
-butterfly pollinating flower
-person visiting and checking a corn plant
-woman picking beans from a plant
-woman plucking mint leaves
-single tree in the middle of farmland
-a plant on a soil
-drone footage of a tree on farm field
-a tractor harvesting lavender flower
-people putting christmas ornaments on a christmas tree
-jack o lantern hanging on a tree
-tree with halloween decoration
-flower field near the waterfall
-truck carrying the tree logs
-raindrops falling on leaves
-shot of a palm tree swaying with the wind
-squirrels on a tree branch
-person holding a flower
-a fallen tree trunk
-tree with golden leaves
-cherry tree
-wind blows through leaves of the tree in autumn
-a leaf on a glass
-the long trunks of tall trees in the forest
-trees in the forest during sunny day
-close up video of tree bark
-reflection of tree branches
-trunks of many trees in the forest
-tree leaves providing shades from the sun
-leaves swaying in the wind
-low angle shot of baobab tree
-bare trees in forest
-a plant surrounded by fallen leaves
-a couple preparing food and pruning a plant
-a man cutting a tree bark
-oranges on a tree branch
-plant connected on the stones
-video of a sawmill machine cutting tree log
-women drying flower petals
-macro view of an agave plant
-a video of a person tying a plant on a string
-green moss in forest nature
-coconut tree near sea under blue sky
-the canopy of a coconut tree
-a man leaning on a tree at the beach
-a full grown plant on a pot
-candle wax dripping on flower petals
-close up of leaves in autumn
-a woman opening a book with a flower inside
-a man holding leaves looking at the camera
-a shadow of a swaying plant
-a tree and concrete structure under a blue and cloudy sky
-trimming excess leaves on a potted plant
-the changing color of the tree leaves during autumn season
-a gooseberry tree swayed by the wind
-forest trees and a medieval castle at sunset
-woman cut down tree
-an old oak tree in a park across the street from a hotel
-wild flowers growing in a forest ground
-a mossy fountain and green plants in a botanical garden
-mansion with beautiful garden
-ants on a dragon fruit flower
-scenery of desert landscape
-landscape agriculture farm tractor
-burning slash piles in the forest
-graveyard at sunset
-view of a jack o lantern with pumpkins in a smoky garden
-sun view through a spider web
-view of the sea from an abandoned building
-close up view of a full moon
-close up view of lighted candles
-close up view of swaying white flowers and leaves
-scenery of a relaxing beach
-selective focus video of grass during sunny day
-aerial view of brown dry landscape
-fireworks display in the sky at night
-a bonfire near river
-mountain view
-waterfalls in between mountain
-a picturesque view of nature
-exotic view of a riverfront city
-tall trees in the forest under the clear sky
-snow on branches in forest
-stream in the nature
-an airplane flying above the sea of clouds
-scenic video of sunset
-view of houses with bush fence under a blue and cloudy sky
-scenic view from wooden pathway
-scenic view of a tropical beach
-drone footage of waves crashing on beach shore
-a scenic view of the golden hour at norway
-time lapse video of foggy mountain forest
-brown mountain during fall season
-video of ocean during daytime
-boat sailing in the ocean
-top view of yachts
-beautiful scenery of flowing waterfalls and river
-wild ducks paddling on the lake surface
-a relaxing scenery of beach view under cloudy sky
-natural rock formations on beach under cloudy sky
-a palm tree against blue sky
-video of sailboat on a lake during sunset
-aerial view of snow piles
-time lapse of a sunset sky in the countryside
-aerial footage of a statue
-time lapse video of a farm during sunset
-clouds formation in the sky at sunset
-aerial shot of a village
-drone shot of a beautiful sunrise at the mountains
-time lapse video of foggy morning during sunrise
-sun shining between tree leaves at sunrise
-video of lake during dawn
-vehicles traveling on roadway under cloudy sky
-view of golden domed church
-a monument under the blue sky
-firecrackers in the sky
-view of fruit signage in the farm
-a dark clouds over shadowing the full moon
-view of the amazon river
-a big river swamp in a dense forest
-a blooming cherry blossom tree under a blue sky with white clouds
-a river waterfall cascading down the plunge basin
-flooded landscape with palm trees
-a blurry waterfall background
-waterfall in the mountains
-aerial footage of a city at night
-pond by small waterfall in forest
-aerial view of farmlands at the bay of lake
-rice terraces in the countryside
-a highway built across an agricultural area in the countryside
-gloomy morning in the countryside
-drone shot of an abandoned coliseum on a snowy mountain top
-boat sailing in the middle of ocean
-drone shot of the grass field
-natural landscape of mountain and sea with islets developed into a community
-aerial view of zaporizhia in ukraine
-aerial footage of a herd
-an aerial footage of a red sky
-grass and plants growing in the remains of an abandoned house
-view from hill on city
-aerial view on orthodox church
-aerial view of bay in croatia
-a footage of a frozen river
-overlooking view of a city at daylight
-view outside the cemetery
-clear sky with moon over meadow
-clouds over railway
-aerial footage of moving vehicles on the road at night
-aerial view of town and park
-top view of skyscrapers
-top view of the empire state building in manhattan
-top view of the central park in new york city
-sheep running in a grass field
-clear sky over factory
-smoke and fire in birds eye view
-view of a pathway with snow melting on its side
-ferry under bridge on river near city in malaysia
-mountain slopes covered in green vegetation
-panoramic view of a town surrounded by snow covered mountains
-aerial view of a palace
-top view of vehicles driving on the intersection
-a graveyard by a church in a mountain landscape
-a modern railway station in malaysia use for public transportation
-drone footage of amsterdam metro station
-train arriving at a station
-red vehicle driving on field
-close up view of flashing emergency vehicle lighting
-vehicle with fertilizer on field
-a highway built across an agricultural area in the countryside
-drone footage of motorcycles driving on country road between agricultural fields
-a road in the woods under fog
-footage of a car driving through a wheat field
-vehicle stops for an ambulance passing through city traffic
-emergency vehicle parked outside the casino
-zombies attacking a woman and a boy inside a car
-woman seating inside the car while chewing
-video of passengers riding a double decker bus during night
-traffic in london street at night
-elderly couple checking engine of automobile
-a green vintage automobile with an open hood parked in a parking area
-close up of a prototype automobile with exposed engine on the back seat of the car
-aerial view of road in forest
-train departing from station
-aerial view of a train passing by a bridge
-video of a train tracks
-video footage of a subway
-video of blinking traffic lights
-couple walking out on the subway
-time lapse of a subway tunnel
-monitor board inside the subway
-metro train at night
-zoom in video of a tram passing by city
-young man using laptop in the tram
-man reading a book at bus stop
-close up shot of a moving taxi
-night travel in london street on a public bus
-red bus in a rainy city
-flow of traffic in the city
-close up shot of a yellow taxi turning left
-two women calling for a taxi
-drone view of an illuminated bridge across a river
-policeman in police car talking on radio
-airplane taking off at night
-view through window in airplane
-an airplane in the sky
-helicopter landing on the street
-a pilot getting out of a helicopter
-a helicopter flying under blue sky
-boat sailing in the middle of the ocean
-girl playing with a toy boat
-silhouette of a boat on sea during golden hour
-a boat travelling around the lake
-road on mountain ridge
-ship sailing on danube river
-slow motion video of a ship water trail in the sea
-drone footage of a wreck ship on shore
-a white yacht traveling on a river and passing under the bridge
-female teenagers drinking champagne in the yacht
-video of yacht sailing in the ocean
-red combine harvester on road on field
-a woman sitting on a bicycle while using a mobile phone
-a woman sitting on a motorcycle looking around
-three teenagers fixing a bicycle
-a woman in a halloween costume posing on a motorcycle
-a parked motorcycle on a foggy roadside
-cable car near sea shore
-a truck travelling in the road
-footage of the road without any traffic
-a road sign
-love padlocks on a bridge
-camera moving at highway construction site
-vehicles driving on highway
-a motorbike on highway at timelapse mode
-point of view of a car driving through a tunnel
-time lapse of heavy traffic on an avenue
-ferry boat on city canal
-black vintage car in museum
-a zigzag road across a forest
-people crossing the road
-video of a kayak boat in a river
-a person paddling a wooden boat in a lake
-a car charging in the parking area
-cars parked on the road
-footage of the street with people and vehicle passing by in the rain
-traffic on busy city street
-a woman getting out of the car to walk with their dog
-yacht sailing through the ocean
-people in queue to military ship
-man wearing motorcycle helmet looking at the camera
-empty seats in the bus
-empty boat on the water
-cargo train traveling on the mountainside
-cruise ship in harbor
-counting down at traffic lights
-pressing the car ignition
-fire truck driving on the road
-a footage of a broken bicycle
-drone footage of an ambulance on the road
-slow motion footage of a racing car
-ship sailing on sea against sunset
-big cargo ship passing on the shore
-back view of man and woman walking on unpaved road
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/all_dimension.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/all_dimension.txt
deleted file mode 100644
index f26fbf80daa8be879b25c527dfe583a422d8ccf9..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/all_dimension.txt
+++ /dev/null
@@ -1,946 +0,0 @@
-In a still frame, a stop sign
-a toilet, frozen in time
-a laptop, frozen in time
-A tranquil tableau of alley
-A tranquil tableau of bar
-A tranquil tableau of barn
-A tranquil tableau of bathroom
-A tranquil tableau of bedroom
-A tranquil tableau of cliff
-In a still frame, courtyard
-In a still frame, gas station
-A tranquil tableau of house
-indoor gymnasium, frozen in time
-A tranquil tableau of indoor library
-A tranquil tableau of kitchen
-A tranquil tableau of palace
-In a still frame, parking lot
-In a still frame, phone booth
-A tranquil tableau of restaurant
-A tranquil tableau of tower
-A tranquil tableau of a bowl
-A tranquil tableau of an apple
-A tranquil tableau of a bench
-A tranquil tableau of a bed
-A tranquil tableau of a chair
-A tranquil tableau of a cup
-A tranquil tableau of a dining table
-In a still frame, a pear
-A tranquil tableau of a bunch of grapes
-A tranquil tableau of a bowl on the kitchen counter
-A tranquil tableau of a beautiful, handcrafted ceramic bowl
-A tranquil tableau of an antique bowl
-A tranquil tableau of an exquisite mahogany dining table
-A tranquil tableau of a wooden bench in the park
-A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers
-In a still frame, a park bench with a view of the lake
-A tranquil tableau of a vintage rocking chair was placed on the porch
-A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars
-A tranquil tableau of the phone booth was tucked away in a quiet alley
-a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time
-A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside
-A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow
-In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water
-In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape
-In a still frame, the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens
-In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels
-A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility
-In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity
-static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water
-A tranquil tableau of an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night
-A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water
-In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square
-In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner
-A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy
-A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins
-A tranquil tableau of in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes
-A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved façades
-In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall
-A tranquil tableau of in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels
-A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour
-In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting
-In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light
-A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon
-A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon
-A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space
-In a still frame, amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk
-In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier
-A tranquil tableau of a country estate's library featured elegant wooden shelves
-A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently
-A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm
-A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden
-In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface
-In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation
-A tranquil tableau of a peaceful orchid garden showcased a variety of delicate blooms
-A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time
-a bird and a cat
-a cat and a dog
-a dog and a horse
-a horse and a sheep
-a sheep and a cow
-a cow and an elephant
-an elephant and a bear
-a bear and a zebra
-a zebra and a giraffe
-a giraffe and a bird
-a chair and a couch
-a couch and a potted plant
-a potted plant and a tv
-a tv and a laptop
-a laptop and a remote
-a remote and a keyboard
-a keyboard and a cell phone
-a cell phone and a book
-a book and a clock
-a clock and a backpack
-a backpack and an umbrella
-an umbrella and a handbag
-a handbag and a tie
-a tie and a suitcase
-a suitcase and a vase
-a vase and scissors
-scissors and a teddy bear
-a teddy bear and a frisbee
-a frisbee and skis
-skis and a snowboard
-a snowboard and a sports ball
-a sports ball and a kite
-a kite and a baseball bat
-a baseball bat and a baseball glove
-a baseball glove and a skateboard
-a skateboard and a surfboard
-a surfboard and a tennis racket
-a tennis racket and a bottle
-a bottle and a chair
-an airplane and a train
-a train and a boat
-a boat and an airplane
-a bicycle and a car
-a car and a motorcycle
-a motorcycle and a bus
-a bus and a traffic light
-a traffic light and a fire hydrant
-a fire hydrant and a stop sign
-a stop sign and a parking meter
-a parking meter and a truck
-a truck and a bicycle
-a toilet and a hair drier
-a hair drier and a toothbrush
-a toothbrush and a sink
-a sink and a toilet
-a wine glass and a chair
-a cup and a couch
-a fork and a potted plant
-a knife and a tv
-a spoon and a laptop
-a bowl and a remote
-a banana and a keyboard
-an apple and a cell phone
-a sandwich and a book
-an orange and a clock
-broccoli and a backpack
-a carrot and an umbrella
-a hot dog and a handbag
-a pizza and a tie
-a donut and a suitcase
-a cake and a vase
-an oven and scissors
-a toaster and a teddy bear
-a microwave and a frisbee
-a refrigerator and skis
-a bicycle and an airplane
-a car and a train
-a motorcycle and a boat
-a person and a toilet
-a person and a hair drier
-a person and a toothbrush
-a person and a sink
-A person is riding a bike
-A person is marching
-A person is roller skating
-A person is tasting beer
-A person is clapping
-A person is drawing
-A person is petting animal (not cat)
-A person is eating watermelon
-A person is playing harp
-A person is wrestling
-A person is riding scooter
-A person is sweeping floor
-A person is skateboarding
-A person is dunking basketball
-A person is playing flute
-A person is stretching leg
-A person is tying tie
-A person is skydiving
-A person is shooting goal (soccer)
-A person is playing piano
-A person is finger snapping
-A person is canoeing or kayaking
-A person is laughing
-A person is digging
-A person is clay pottery making
-A person is shooting basketball
-A person is bending back
-A person is shaking hands
-A person is bandaging
-A person is push up
-A person is catching or throwing frisbee
-A person is playing trumpet
-A person is flying kite
-A person is filling eyebrows
-A person is shuffling cards
-A person is folding clothes
-A person is smoking
-A person is tai chi
-A person is squat
-A person is playing controller
-A person is throwing axe
-A person is giving or receiving award
-A person is air drumming
-A person is taking a shower
-A person is planting trees
-A person is sharpening knives
-A person is robot dancing
-A person is rock climbing
-A person is hula hooping
-A person is writing
-A person is bungee jumping
-A person is pushing cart
-A person is cleaning windows
-A person is cutting watermelon
-A person is cheerleading
-A person is washing hands
-A person is ironing
-A person is cutting nails
-A person is hugging
-A person is trimming or shaving beard
-A person is jogging
-A person is making bed
-A person is washing dishes
-A person is grooming dog
-A person is doing laundry
-A person is knitting
-A person is reading book
-A person is baby waking up
-A person is massaging legs
-A person is brushing teeth
-A person is crawling baby
-A person is motorcycling
-A person is driving car
-A person is sticking tongue out
-A person is shaking head
-A person is sword fighting
-A person is doing aerobics
-A person is strumming guitar
-A person is riding or walking with horse
-A person is archery
-A person is catching or throwing baseball
-A person is playing chess
-A person is rock scissors paper
-A person is using computer
-A person is arranging flowers
-A person is bending metal
-A person is ice skating
-A person is climbing a rope
-A person is crying
-A person is dancing ballet
-A person is getting a haircut
-A person is running on treadmill
-A person is kissing
-A person is counting money
-A person is barbequing
-A person is peeling apples
-A person is milking cow
-A person is shining shoes
-A person is making snowman
-A person is sailing
-a person swimming in ocean
-a person giving a presentation to a room full of colleagues
-a person washing the dishes
-a person eating a burger
-a person walking in the snowstorm
-a person drinking coffee in a cafe
-a person playing guitar
-a bicycle leaning against a tree
-a bicycle gliding through a snowy field
-a bicycle slowing down to stop
-a bicycle accelerating to gain speed
-a car stuck in traffic during rush hour
-a car turning a corner
-a car slowing down to stop
-a car accelerating to gain speed
-a motorcycle cruising along a coastal highway
-a motorcycle turning a corner
-a motorcycle slowing down to stop
-a motorcycle gliding through a snowy field
-a motorcycle accelerating to gain speed
-an airplane soaring through a clear blue sky
-an airplane taking off
-an airplane landing smoothly on a runway
-an airplane accelerating to gain speed
-a bus turning a corner
-a bus stuck in traffic during rush hour
-a bus accelerating to gain speed
-a train speeding down the tracks
-a train crossing over a tall bridge
-a train accelerating to gain speed
-a truck turning a corner
-a truck anchored in a tranquil bay
-a truck stuck in traffic during rush hour
-a truck slowing down to stop
-a truck accelerating to gain speed
-a boat sailing smoothly on a calm lake
-a boat slowing down to stop
-a boat accelerating to gain speed
-a bird soaring gracefully in the sky
-a bird building a nest from twigs and leaves
-a bird flying over a snowy forest
-a cat grooming itself meticulously with its tongue
-a cat playing in park
-a cat drinking water
-a cat running happily
-a dog enjoying a peaceful walk
-a dog playing in park
-a dog drinking water
-a dog running happily
-a horse bending down to drink water from a river
-a horse galloping across an open field
-a horse taking a peaceful walk
-a horse running to join a herd of its kind
-a sheep bending down to drink water from a river
-a sheep taking a peaceful walk
-a sheep running to join a herd of its kind
-a cow bending down to drink water from a river
-a cow chewing cud while resting in a tranquil barn
-a cow running to join a herd of its kind
-an elephant spraying itself with water using its trunk to cool down
-an elephant taking a peaceful walk
-an elephant running to join a herd of its kind
-a bear catching a salmon in its powerful jaws
-a bear sniffing the air for scents of food
-a bear climbing a tree
-a bear hunting for prey
-a zebra bending down to drink water from a river
-a zebra running to join a herd of its kind
-a zebra taking a peaceful walk
-a giraffe bending down to drink water from a river
-a giraffe taking a peaceful walk
-a giraffe running to join a herd of its kind
-a person
-a bicycle
-a car
-a motorcycle
-an airplane
-a bus
-a train
-a truck
-a boat
-a traffic light
-a fire hydrant
-a stop sign
-a parking meter
-a bench
-a bird
-a cat
-a dog
-a horse
-a sheep
-a cow
-an elephant
-a bear
-a zebra
-a giraffe
-a backpack
-an umbrella
-a handbag
-a tie
-a suitcase
-a frisbee
-skis
-a snowboard
-a sports ball
-a kite
-a baseball bat
-a baseball glove
-a skateboard
-a surfboard
-a tennis racket
-a bottle
-a wine glass
-a cup
-a fork
-a knife
-a spoon
-a bowl
-a banana
-an apple
-a sandwich
-an orange
-broccoli
-a carrot
-a hot dog
-a pizza
-a donut
-a cake
-a chair
-a couch
-a potted plant
-a bed
-a dining table
-a toilet
-a tv
-a laptop
-a remote
-a keyboard
-a cell phone
-a microwave
-an oven
-a toaster
-a sink
-a refrigerator
-a book
-a clock
-a vase
-scissors
-a teddy bear
-a hair drier
-a toothbrush
-a red bicycle
-a green bicycle
-a blue bicycle
-a yellow bicycle
-an orange bicycle
-a purple bicycle
-a pink bicycle
-a black bicycle
-a white bicycle
-a red car
-a green car
-a blue car
-a yellow car
-an orange car
-a purple car
-a pink car
-a black car
-a white car
-a red bird
-a green bird
-a blue bird
-a yellow bird
-an orange bird
-a purple bird
-a pink bird
-a black bird
-a white bird
-a black cat
-a white cat
-an orange cat
-a yellow cat
-a red umbrella
-a green umbrella
-a blue umbrella
-a yellow umbrella
-an orange umbrella
-a purple umbrella
-a pink umbrella
-a black umbrella
-a white umbrella
-a red suitcase
-a green suitcase
-a blue suitcase
-a yellow suitcase
-an orange suitcase
-a purple suitcase
-a pink suitcase
-a black suitcase
-a white suitcase
-a red bowl
-a green bowl
-a blue bowl
-a yellow bowl
-an orange bowl
-a purple bowl
-a pink bowl
-a black bowl
-a white bowl
-a red chair
-a green chair
-a blue chair
-a yellow chair
-an orange chair
-a purple chair
-a pink chair
-a black chair
-a white chair
-a red clock
-a green clock
-a blue clock
-a yellow clock
-an orange clock
-a purple clock
-a pink clock
-a black clock
-a white clock
-a red vase
-a green vase
-a blue vase
-a yellow vase
-an orange vase
-a purple vase
-a pink vase
-a black vase
-a white vase
-A beautiful coastal beach in spring, waves lapping on sand, Van Gogh style
-A beautiful coastal beach in spring, waves lapping on sand, oil painting
-A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo
-A beautiful coastal beach in spring, waves lapping on sand, black and white
-A beautiful coastal beach in spring, waves lapping on sand, pixel art
-A beautiful coastal beach in spring, waves lapping on sand, in cyberpunk style
-A beautiful coastal beach in spring, waves lapping on sand, animated style
-A beautiful coastal beach in spring, waves lapping on sand, watercolor painting
-A beautiful coastal beach in spring, waves lapping on sand, surrealism style
-The bund Shanghai, Van Gogh style
-The bund Shanghai, oil painting
-The bund Shanghai by Hokusai, in the style of Ukiyo
-The bund Shanghai, black and white
-The bund Shanghai, pixel art
-The bund Shanghai, in cyberpunk style
-The bund Shanghai, animated style
-The bund Shanghai, watercolor painting
-The bund Shanghai, surrealism style
-a shark is swimming in the ocean, Van Gogh style
-a shark is swimming in the ocean, oil painting
-a shark is swimming in the ocean by Hokusai, in the style of Ukiyo
-a shark is swimming in the ocean, black and white
-a shark is swimming in the ocean, pixel art
-a shark is swimming in the ocean, in cyberpunk style
-a shark is swimming in the ocean, animated style
-a shark is swimming in the ocean, watercolor painting
-a shark is swimming in the ocean, surrealism style
-A panda drinking coffee in a cafe in Paris, Van Gogh style
-A panda drinking coffee in a cafe in Paris, oil painting
-A panda drinking coffee in a cafe in Paris by Hokusai, in the style of Ukiyo
-A panda drinking coffee in a cafe in Paris, black and white
-A panda drinking coffee in a cafe in Paris, pixel art
-A panda drinking coffee in a cafe in Paris, in cyberpunk style
-A panda drinking coffee in a cafe in Paris, animated style
-A panda drinking coffee in a cafe in Paris, watercolor painting
-A panda drinking coffee in a cafe in Paris, surrealism style
-A cute happy Corgi playing in park, sunset, Van Gogh style
-A cute happy Corgi playing in park, sunset, oil painting
-A cute happy Corgi playing in park, sunset by Hokusai, in the style of Ukiyo
-A cute happy Corgi playing in park, sunset, black and white
-A cute happy Corgi playing in park, sunset, pixel art
-A cute happy Corgi playing in park, sunset, in cyberpunk style
-A cute happy Corgi playing in park, sunset, animated style
-A cute happy Corgi playing in park, sunset, watercolor painting
-A cute happy Corgi playing in park, sunset, surrealism style
-Gwen Stacy reading a book, Van Gogh style
-Gwen Stacy reading a book, oil painting
-Gwen Stacy reading a book by Hokusai, in the style of Ukiyo
-Gwen Stacy reading a book, black and white
-Gwen Stacy reading a book, pixel art
-Gwen Stacy reading a book, in cyberpunk style
-Gwen Stacy reading a book, animated style
-Gwen Stacy reading a book, watercolor painting
-Gwen Stacy reading a book, surrealism style
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, Van Gogh style
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, oil painting
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Hokusai, in the style of Ukiyo
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, black and white
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pixel art
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in cyberpunk style
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, animated style
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, watercolor painting
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, surrealism style
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, Van Gogh style
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, oil painting
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas by Hokusai, in the style of Ukiyo
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, black and white
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pixel art
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in cyberpunk style
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, animated style
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, watercolor painting
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, surrealism style
-An astronaut flying in space, Van Gogh style
-An astronaut flying in space, oil painting
-An astronaut flying in space by Hokusai, in the style of Ukiyo
-An astronaut flying in space, black and white
-An astronaut flying in space, pixel art
-An astronaut flying in space, in cyberpunk style
-An astronaut flying in space, animated style
-An astronaut flying in space, watercolor painting
-An astronaut flying in space, surrealism style
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, Van Gogh style
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, oil painting
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks by Hokusai, in the style of Ukiyo
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, black and white
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pixel art
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in cyberpunk style
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, animated style
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, watercolor painting
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, surrealism style
-A beautiful coastal beach in spring, waves lapping on sand, in super slow motion
-A beautiful coastal beach in spring, waves lapping on sand, zoom in
-A beautiful coastal beach in spring, waves lapping on sand, zoom out
-A beautiful coastal beach in spring, waves lapping on sand, pan left
-A beautiful coastal beach in spring, waves lapping on sand, pan right
-A beautiful coastal beach in spring, waves lapping on sand, tilt up
-A beautiful coastal beach in spring, waves lapping on sand, tilt down
-A beautiful coastal beach in spring, waves lapping on sand, with an intense shaking effect
-A beautiful coastal beach in spring, waves lapping on sand, featuring a steady and smooth perspective
-A beautiful coastal beach in spring, waves lapping on sand, racking focus
-The bund Shanghai, in super slow motion
-The bund Shanghai, zoom in
-The bund Shanghai, zoom out
-The bund Shanghai, pan left
-The bund Shanghai, pan right
-The bund Shanghai, tilt up
-The bund Shanghai, tilt down
-The bund Shanghai, with an intense shaking effect
-The bund Shanghai, featuring a steady and smooth perspective
-The bund Shanghai, racking focus
-a shark is swimming in the ocean, in super slow motion
-a shark is swimming in the ocean, zoom in
-a shark is swimming in the ocean, zoom out
-a shark is swimming in the ocean, pan left
-a shark is swimming in the ocean, pan right
-a shark is swimming in the ocean, tilt up
-a shark is swimming in the ocean, tilt down
-a shark is swimming in the ocean, with an intense shaking effect
-a shark is swimming in the ocean, featuring a steady and smooth perspective
-a shark is swimming in the ocean, racking focus
-A panda drinking coffee in a cafe in Paris, in super slow motion
-A panda drinking coffee in a cafe in Paris, zoom in
-A panda drinking coffee in a cafe in Paris, zoom out
-A panda drinking coffee in a cafe in Paris, pan left
-A panda drinking coffee in a cafe in Paris, pan right
-A panda drinking coffee in a cafe in Paris, tilt up
-A panda drinking coffee in a cafe in Paris, tilt down
-A panda drinking coffee in a cafe in Paris, with an intense shaking effect
-A panda drinking coffee in a cafe in Paris, featuring a steady and smooth perspective
-A panda drinking coffee in a cafe in Paris, racking focus
-A cute happy Corgi playing in park, sunset, in super slow motion
-A cute happy Corgi playing in park, sunset, zoom in
-A cute happy Corgi playing in park, sunset, zoom out
-A cute happy Corgi playing in park, sunset, pan left
-A cute happy Corgi playing in park, sunset, pan right
-A cute happy Corgi playing in park, sunset, tilt up
-A cute happy Corgi playing in park, sunset, tilt down
-A cute happy Corgi playing in park, sunset, with an intense shaking effect
-A cute happy Corgi playing in park, sunset, featuring a steady and smooth perspective
-A cute happy Corgi playing in park, sunset, racking focus
-Gwen Stacy reading a book, in super slow motion
-Gwen Stacy reading a book, zoom in
-Gwen Stacy reading a book, zoom out
-Gwen Stacy reading a book, pan left
-Gwen Stacy reading a book, pan right
-Gwen Stacy reading a book, tilt up
-Gwen Stacy reading a book, tilt down
-Gwen Stacy reading a book, with an intense shaking effect
-Gwen Stacy reading a book, featuring a steady and smooth perspective
-Gwen Stacy reading a book, racking focus
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in super slow motion
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom in
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom out
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan left
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan right
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt up
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt down
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, with an intense shaking effect
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, featuring a steady and smooth perspective
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, racking focus
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in super slow motion
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom in
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom out
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan left
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan right
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt up
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt down
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, with an intense shaking effect
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, featuring a steady and smooth perspective
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, racking focus
-An astronaut flying in space, in super slow motion
-An astronaut flying in space, zoom in
-An astronaut flying in space, zoom out
-An astronaut flying in space, pan left
-An astronaut flying in space, pan right
-An astronaut flying in space, tilt up
-An astronaut flying in space, tilt down
-An astronaut flying in space, with an intense shaking effect
-An astronaut flying in space, featuring a steady and smooth perspective
-An astronaut flying in space, racking focus
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in super slow motion
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom in
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom out
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan left
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan right
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt up
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt down
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, with an intense shaking effect
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, featuring a steady and smooth perspective
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, racking focus
-Close up of grapes on a rotating table.
-Turtle swimming in ocean.
-A storm trooper vacuuming the beach.
-A panda standing on a surfboard in the ocean in sunset.
-An astronaut feeding ducks on a sunny afternoon, reflection from the water.
-Two pandas discussing an academic paper.
-Sunset time lapse at the beach with moving clouds and colors in the sky.
-A fat rabbit wearing a purple robe walking through a fantasy landscape.
-A koala bear playing piano in the forest.
-An astronaut flying in space.
-Fireworks.
-An animated painting of fluffy white clouds moving in sky.
-Flying through fantasy landscapes.
-A bigfoot walking in the snowstorm.
-A squirrel eating a burger.
-A cat wearing sunglasses and working as a lifeguard at a pool.
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks.
-Splash of turquoise water in extreme slow motion, alpha channel included.
-an ice cream is melting on the table.
-a drone flying over a snowy forest.
-a shark is swimming in the ocean.
-Aerial panoramic video from a drone of a fantasy land.
-a teddy bear is swimming in the ocean.
-time lapse of sunrise on mars.
-golden fish swimming in the ocean.
-An artist brush painting on a canvas close up.
-A drone view of celebration with Christmas tree and fireworks, starry sky - background.
-happy dog wearing a yellow turtleneck, studio, portrait, facing camera, dark background
-Origami dancers in white paper, 3D render, on white background, studio shot, dancing modern dance.
-Campfire at night in a snowy forest with starry sky in the background.
-a fantasy landscape
-A 3D model of a 1800s victorian house.
-this is how I do makeup in the morning.
-A raccoon that looks like a turtle, digital art.
-Robot dancing in Times Square.
-Busy freeway at night.
-Balloon full of water exploding in extreme slow motion.
-An astronaut is riding a horse in the space in a photorealistic style.
-Macro slo-mo. Slow motion cropped closeup of roasted coffee beans falling into an empty bowl.
-Sewing machine, old sewing machine working.
-Motion colour drop in water, ink swirling in water, colourful ink in water, abstraction fancy dream cloud of ink.
-Few big purple plums rotating on the turntable. water drops appear on the skin during rotation. isolated on the white background. close-up. macro.
-Vampire makeup face of beautiful girl, red contact lenses.
-Ashtray full of butts on table, smoke flowing on black background, close-up
-Pacific coast, carmel by the sea ocean and waves.
-A teddy bear is playing drum kit in NYC Times Square.
-A corgi is playing drum kit.
-An Iron man is playing the electronic guitar, high electronic guitar.
-A raccoon is playing the electronic guitar.
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Vincent van Gogh
-A corgi's head depicted as an explosion of a nebula
-A fantasy landscape
-A future where humans have achieved teleportation technology
-A jellyfish floating through the ocean, with bioluminescent tentacles
-A Mars rover moving on Mars
-A panda drinking coffee in a cafe in Paris
-A space shuttle launching into orbit, with flames and smoke billowing out from the engines
-A steam train moving on a mountainside
-A super cool giant robot in Cyberpunk Beijing
-A tropical beach at sunrise, with palm trees and crystal-clear water in the foreground
-Cinematic shot of Van Gogh's selfie, Van Gogh style
-Gwen Stacy reading a book
-Iron Man flying in the sky
-The bund Shanghai, oil painting
-Yoda playing guitar on the stage
-A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo
-A beautiful coastal beach in spring, waves lapping on sand by Vincent van Gogh
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background
-A car moving slowly on an empty street, rainy evening
-A cat eating food out of a bowl
-A cat wearing sunglasses at a pool
-A confused panda in calculus class
-A cute fluffy panda eating Chinese food in a restaurant
-A cute happy Corgi playing in park, sunset
-A cute raccoon playing guitar in a boat on the ocean
-A happy fuzzy panda playing guitar nearby a campfire, snow mountain in the background
-A lightning striking atop of eiffel tower, dark clouds in the sky
-A modern art museum, with colorful paintings
-A panda cooking in the kitchen
-A panda playing on a swing set
-A polar bear is playing guitar
-A raccoon dressed in suit playing the trumpet, stage background
-A robot DJ is playing the turntable, in heavy raining futuristic tokyo rooftop cyberpunk night, sci-fi, fantasy
-A shark swimming in clear Caribbean ocean
-A super robot protecting city
-A teddy bear washing the dishes
-An epic tornado attacking above a glowing city at night, the tornado is made of smoke
-An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with umbrellas
-Clown fish swimming through the coral reef
-Hyper-realistic spaceship landing on Mars
-The bund Shanghai, vibrant color
-Vincent van Gogh is painting in the room
-Yellow flowers swing in the wind
-alley
-amusement park
-aquarium
-arch
-art gallery
-bathroom
-bakery shop
-ballroom
-bar
-barn
-basement
-beach
-bedroom
-bridge
-botanical garden
-cafeteria
-campsite
-campus
-carrousel
-castle
-cemetery
-classroom
-cliff
-crosswalk
-construction site
-corridor
-courtyard
-desert
-downtown
-driveway
-farm
-food court
-football field
-forest road
-fountain
-gas station
-glacier
-golf course
-indoor gymnasium
-harbor
-highway
-hospital
-house
-iceberg
-industrial area
-jail cell
-junkyard
-kitchen
-indoor library
-lighthouse
-laboratory
-mansion
-marsh
-mountain
-indoor movie theater
-indoor museum
-music studio
-nursery
-ocean
-office
-palace
-parking lot
-pharmacy
-phone booth
-raceway
-restaurant
-river
-science museum
-shower
-ski slope
-sky
-skyscraper
-baseball stadium
-staircase
-street
-supermarket
-indoor swimming pool
-tower
-outdoor track
-train railway
-train station platform
-underwater coral reef
-valley
-volcano
-waterfall
-windmill
-a bicycle on the left of a car, front view
-a car on the right of a motorcycle, front view
-a motorcycle on the left of a bus, front view
-a bus on the right of a traffic light, front view
-a traffic light on the left of a fire hydrant, front view
-a fire hydrant on the right of a stop sign, front view
-a stop sign on the left of a parking meter, front view
-a parking meter on the right of a bench, front view
-a bench on the left of a truck, front view
-a truck on the right of a bicycle, front view
-a bird on the left of a cat, front view
-a cat on the right of a dog, front view
-a dog on the left of a horse, front view
-a horse on the right of a sheep, front view
-a sheep on the left of a cow, front view
-a cow on the right of an elephant, front view
-an elephant on the left of a bear, front view
-a bear on the right of a zebra, front view
-a zebra on the left of a giraffe, front view
-a giraffe on the right of a bird, front view
-a bottle on the left of a wine glass, front view
-a wine glass on the right of a cup, front view
-a cup on the left of a fork, front view
-a fork on the right of a knife, front view
-a knife on the left of a spoon, front view
-a spoon on the right of a bowl, front view
-a bowl on the left of a bottle, front view
-a potted plant on the left of a remote, front view
-a remote on the right of a clock, front view
-a clock on the left of a vase, front view
-a vase on the right of scissors, front view
-scissors on the left of a teddy bear, front view
-a teddy bear on the right of a potted plant, front view
-a frisbee on the left of a sports ball, front view
-a sports ball on the right of a baseball bat, front view
-a baseball bat on the left of a baseball glove, front view
-a baseball glove on the right of a tennis racket, front view
-a tennis racket on the left of a frisbee, front view
-a toilet on the left of a hair drier, front view
-a hair drier on the right of a toothbrush, front view
-a toothbrush on the left of a sink, front view
-a sink on the right of a toilet, front view
-a chair on the left of a couch, front view
-a couch on the right of a bed, front view
-a bed on the left of a tv, front view
-a tv on the right of a dining table, front view
-a dining table on the left of a chair, front view
-an airplane on the left of a train, front view
-a train on the right of a boat, front view
-a boat on the left of an airplane, front view
-an oven on the top of a toaster, front view
-an oven on the bottom of a toaster, front view
-a toaster on the top of a microwave, front view
-a toaster on the bottom of a microwave, front view
-a microwave on the top of an oven, front view
-a microwave on the bottom of an oven, front view
-a banana on the top of an apple, front view
-a banana on the bottom of an apple, front view
-an apple on the top of a sandwich, front view
-an apple on the bottom of a sandwich, front view
-a sandwich on the top of an orange, front view
-a sandwich on the bottom of an orange, front view
-an orange on the top of a carrot, front view
-an orange on the bottom of a carrot, front view
-a carrot on the top of a hot dog, front view
-a carrot on the bottom of a hot dog, front view
-a hot dog on the top of a pizza, front view
-a hot dog on the bottom of a pizza, front view
-a pizza on the top of a donut, front view
-a pizza on the bottom of a donut, front view
-a donut on the top of broccoli, front view
-a donut on the bottom of broccoli, front view
-broccoli on the top of a banana, front view
-broccoli on the bottom of a banana, front view
-skis on the top of a snowboard, front view
-skis on the bottom of a snowboard, front view
-a snowboard on the top of a kite, front view
-a snowboard on the bottom of a kite, front view
-a kite on the top of a skateboard, front view
-a kite on the bottom of a skateboard, front view
-a skateboard on the top of a surfboard, front view
-a skateboard on the bottom of a surfboard, front view
-a surfboard on the top of skis, front view
-a surfboard on the bottom of skis, front view
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/all_i2v.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/all_i2v.txt
deleted file mode 100644
index cf0aa35958f1fcfe1352b26af57cabeb4f919f12..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/all_i2v.txt
+++ /dev/null
@@ -1,1118 +0,0 @@
-a close up of a blue and orange liquid{"reference_path": "cache/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
-a close up of a blue and orange liquid, camera pans left{"reference_path": "cache/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
-a close up of a blue and orange liquid, camera pans right{"reference_path": "cache/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
-a close up of a blue and orange liquid, camera tilts up{"reference_path": "cache/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
-a close up of a blue and orange liquid, camera tilts down{"reference_path": "cache/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
-a close up of a blue and orange liquid, camera zooms in{"reference_path": "cache/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
-a close up of a blue and orange liquid, camera zooms out{"reference_path": "cache/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
-a close up of a blue and orange liquid, camera static{"reference_path": "cache/crop/1-1/a close up of a blue and orange liquid.jpg", "mask_strategy": "0"}
-A black and white abstract video featuring mesmerizing bubbles{"reference_path": "cache/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
-A black and white abstract video featuring mesmerizing bubbles, camera pans left{"reference_path": "cache/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
-A black and white abstract video featuring mesmerizing bubbles, camera pans right{"reference_path": "cache/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
-A black and white abstract video featuring mesmerizing bubbles, camera tilts up{"reference_path": "cache/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
-A black and white abstract video featuring mesmerizing bubbles, camera tilts down{"reference_path": "cache/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
-A black and white abstract video featuring mesmerizing bubbles, camera zooms in{"reference_path": "cache/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
-A black and white abstract video featuring mesmerizing bubbles, camera zooms out{"reference_path": "cache/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
-A black and white abstract video featuring mesmerizing bubbles, camera static{"reference_path": "cache/crop/1-1/A black and white abstract video featuring mesmerizing bubbles.jpg", "mask_strategy": "0"}
-a blue and white smoke is swirly in the dark{"reference_path": "cache/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
-a blue and white smoke is swirly in the dark, camera pans left{"reference_path": "cache/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
-a blue and white smoke is swirly in the dark, camera pans right{"reference_path": "cache/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
-a blue and white smoke is swirly in the dark, camera tilts up{"reference_path": "cache/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
-a blue and white smoke is swirly in the dark, camera tilts down{"reference_path": "cache/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
-a blue and white smoke is swirly in the dark, camera zooms in{"reference_path": "cache/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
-a blue and white smoke is swirly in the dark, camera zooms out{"reference_path": "cache/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
-a blue and white smoke is swirly in the dark, camera static{"reference_path": "cache/crop/1-1/a blue and white smoke is swirly in the dark.jpg", "mask_strategy": "0"}
-a close-up view of a sea fan in the water{"reference_path": "cache/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
-a close-up view of a sea fan in the water, camera pans left{"reference_path": "cache/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
-a close-up view of a sea fan in the water, camera pans right{"reference_path": "cache/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
-a close-up view of a sea fan in the water, camera tilts up{"reference_path": "cache/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
-a close-up view of a sea fan in the water, camera tilts down{"reference_path": "cache/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
-a close-up view of a sea fan in the water, camera zooms in{"reference_path": "cache/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
-a close-up view of a sea fan in the water, camera zooms out{"reference_path": "cache/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
-a close-up view of a sea fan in the water, camera static{"reference_path": "cache/crop/1-1/a close-up view of a sea fan in the water.jpg", "mask_strategy": "0"}
-a visually captivating abstract video, rich in color, set against a dramatic black background{"reference_path": "cache/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
-a visually captivating abstract video, rich in color, set against a dramatic black background, camera pans left{"reference_path": "cache/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
-a visually captivating abstract video, rich in color, set against a dramatic black background, camera pans right{"reference_path": "cache/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
-a visually captivating abstract video, rich in color, set against a dramatic black background, camera tilts up{"reference_path": "cache/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
-a visually captivating abstract video, rich in color, set against a dramatic black background, camera tilts down{"reference_path": "cache/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
-a visually captivating abstract video, rich in color, set against a dramatic black background, camera zooms in{"reference_path": "cache/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
-a visually captivating abstract video, rich in color, set against a dramatic black background, camera zooms out{"reference_path": "cache/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
-a visually captivating abstract video, rich in color, set against a dramatic black background, camera static{"reference_path": "cache/crop/1-1/a visually captivating abstract video, rich in color, set against a dramatic black background.jpg", "mask_strategy": "0"}
-a purple and yellow abstract painting with a black background{"reference_path": "cache/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
-a purple and yellow abstract painting with a black background, camera pans left{"reference_path": "cache/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
-a purple and yellow abstract painting with a black background, camera pans right{"reference_path": "cache/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
-a purple and yellow abstract painting with a black background, camera tilts up{"reference_path": "cache/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
-a purple and yellow abstract painting with a black background, camera tilts down{"reference_path": "cache/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
-a purple and yellow abstract painting with a black background, camera zooms in{"reference_path": "cache/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
-a purple and yellow abstract painting with a black background, camera zooms out{"reference_path": "cache/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
-a purple and yellow abstract painting with a black background, camera static{"reference_path": "cache/crop/1-1/a purple and yellow abstract painting with a black background.jpg", "mask_strategy": "0"}
-a dynamic video of a blurry neon light in the dark, radiating captivating colors{"reference_path": "cache/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
-a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera pans left{"reference_path": "cache/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
-a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera pans right{"reference_path": "cache/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
-a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera tilts up{"reference_path": "cache/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
-a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera tilts down{"reference_path": "cache/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
-a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera zooms in{"reference_path": "cache/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
-a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera zooms out{"reference_path": "cache/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
-a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera static{"reference_path": "cache/crop/1-1/a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg", "mask_strategy": "0"}
-a view of a star trail in the night sky{"reference_path": "cache/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
-a view of a star trail in the night sky, camera pans left{"reference_path": "cache/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
-a view of a star trail in the night sky, camera pans right{"reference_path": "cache/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
-a view of a star trail in the night sky, camera tilts up{"reference_path": "cache/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
-a view of a star trail in the night sky, camera tilts down{"reference_path": "cache/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
-a view of a star trail in the night sky, camera zooms in{"reference_path": "cache/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
-a view of a star trail in the night sky, camera zooms out{"reference_path": "cache/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
-a view of a star trail in the night sky, camera static{"reference_path": "cache/crop/1-1/a view of a star trail in the night sky.jpg", "mask_strategy": "0"}
-an aerial view of a small town on the edge of the ocean{"reference_path": "cache/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
-an aerial view of a small town on the edge of the ocean, camera pans left{"reference_path": "cache/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
-an aerial view of a small town on the edge of the ocean, camera pans right{"reference_path": "cache/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
-an aerial view of a small town on the edge of the ocean, camera tilts up{"reference_path": "cache/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
-an aerial view of a small town on the edge of the ocean, camera tilts down{"reference_path": "cache/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
-an aerial view of a small town on the edge of the ocean, camera zooms in{"reference_path": "cache/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
-an aerial view of a small town on the edge of the ocean, camera zooms out{"reference_path": "cache/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
-an aerial view of a small town on the edge of the ocean, camera static{"reference_path": "cache/crop/1-1/an aerial view of a small town on the edge of the ocean.jpg", "mask_strategy": "0"}
-Colorful buildings on the seaside cliffs{"reference_path": "cache/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
-Colorful buildings on the seaside cliffs, camera pans left{"reference_path": "cache/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
-Colorful buildings on the seaside cliffs, camera pans right{"reference_path": "cache/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
-Colorful buildings on the seaside cliffs, camera tilts up{"reference_path": "cache/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
-Colorful buildings on the seaside cliffs, camera tilts down{"reference_path": "cache/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
-Colorful buildings on the seaside cliffs, camera zooms in{"reference_path": "cache/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
-Colorful buildings on the seaside cliffs, camera zooms out{"reference_path": "cache/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
-Colorful buildings on the seaside cliffs, camera static{"reference_path": "cache/crop/1-1/Colorful buildings on the seaside cliffs.jpg", "mask_strategy": "0"}
-a bunch of houses that are on a hillside{"reference_path": "cache/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
-a bunch of houses that are on a hillside, camera pans left{"reference_path": "cache/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
-a bunch of houses that are on a hillside, camera pans right{"reference_path": "cache/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
-a bunch of houses that are on a hillside, camera tilts up{"reference_path": "cache/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
-a bunch of houses that are on a hillside, camera tilts down{"reference_path": "cache/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
-a bunch of houses that are on a hillside, camera zooms in{"reference_path": "cache/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
-a bunch of houses that are on a hillside, camera zooms out{"reference_path": "cache/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
-a bunch of houses that are on a hillside, camera static{"reference_path": "cache/crop/1-1/a bunch of houses that are on a hillside.jpg", "mask_strategy": "0"}
-a building that is sitting on the side of a pond{"reference_path": "cache/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
-a building that is sitting on the side of a pond, camera pans left{"reference_path": "cache/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
-a building that is sitting on the side of a pond, camera pans right{"reference_path": "cache/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
-a building that is sitting on the side of a pond, camera tilts up{"reference_path": "cache/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
-a building that is sitting on the side of a pond, camera tilts down{"reference_path": "cache/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
-a building that is sitting on the side of a pond, camera zooms in{"reference_path": "cache/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
-a building that is sitting on the side of a pond, camera zooms out{"reference_path": "cache/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
-a building that is sitting on the side of a pond, camera static{"reference_path": "cache/crop/1-1/a building that is sitting on the side of a pond.jpg", "mask_strategy": "0"}
-an aerial view of a busy city with a bridge in the background{"reference_path": "cache/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
-an aerial view of a busy city with a bridge in the background, camera pans left{"reference_path": "cache/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
-an aerial view of a busy city with a bridge in the background, camera pans right{"reference_path": "cache/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
-an aerial view of a busy city with a bridge in the background, camera tilts up{"reference_path": "cache/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
-an aerial view of a busy city with a bridge in the background, camera tilts down{"reference_path": "cache/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
-an aerial view of a busy city with a bridge in the background, camera zooms in{"reference_path": "cache/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
-an aerial view of a busy city with a bridge in the background, camera zooms out{"reference_path": "cache/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
-an aerial view of a busy city with a bridge in the background, camera static{"reference_path": "cache/crop/1-1/an aerial view of a busy city with a bridge in the background.jpg", "mask_strategy": "0"}
-a bridge that is over a body of water{"reference_path": "cache/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
-a bridge that is over a body of water, camera pans left{"reference_path": "cache/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
-a bridge that is over a body of water, camera pans right{"reference_path": "cache/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
-a bridge that is over a body of water, camera tilts up{"reference_path": "cache/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
-a bridge that is over a body of water, camera tilts down{"reference_path": "cache/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
-a bridge that is over a body of water, camera zooms in{"reference_path": "cache/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
-a bridge that is over a body of water, camera zooms out{"reference_path": "cache/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
-a bridge that is over a body of water, camera static{"reference_path": "cache/crop/1-1/a bridge that is over a body of water.jpg", "mask_strategy": "0"}
-a pile of wood sitting next to a log house{"reference_path": "cache/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
-a pile of wood sitting next to a log house, camera pans left{"reference_path": "cache/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
-a pile of wood sitting next to a log house, camera pans right{"reference_path": "cache/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
-a pile of wood sitting next to a log house, camera tilts up{"reference_path": "cache/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
-a pile of wood sitting next to a log house, camera tilts down{"reference_path": "cache/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
-a pile of wood sitting next to a log house, camera zooms in{"reference_path": "cache/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
-a pile of wood sitting next to a log house, camera zooms out{"reference_path": "cache/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
-a pile of wood sitting next to a log house, camera static{"reference_path": "cache/crop/1-1/a pile of wood sitting next to a log house.jpg", "mask_strategy": "0"}
-a view of a snowy mountain side with many buildings{"reference_path": "cache/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
-a view of a snowy mountain side with many buildings, camera pans left{"reference_path": "cache/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
-a view of a snowy mountain side with many buildings, camera pans right{"reference_path": "cache/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
-a view of a snowy mountain side with many buildings, camera tilts up{"reference_path": "cache/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
-a view of a snowy mountain side with many buildings, camera tilts down{"reference_path": "cache/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
-a view of a snowy mountain side with many buildings, camera zooms in{"reference_path": "cache/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
-a view of a snowy mountain side with many buildings, camera zooms out{"reference_path": "cache/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
-a view of a snowy mountain side with many buildings, camera static{"reference_path": "cache/crop/1-1/a view of a snowy mountain side with many buildings.jpg", "mask_strategy": "0"}
-san francisco skyline at sunset{"reference_path": "cache/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
-san francisco skyline at sunset, camera pans left{"reference_path": "cache/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
-san francisco skyline at sunset, camera pans right{"reference_path": "cache/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
-san francisco skyline at sunset, camera tilts up{"reference_path": "cache/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
-san francisco skyline at sunset, camera tilts down{"reference_path": "cache/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
-san francisco skyline at sunset, camera zooms in{"reference_path": "cache/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
-san francisco skyline at sunset, camera zooms out{"reference_path": "cache/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
-san francisco skyline at sunset, camera static{"reference_path": "cache/crop/1-1/san francisco skyline at sunset.jpg", "mask_strategy": "0"}
-a castle on top of a hill covered in snow{"reference_path": "cache/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
-a castle on top of a hill covered in snow, camera pans left{"reference_path": "cache/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
-a castle on top of a hill covered in snow, camera pans right{"reference_path": "cache/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
-a castle on top of a hill covered in snow, camera tilts up{"reference_path": "cache/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
-a castle on top of a hill covered in snow, camera tilts down{"reference_path": "cache/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
-a castle on top of a hill covered in snow, camera zooms in{"reference_path": "cache/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
-a castle on top of a hill covered in snow, camera zooms out{"reference_path": "cache/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
-a castle on top of a hill covered in snow, camera static{"reference_path": "cache/crop/1-1/a castle on top of a hill covered in snow.jpg", "mask_strategy": "0"}
-an aerial view of big ben and the houses of parliament in london{"reference_path": "cache/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
-an aerial view of big ben and the houses of parliament in london, camera pans left{"reference_path": "cache/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
-an aerial view of big ben and the houses of parliament in london, camera pans right{"reference_path": "cache/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
-an aerial view of big ben and the houses of parliament in london, camera tilts up{"reference_path": "cache/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
-an aerial view of big ben and the houses of parliament in london, camera tilts down{"reference_path": "cache/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
-an aerial view of big ben and the houses of parliament in london, camera zooms in{"reference_path": "cache/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
-an aerial view of big ben and the houses of parliament in london, camera zooms out{"reference_path": "cache/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
-an aerial view of big ben and the houses of parliament in london, camera static{"reference_path": "cache/crop/1-1/an aerial view of big ben and the houses of parliament in london.jpg", "mask_strategy": "0"}
-a beach with a lot of buildings on the side of a cliff{"reference_path": "cache/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
-a beach with a lot of buildings on the side of a cliff, camera pans left{"reference_path": "cache/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
-a beach with a lot of buildings on the side of a cliff, camera pans right{"reference_path": "cache/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
-a beach with a lot of buildings on the side of a cliff, camera tilts up{"reference_path": "cache/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
-a beach with a lot of buildings on the side of a cliff, camera tilts down{"reference_path": "cache/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
-a beach with a lot of buildings on the side of a cliff, camera zooms in{"reference_path": "cache/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
-a beach with a lot of buildings on the side of a cliff, camera zooms out{"reference_path": "cache/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
-a beach with a lot of buildings on the side of a cliff, camera static{"reference_path": "cache/crop/1-1/a beach with a lot of buildings on the side of a cliff.jpg", "mask_strategy": "0"}
-an alley way in an old european city{"reference_path": "cache/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
-an alley way in an old european city, camera pans left{"reference_path": "cache/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
-an alley way in an old european city, camera pans right{"reference_path": "cache/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
-an alley way in an old european city, camera tilts up{"reference_path": "cache/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
-an alley way in an old european city, camera tilts down{"reference_path": "cache/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
-an alley way in an old european city, camera zooms in{"reference_path": "cache/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
-an alley way in an old european city, camera zooms out{"reference_path": "cache/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
-an alley way in an old european city, camera static{"reference_path": "cache/crop/1-1/an alley way in an old european city.jpg", "mask_strategy": "0"}
-the golden gate bridge in san franscisco is lit up by the setting sun{"reference_path": "cache/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
-the golden gate bridge in san franscisco is lit up by the setting sun, camera pans left{"reference_path": "cache/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
-the golden gate bridge in san franscisco is lit up by the setting sun, camera pans right{"reference_path": "cache/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
-the golden gate bridge in san franscisco is lit up by the setting sun, camera tilts up{"reference_path": "cache/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
-the golden gate bridge in san franscisco is lit up by the setting sun, camera tilts down{"reference_path": "cache/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
-the golden gate bridge in san franscisco is lit up by the setting sun, camera zooms in{"reference_path": "cache/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
-the golden gate bridge in san franscisco is lit up by the setting sun, camera zooms out{"reference_path": "cache/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
-the golden gate bridge in san franscisco is lit up by the setting sun, camera static{"reference_path": "cache/crop/1-1/the golden gate bridge in san franscisco is lit up by the setting sun.jpg", "mask_strategy": "0"}
-the great wall of china in autumn{"reference_path": "cache/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
-the great wall of china in autumn, camera pans left{"reference_path": "cache/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
-the great wall of china in autumn, camera pans right{"reference_path": "cache/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
-the great wall of china in autumn, camera tilts up{"reference_path": "cache/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
-the great wall of china in autumn, camera tilts down{"reference_path": "cache/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
-the great wall of china in autumn, camera zooms in{"reference_path": "cache/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
-the great wall of china in autumn, camera zooms out{"reference_path": "cache/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
-the great wall of china in autumn, camera static{"reference_path": "cache/crop/1-1/the great wall of china in autumn.jpg", "mask_strategy": "0"}
-the town of hallstatt is surrounded by mountains and water{"reference_path": "cache/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
-the town of hallstatt is surrounded by mountains and water, camera pans left{"reference_path": "cache/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
-the town of hallstatt is surrounded by mountains and water, camera pans right{"reference_path": "cache/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
-the town of hallstatt is surrounded by mountains and water, camera tilts up{"reference_path": "cache/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
-the town of hallstatt is surrounded by mountains and water, camera tilts down{"reference_path": "cache/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
-the town of hallstatt is surrounded by mountains and water, camera zooms in{"reference_path": "cache/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
-the town of hallstatt is surrounded by mountains and water, camera zooms out{"reference_path": "cache/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
-the town of hallstatt is surrounded by mountains and water, camera static{"reference_path": "cache/crop/1-1/the town of hallstatt is surrounded by mountains and water.jpg", "mask_strategy": "0"}
-tokyo skyline at night{"reference_path": "cache/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
-tokyo skyline at night, camera pans left{"reference_path": "cache/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
-tokyo skyline at night, camera pans right{"reference_path": "cache/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
-tokyo skyline at night, camera tilts up{"reference_path": "cache/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
-tokyo skyline at night, camera tilts down{"reference_path": "cache/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
-tokyo skyline at night, camera zooms in{"reference_path": "cache/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
-tokyo skyline at night, camera zooms out{"reference_path": "cache/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
-tokyo skyline at night, camera static{"reference_path": "cache/crop/1-1/tokyo skyline at night.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse, camera pans left{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse, camera pans right{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse, camera tilts up{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse, camera tilts down{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse, camera zooms in{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse, camera zooms out{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse, camera static{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse.jpg", "mask_strategy": "0"}
-a church sits on top of a hill under a cloudy sky{"reference_path": "cache/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
-a church sits on top of a hill under a cloudy sky, camera pans left{"reference_path": "cache/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
-a church sits on top of a hill under a cloudy sky, camera pans right{"reference_path": "cache/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
-a church sits on top of a hill under a cloudy sky, camera tilts up{"reference_path": "cache/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
-a church sits on top of a hill under a cloudy sky, camera tilts down{"reference_path": "cache/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
-a church sits on top of a hill under a cloudy sky, camera zooms in{"reference_path": "cache/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
-a church sits on top of a hill under a cloudy sky, camera zooms out{"reference_path": "cache/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
-a church sits on top of a hill under a cloudy sky, camera static{"reference_path": "cache/crop/1-1/a church sits on top of a hill under a cloudy sky.jpg", "mask_strategy": "0"}
-the parthenon in acropolis, greece{"reference_path": "cache/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
-the parthenon in acropolis, greece, camera pans left{"reference_path": "cache/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
-the parthenon in acropolis, greece, camera pans right{"reference_path": "cache/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
-the parthenon in acropolis, greece, camera tilts up{"reference_path": "cache/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
-the parthenon in acropolis, greece, camera tilts down{"reference_path": "cache/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
-the parthenon in acropolis, greece, camera zooms in{"reference_path": "cache/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
-the parthenon in acropolis, greece, camera zooms out{"reference_path": "cache/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
-the parthenon in acropolis, greece, camera static{"reference_path": "cache/crop/1-1/the parthenon in acropolis, greece.jpg", "mask_strategy": "0"}
-a large crowd of people walking in a shopping mall{"reference_path": "cache/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
-a large crowd of people walking in a shopping mall, camera pans left{"reference_path": "cache/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
-a large crowd of people walking in a shopping mall, camera pans right{"reference_path": "cache/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
-a large crowd of people walking in a shopping mall, camera tilts up{"reference_path": "cache/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
-a large crowd of people walking in a shopping mall, camera tilts down{"reference_path": "cache/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
-a large crowd of people walking in a shopping mall, camera zooms in{"reference_path": "cache/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
-a large crowd of people walking in a shopping mall, camera zooms out{"reference_path": "cache/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
-a large crowd of people walking in a shopping mall, camera static{"reference_path": "cache/crop/1-1/a large crowd of people walking in a shopping mall.jpg", "mask_strategy": "0"}
-the pyramids of giza, egypt{"reference_path": "cache/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
-the pyramids of giza, egypt, camera pans left{"reference_path": "cache/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
-the pyramids of giza, egypt, camera pans right{"reference_path": "cache/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
-the pyramids of giza, egypt, camera tilts up{"reference_path": "cache/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
-the pyramids of giza, egypt, camera tilts down{"reference_path": "cache/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
-the pyramids of giza, egypt, camera zooms in{"reference_path": "cache/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
-the pyramids of giza, egypt, camera zooms out{"reference_path": "cache/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
-the pyramids of giza, egypt, camera static{"reference_path": "cache/crop/1-1/the pyramids of giza, egypt.jpg", "mask_strategy": "0"}
-a stage door painted with a star on the side of a brick wall{"reference_path": "cache/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
-a stage door painted with a star on the side of a brick wall, camera pans left{"reference_path": "cache/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
-a stage door painted with a star on the side of a brick wall, camera pans right{"reference_path": "cache/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
-a stage door painted with a star on the side of a brick wall, camera tilts up{"reference_path": "cache/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
-a stage door painted with a star on the side of a brick wall, camera tilts down{"reference_path": "cache/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
-a stage door painted with a star on the side of a brick wall, camera zooms in{"reference_path": "cache/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
-a stage door painted with a star on the side of a brick wall, camera zooms out{"reference_path": "cache/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
-a stage door painted with a star on the side of a brick wall, camera static{"reference_path": "cache/crop/1-1/a stage door painted with a star on the side of a brick wall.jpg", "mask_strategy": "0"}
-a light house on the edge of the water{"reference_path": "cache/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
-a light house on the edge of the water, camera pans left{"reference_path": "cache/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
-a light house on the edge of the water, camera pans right{"reference_path": "cache/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
-a light house on the edge of the water, camera tilts up{"reference_path": "cache/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
-a light house on the edge of the water, camera tilts down{"reference_path": "cache/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
-a light house on the edge of the water, camera zooms in{"reference_path": "cache/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
-a light house on the edge of the water, camera zooms out{"reference_path": "cache/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
-a light house on the edge of the water, camera static{"reference_path": "cache/crop/1-1/a light house on the edge of the water.jpg", "mask_strategy": "0"}
-an asian city street at night with people and bicycles{"reference_path": "cache/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
-an asian city street at night with people and bicycles, camera pans left{"reference_path": "cache/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
-an asian city street at night with people and bicycles, camera pans right{"reference_path": "cache/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
-an asian city street at night with people and bicycles, camera tilts up{"reference_path": "cache/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
-an asian city street at night with people and bicycles, camera tilts down{"reference_path": "cache/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
-an asian city street at night with people and bicycles, camera zooms in{"reference_path": "cache/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
-an asian city street at night with people and bicycles, camera zooms out{"reference_path": "cache/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
-an asian city street at night with people and bicycles, camera static{"reference_path": "cache/crop/1-1/an asian city street at night with people and bicycles.jpg", "mask_strategy": "0"}
-a couple of wooden benches in the middle of a street{"reference_path": "cache/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
-a couple of wooden benches in the middle of a street, camera pans left{"reference_path": "cache/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
-a couple of wooden benches in the middle of a street, camera pans right{"reference_path": "cache/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
-a couple of wooden benches in the middle of a street, camera tilts up{"reference_path": "cache/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
-a couple of wooden benches in the middle of a street, camera tilts down{"reference_path": "cache/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
-a couple of wooden benches in the middle of a street, camera zooms in{"reference_path": "cache/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
-a couple of wooden benches in the middle of a street, camera zooms out{"reference_path": "cache/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
-a couple of wooden benches in the middle of a street, camera static{"reference_path": "cache/crop/1-1/a couple of wooden benches in the middle of a street.jpg", "mask_strategy": "0"}
-a pagoda sits on top of a mountain in japan{"reference_path": "cache/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
-a pagoda sits on top of a mountain in japan, camera pans left{"reference_path": "cache/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
-a pagoda sits on top of a mountain in japan, camera pans right{"reference_path": "cache/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
-a pagoda sits on top of a mountain in japan, camera tilts up{"reference_path": "cache/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
-a pagoda sits on top of a mountain in japan, camera tilts down{"reference_path": "cache/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
-a pagoda sits on top of a mountain in japan, camera zooms in{"reference_path": "cache/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
-a pagoda sits on top of a mountain in japan, camera zooms out{"reference_path": "cache/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
-a pagoda sits on top of a mountain in japan, camera static{"reference_path": "cache/crop/1-1/a pagoda sits on top of a mountain in japan.jpg", "mask_strategy": "0"}
-a red bus driving down a snowy street at night{"reference_path": "cache/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
-a red bus driving down a snowy street at night, camera pans left{"reference_path": "cache/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
-a red bus driving down a snowy street at night, camera pans right{"reference_path": "cache/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
-a red bus driving down a snowy street at night, camera tilts up{"reference_path": "cache/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
-a red bus driving down a snowy street at night, camera tilts down{"reference_path": "cache/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
-a red bus driving down a snowy street at night, camera zooms in{"reference_path": "cache/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
-a red bus driving down a snowy street at night, camera zooms out{"reference_path": "cache/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
-a red bus driving down a snowy street at night, camera static{"reference_path": "cache/crop/1-1/a red bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
-a snow covered street{"reference_path": "cache/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
-a snow covered street, camera pans left{"reference_path": "cache/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
-a snow covered street, camera pans right{"reference_path": "cache/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
-a snow covered street, camera tilts up{"reference_path": "cache/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
-a snow covered street, camera tilts down{"reference_path": "cache/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
-a snow covered street, camera zooms in{"reference_path": "cache/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
-a snow covered street, camera zooms out{"reference_path": "cache/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
-a snow covered street, camera static{"reference_path": "cache/crop/1-1/a snow covered street.jpg", "mask_strategy": "0"}
-a house with snow on the ground{"reference_path": "cache/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
-a house with snow on the ground, camera pans left{"reference_path": "cache/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
-a house with snow on the ground, camera pans right{"reference_path": "cache/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
-a house with snow on the ground, camera tilts up{"reference_path": "cache/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
-a house with snow on the ground, camera tilts down{"reference_path": "cache/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
-a house with snow on the ground, camera zooms in{"reference_path": "cache/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
-a house with snow on the ground, camera zooms out{"reference_path": "cache/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
-a house with snow on the ground, camera static{"reference_path": "cache/crop/1-1/a house with snow on the ground.jpg", "mask_strategy": "0"}
-cars parked on the side of the road during a snowstorm{"reference_path": "cache/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
-cars parked on the side of the road during a snowstorm, camera pans left{"reference_path": "cache/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
-cars parked on the side of the road during a snowstorm, camera pans right{"reference_path": "cache/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
-cars parked on the side of the road during a snowstorm, camera tilts up{"reference_path": "cache/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
-cars parked on the side of the road during a snowstorm, camera tilts down{"reference_path": "cache/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
-cars parked on the side of the road during a snowstorm, camera zooms in{"reference_path": "cache/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
-cars parked on the side of the road during a snowstorm, camera zooms out{"reference_path": "cache/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
-cars parked on the side of the road during a snowstorm, camera static{"reference_path": "cache/crop/1-1/cars parked on the side of the road during a snowstorm.jpg", "mask_strategy": "0"}
-a group of statues on the side of a building{"reference_path": "cache/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
-a group of statues on the side of a building, camera pans left{"reference_path": "cache/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
-a group of statues on the side of a building, camera pans right{"reference_path": "cache/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
-a group of statues on the side of a building, camera tilts up{"reference_path": "cache/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
-a group of statues on the side of a building, camera tilts down{"reference_path": "cache/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
-a group of statues on the side of a building, camera zooms in{"reference_path": "cache/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
-a group of statues on the side of a building, camera zooms out{"reference_path": "cache/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
-a group of statues on the side of a building, camera static{"reference_path": "cache/crop/1-1/a group of statues on the side of a building.jpg", "mask_strategy": "0"}
-a city street at night during a snow storm{"reference_path": "cache/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
-a city street at night during a snow storm, camera pans left{"reference_path": "cache/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
-a city street at night during a snow storm, camera pans right{"reference_path": "cache/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
-a city street at night during a snow storm, camera tilts up{"reference_path": "cache/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
-a city street at night during a snow storm, camera tilts down{"reference_path": "cache/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
-a city street at night during a snow storm, camera zooms in{"reference_path": "cache/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
-a city street at night during a snow storm, camera zooms out{"reference_path": "cache/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
-a city street at night during a snow storm, camera static{"reference_path": "cache/crop/1-1/a city street at night during a snow storm.jpg", "mask_strategy": "0"}
-tower bridge in london{"reference_path": "cache/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
-tower bridge in london, camera pans left{"reference_path": "cache/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
-tower bridge in london, camera pans right{"reference_path": "cache/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
-tower bridge in london, camera tilts up{"reference_path": "cache/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
-tower bridge in london, camera tilts down{"reference_path": "cache/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
-tower bridge in london, camera zooms in{"reference_path": "cache/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
-tower bridge in london, camera zooms out{"reference_path": "cache/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
-tower bridge in london, camera static{"reference_path": "cache/crop/1-1/tower bridge in london.jpg", "mask_strategy": "0"}
-chinese pagoda in the middle of a snowy day{"reference_path": "cache/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
-chinese pagoda in the middle of a snowy day, camera pans left{"reference_path": "cache/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
-chinese pagoda in the middle of a snowy day, camera pans right{"reference_path": "cache/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
-chinese pagoda in the middle of a snowy day, camera tilts up{"reference_path": "cache/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
-chinese pagoda in the middle of a snowy day, camera tilts down{"reference_path": "cache/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
-chinese pagoda in the middle of a snowy day, camera zooms in{"reference_path": "cache/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
-chinese pagoda in the middle of a snowy day, camera zooms out{"reference_path": "cache/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
-chinese pagoda in the middle of a snowy day, camera static{"reference_path": "cache/crop/1-1/chinese pagoda in the middle of a snowy day.jpg", "mask_strategy": "0"}
-a dark alleyway with a bus driving down it{"reference_path": "cache/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
-a dark alleyway with a bus driving down it, camera pans left{"reference_path": "cache/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
-a dark alleyway with a bus driving down it, camera pans right{"reference_path": "cache/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
-a dark alleyway with a bus driving down it, camera tilts up{"reference_path": "cache/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
-a dark alleyway with a bus driving down it, camera tilts down{"reference_path": "cache/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
-a dark alleyway with a bus driving down it, camera zooms in{"reference_path": "cache/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
-a dark alleyway with a bus driving down it, camera zooms out{"reference_path": "cache/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
-a dark alleyway with a bus driving down it, camera static{"reference_path": "cache/crop/1-1/a dark alleyway with a bus driving down it.jpg", "mask_strategy": "0"}
-a monastery sits on top of a cliff in bhutan{"reference_path": "cache/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
-a monastery sits on top of a cliff in bhutan, camera pans left{"reference_path": "cache/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
-a monastery sits on top of a cliff in bhutan, camera pans right{"reference_path": "cache/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
-a monastery sits on top of a cliff in bhutan, camera tilts up{"reference_path": "cache/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
-a monastery sits on top of a cliff in bhutan, camera tilts down{"reference_path": "cache/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
-a monastery sits on top of a cliff in bhutan, camera zooms in{"reference_path": "cache/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
-a monastery sits on top of a cliff in bhutan, camera zooms out{"reference_path": "cache/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
-a monastery sits on top of a cliff in bhutan, camera static{"reference_path": "cache/crop/1-1/a monastery sits on top of a cliff in bhutan.jpg", "mask_strategy": "0"}
-the dome of the rock in jerusalem{"reference_path": "cache/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
-the dome of the rock in jerusalem, camera pans left{"reference_path": "cache/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
-the dome of the rock in jerusalem, camera pans right{"reference_path": "cache/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
-the dome of the rock in jerusalem, camera tilts up{"reference_path": "cache/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
-the dome of the rock in jerusalem, camera tilts down{"reference_path": "cache/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
-the dome of the rock in jerusalem, camera zooms in{"reference_path": "cache/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
-the dome of the rock in jerusalem, camera zooms out{"reference_path": "cache/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
-the dome of the rock in jerusalem, camera static{"reference_path": "cache/crop/1-1/the dome of the rock in jerusalem.jpg", "mask_strategy": "0"}
-an aerial view of a futuristic building on a cliff overlooking a body of water{"reference_path": "cache/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
-an aerial view of a futuristic building on a cliff overlooking a body of water, camera pans left{"reference_path": "cache/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
-an aerial view of a futuristic building on a cliff overlooking a body of water, camera pans right{"reference_path": "cache/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
-an aerial view of a futuristic building on a cliff overlooking a body of water, camera tilts up{"reference_path": "cache/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
-an aerial view of a futuristic building on a cliff overlooking a body of water, camera tilts down{"reference_path": "cache/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
-an aerial view of a futuristic building on a cliff overlooking a body of water, camera zooms in{"reference_path": "cache/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
-an aerial view of a futuristic building on a cliff overlooking a body of water, camera zooms out{"reference_path": "cache/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
-an aerial view of a futuristic building on a cliff overlooking a body of water, camera static{"reference_path": "cache/crop/1-1/an aerial view of a futuristic building on a cliff overlooking a body of water.jpg", "mask_strategy": "0"}
-a reflection of a city with buildings in the water{"reference_path": "cache/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
-a reflection of a city with buildings in the water, camera pans left{"reference_path": "cache/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
-a reflection of a city with buildings in the water, camera pans right{"reference_path": "cache/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
-a reflection of a city with buildings in the water, camera tilts up{"reference_path": "cache/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
-a reflection of a city with buildings in the water, camera tilts down{"reference_path": "cache/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
-a reflection of a city with buildings in the water, camera zooms in{"reference_path": "cache/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
-a reflection of a city with buildings in the water, camera zooms out{"reference_path": "cache/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
-a reflection of a city with buildings in the water, camera static{"reference_path": "cache/crop/1-1/a reflection of a city with buildings in the water.jpg", "mask_strategy": "0"}
-a bar with chairs and a television on the wall{"reference_path": "cache/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
-a bar with chairs and a television on the wall, camera pans left{"reference_path": "cache/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
-a bar with chairs and a television on the wall, camera pans right{"reference_path": "cache/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
-a bar with chairs and a television on the wall, camera tilts up{"reference_path": "cache/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
-a bar with chairs and a television on the wall, camera tilts down{"reference_path": "cache/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
-a bar with chairs and a television on the wall, camera zooms in{"reference_path": "cache/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
-a bar with chairs and a television on the wall, camera zooms out{"reference_path": "cache/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
-a bar with chairs and a television on the wall, camera static{"reference_path": "cache/crop/1-1/a bar with chairs and a television on the wall.jpg", "mask_strategy": "0"}
-a living room filled with lots of books on a wall{"reference_path": "cache/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
-a living room filled with lots of books on a wall, camera pans left{"reference_path": "cache/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
-a living room filled with lots of books on a wall, camera pans right{"reference_path": "cache/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
-a living room filled with lots of books on a wall, camera tilts up{"reference_path": "cache/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
-a living room filled with lots of books on a wall, camera tilts down{"reference_path": "cache/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
-a living room filled with lots of books on a wall, camera zooms in{"reference_path": "cache/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
-a living room filled with lots of books on a wall, camera zooms out{"reference_path": "cache/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
-a living room filled with lots of books on a wall, camera static{"reference_path": "cache/crop/1-1/a living room filled with lots of books on a wall.jpg", "mask_strategy": "0"}
-a living room filled with furniture next to a stone wall{"reference_path": "cache/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
-a living room filled with furniture next to a stone wall, camera pans left{"reference_path": "cache/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
-a living room filled with furniture next to a stone wall, camera pans right{"reference_path": "cache/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
-a living room filled with furniture next to a stone wall, camera tilts up{"reference_path": "cache/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
-a living room filled with furniture next to a stone wall, camera tilts down{"reference_path": "cache/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
-a living room filled with furniture next to a stone wall, camera zooms in{"reference_path": "cache/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
-a living room filled with furniture next to a stone wall, camera zooms out{"reference_path": "cache/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
-a living room filled with furniture next to a stone wall, camera static{"reference_path": "cache/crop/1-1/a living room filled with furniture next to a stone wall.jpg", "mask_strategy": "0"}
-a table and chairs in a room with sunlight coming through the window{"reference_path": "cache/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
-a table and chairs in a room with sunlight coming through the window, camera pans left{"reference_path": "cache/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
-a table and chairs in a room with sunlight coming through the window, camera pans right{"reference_path": "cache/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
-a table and chairs in a room with sunlight coming through the window, camera tilts up{"reference_path": "cache/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
-a table and chairs in a room with sunlight coming through the window, camera tilts down{"reference_path": "cache/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
-a table and chairs in a room with sunlight coming through the window, camera zooms in{"reference_path": "cache/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
-a table and chairs in a room with sunlight coming through the window, camera zooms out{"reference_path": "cache/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
-a table and chairs in a room with sunlight coming through the window, camera static{"reference_path": "cache/crop/1-1/a table and chairs in a room with sunlight coming through the window.jpg", "mask_strategy": "0"}
-a room filled with lots of shelves filled with books{"reference_path": "cache/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
-a room filled with lots of shelves filled with books, camera pans left{"reference_path": "cache/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
-a room filled with lots of shelves filled with books, camera pans right{"reference_path": "cache/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
-a room filled with lots of shelves filled with books, camera tilts up{"reference_path": "cache/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
-a room filled with lots of shelves filled with books, camera tilts down{"reference_path": "cache/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
-a room filled with lots of shelves filled with books, camera zooms in{"reference_path": "cache/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
-a room filled with lots of shelves filled with books, camera zooms out{"reference_path": "cache/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
-a room filled with lots of shelves filled with books, camera static{"reference_path": "cache/crop/1-1/a room filled with lots of shelves filled with books.jpg", "mask_strategy": "0"}
-an art gallery with paintings on the walls{"reference_path": "cache/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
-an art gallery with paintings on the walls, camera pans left{"reference_path": "cache/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
-an art gallery with paintings on the walls, camera pans right{"reference_path": "cache/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
-an art gallery with paintings on the walls, camera tilts up{"reference_path": "cache/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
-an art gallery with paintings on the walls, camera tilts down{"reference_path": "cache/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
-an art gallery with paintings on the walls, camera zooms in{"reference_path": "cache/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
-an art gallery with paintings on the walls, camera zooms out{"reference_path": "cache/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
-an art gallery with paintings on the walls, camera static{"reference_path": "cache/crop/1-1/an art gallery with paintings on the walls.jpg", "mask_strategy": "0"}
-a room with a lot of pictures on the walls{"reference_path": "cache/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
-a room with a lot of pictures on the walls, camera pans left{"reference_path": "cache/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
-a room with a lot of pictures on the walls, camera pans right{"reference_path": "cache/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
-a room with a lot of pictures on the walls, camera tilts up{"reference_path": "cache/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
-a room with a lot of pictures on the walls, camera tilts down{"reference_path": "cache/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
-a room with a lot of pictures on the walls, camera zooms in{"reference_path": "cache/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
-a room with a lot of pictures on the walls, camera zooms out{"reference_path": "cache/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
-a room with a lot of pictures on the walls, camera static{"reference_path": "cache/crop/1-1/a room with a lot of pictures on the walls.jpg", "mask_strategy": "0"}
-a painting of a cloudy sky next to an easel{"reference_path": "cache/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
-a painting of a cloudy sky next to an easel, camera pans left{"reference_path": "cache/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
-a painting of a cloudy sky next to an easel, camera pans right{"reference_path": "cache/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
-a painting of a cloudy sky next to an easel, camera tilts up{"reference_path": "cache/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
-a painting of a cloudy sky next to an easel, camera tilts down{"reference_path": "cache/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
-a painting of a cloudy sky next to an easel, camera zooms in{"reference_path": "cache/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
-a painting of a cloudy sky next to an easel, camera zooms out{"reference_path": "cache/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
-a painting of a cloudy sky next to an easel, camera static{"reference_path": "cache/crop/1-1/a painting of a cloudy sky next to an easel.jpg", "mask_strategy": "0"}
-a living room with a christmas tree and a rocking chair{"reference_path": "cache/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
-a living room with a christmas tree and a rocking chair, camera pans left{"reference_path": "cache/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
-a living room with a christmas tree and a rocking chair, camera pans right{"reference_path": "cache/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
-a living room with a christmas tree and a rocking chair, camera tilts up{"reference_path": "cache/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
-a living room with a christmas tree and a rocking chair, camera tilts down{"reference_path": "cache/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
-a living room with a christmas tree and a rocking chair, camera zooms in{"reference_path": "cache/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
-a living room with a christmas tree and a rocking chair, camera zooms out{"reference_path": "cache/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
-a living room with a christmas tree and a rocking chair, camera static{"reference_path": "cache/crop/1-1/a living room with a christmas tree and a rocking chair.jpg", "mask_strategy": "0"}
-a kitchen with a sink and a lot of glasses on the counter{"reference_path": "cache/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
-a kitchen with a sink and a lot of glasses on the counter, camera pans left{"reference_path": "cache/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
-a kitchen with a sink and a lot of glasses on the counter, camera pans right{"reference_path": "cache/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
-a kitchen with a sink and a lot of glasses on the counter, camera tilts up{"reference_path": "cache/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
-a kitchen with a sink and a lot of glasses on the counter, camera tilts down{"reference_path": "cache/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
-a kitchen with a sink and a lot of glasses on the counter, camera zooms in{"reference_path": "cache/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
-a kitchen with a sink and a lot of glasses on the counter, camera zooms out{"reference_path": "cache/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
-a kitchen with a sink and a lot of glasses on the counter, camera static{"reference_path": "cache/crop/1-1/a kitchen with a sink and a lot of glasses on the counter.jpg", "mask_strategy": "0"}
-a wooden table in front of a brick wall with bottles on the wall{"reference_path": "cache/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
-a wooden table in front of a brick wall with bottles on the wall, camera pans left{"reference_path": "cache/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
-a wooden table in front of a brick wall with bottles on the wall, camera pans right{"reference_path": "cache/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
-a wooden table in front of a brick wall with bottles on the wall, camera tilts up{"reference_path": "cache/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
-a wooden table in front of a brick wall with bottles on the wall, camera tilts down{"reference_path": "cache/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
-a wooden table in front of a brick wall with bottles on the wall, camera zooms in{"reference_path": "cache/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
-a wooden table in front of a brick wall with bottles on the wall, camera zooms out{"reference_path": "cache/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
-a wooden table in front of a brick wall with bottles on the wall, camera static{"reference_path": "cache/crop/1-1/a wooden table in front of a brick wall with bottles on the wall.jpg", "mask_strategy": "0"}
-a room filled with paintings and statues{"reference_path": "cache/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
-a room filled with paintings and statues, camera pans left{"reference_path": "cache/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
-a room filled with paintings and statues, camera pans right{"reference_path": "cache/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
-a room filled with paintings and statues, camera tilts up{"reference_path": "cache/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
-a room filled with paintings and statues, camera tilts down{"reference_path": "cache/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
-a room filled with paintings and statues, camera zooms in{"reference_path": "cache/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
-a room filled with paintings and statues, camera zooms out{"reference_path": "cache/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
-a room filled with paintings and statues, camera static{"reference_path": "cache/crop/1-1/a room filled with paintings and statues.jpg", "mask_strategy": "0"}
-an outdoor dining area surrounded by plants and a brick walkway{"reference_path": "cache/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
-an outdoor dining area surrounded by plants and a brick walkway, camera pans left{"reference_path": "cache/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
-an outdoor dining area surrounded by plants and a brick walkway, camera pans right{"reference_path": "cache/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
-an outdoor dining area surrounded by plants and a brick walkway, camera tilts up{"reference_path": "cache/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
-an outdoor dining area surrounded by plants and a brick walkway, camera tilts down{"reference_path": "cache/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
-an outdoor dining area surrounded by plants and a brick walkway, camera zooms in{"reference_path": "cache/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
-an outdoor dining area surrounded by plants and a brick walkway, camera zooms out{"reference_path": "cache/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
-an outdoor dining area surrounded by plants and a brick walkway, camera static{"reference_path": "cache/crop/1-1/an outdoor dining area surrounded by plants and a brick walkway.jpg", "mask_strategy": "0"}
-a room filled with books and teddy bears{"reference_path": "cache/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
-a room filled with books and teddy bears, camera pans left{"reference_path": "cache/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
-a room filled with books and teddy bears, camera pans right{"reference_path": "cache/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
-a room filled with books and teddy bears, camera tilts up{"reference_path": "cache/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
-a room filled with books and teddy bears, camera tilts down{"reference_path": "cache/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
-a room filled with books and teddy bears, camera zooms in{"reference_path": "cache/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
-a room filled with books and teddy bears, camera zooms out{"reference_path": "cache/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
-a room filled with books and teddy bears, camera static{"reference_path": "cache/crop/1-1/a room filled with books and teddy bears.jpg", "mask_strategy": "0"}
-a table and chairs in a room with a plant in the corner{"reference_path": "cache/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
-a table and chairs in a room with a plant in the corner, camera pans left{"reference_path": "cache/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
-a table and chairs in a room with a plant in the corner, camera pans right{"reference_path": "cache/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
-a table and chairs in a room with a plant in the corner, camera tilts up{"reference_path": "cache/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
-a table and chairs in a room with a plant in the corner, camera tilts down{"reference_path": "cache/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
-a table and chairs in a room with a plant in the corner, camera zooms in{"reference_path": "cache/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
-a table and chairs in a room with a plant in the corner, camera zooms out{"reference_path": "cache/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
-a table and chairs in a room with a plant in the corner, camera static{"reference_path": "cache/crop/1-1/a table and chairs in a room with a plant in the corner.jpg", "mask_strategy": "0"}
-a living room with a couch, table, and a window{"reference_path": "cache/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
-a living room with a couch, table, and a window, camera pans left{"reference_path": "cache/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
-a living room with a couch, table, and a window, camera pans right{"reference_path": "cache/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
-a living room with a couch, table, and a window, camera tilts up{"reference_path": "cache/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
-a living room with a couch, table, and a window, camera tilts down{"reference_path": "cache/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
-a living room with a couch, table, and a window, camera zooms in{"reference_path": "cache/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
-a living room with a couch, table, and a window, camera zooms out{"reference_path": "cache/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
-a living room with a couch, table, and a window, camera static{"reference_path": "cache/crop/1-1/a living room with a couch, table, and a window.jpg", "mask_strategy": "0"}
-a modern living room with wood floors and a tv{"reference_path": "cache/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
-a modern living room with wood floors and a tv, camera pans left{"reference_path": "cache/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
-a modern living room with wood floors and a tv, camera pans right{"reference_path": "cache/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
-a modern living room with wood floors and a tv, camera tilts up{"reference_path": "cache/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
-a modern living room with wood floors and a tv, camera tilts down{"reference_path": "cache/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
-a modern living room with wood floors and a tv, camera zooms in{"reference_path": "cache/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
-a modern living room with wood floors and a tv, camera zooms out{"reference_path": "cache/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
-a modern living room with wood floors and a tv, camera static{"reference_path": "cache/crop/1-1/a modern living room with wood floors and a tv.jpg", "mask_strategy": "0"}
-a room with a desk and a chair in it{"reference_path": "cache/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
-a room with a desk and a chair in it, camera pans left{"reference_path": "cache/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
-a room with a desk and a chair in it, camera pans right{"reference_path": "cache/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
-a room with a desk and a chair in it, camera tilts up{"reference_path": "cache/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
-a room with a desk and a chair in it, camera tilts down{"reference_path": "cache/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
-a room with a desk and a chair in it, camera zooms in{"reference_path": "cache/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
-a room with a desk and a chair in it, camera zooms out{"reference_path": "cache/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
-a room with a desk and a chair in it, camera static{"reference_path": "cache/crop/1-1/a room with a desk and a chair in it.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a building{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a building, camera pans left{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a building, camera pans right{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a building, camera tilts up{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a building, camera tilts down{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a building, camera zooms in{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a building, camera zooms out{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a building, camera static{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a building.jpg", "mask_strategy": "0"}
-a chair in a room next to some drawings{"reference_path": "cache/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
-a chair in a room next to some drawings, camera pans left{"reference_path": "cache/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
-a chair in a room next to some drawings, camera pans right{"reference_path": "cache/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
-a chair in a room next to some drawings, camera tilts up{"reference_path": "cache/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
-a chair in a room next to some drawings, camera tilts down{"reference_path": "cache/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
-a chair in a room next to some drawings, camera zooms in{"reference_path": "cache/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
-a chair in a room next to some drawings, camera zooms out{"reference_path": "cache/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
-a chair in a room next to some drawings, camera static{"reference_path": "cache/crop/1-1/a chair in a room next to some drawings.jpg", "mask_strategy": "0"}
-a living room with hardwood floors and a white couch{"reference_path": "cache/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
-a living room with hardwood floors and a white couch, camera pans left{"reference_path": "cache/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
-a living room with hardwood floors and a white couch, camera pans right{"reference_path": "cache/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
-a living room with hardwood floors and a white couch, camera tilts up{"reference_path": "cache/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
-a living room with hardwood floors and a white couch, camera tilts down{"reference_path": "cache/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
-a living room with hardwood floors and a white couch, camera zooms in{"reference_path": "cache/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
-a living room with hardwood floors and a white couch, camera zooms out{"reference_path": "cache/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
-a living room with hardwood floors and a white couch, camera static{"reference_path": "cache/crop/1-1/a living room with hardwood floors and a white couch.jpg", "mask_strategy": "0"}
-two people in a canoe on a lake with mountains in the background{"reference_path": "cache/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
-two people in a canoe on a lake with mountains in the background, camera pans left{"reference_path": "cache/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
-two people in a canoe on a lake with mountains in the background, camera pans right{"reference_path": "cache/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
-two people in a canoe on a lake with mountains in the background, camera tilts up{"reference_path": "cache/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
-two people in a canoe on a lake with mountains in the background, camera tilts down{"reference_path": "cache/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
-two people in a canoe on a lake with mountains in the background, camera zooms in{"reference_path": "cache/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
-two people in a canoe on a lake with mountains in the background, camera zooms out{"reference_path": "cache/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
-two people in a canoe on a lake with mountains in the background, camera static{"reference_path": "cache/crop/1-1/two people in a canoe on a lake with mountains in the background.jpg", "mask_strategy": "0"}
-an aerial view of a snowy road in a forest{"reference_path": "cache/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
-an aerial view of a snowy road in a forest, camera pans left{"reference_path": "cache/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
-an aerial view of a snowy road in a forest, camera pans right{"reference_path": "cache/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
-an aerial view of a snowy road in a forest, camera tilts up{"reference_path": "cache/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
-an aerial view of a snowy road in a forest, camera tilts down{"reference_path": "cache/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
-an aerial view of a snowy road in a forest, camera zooms in{"reference_path": "cache/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
-an aerial view of a snowy road in a forest, camera zooms out{"reference_path": "cache/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
-an aerial view of a snowy road in a forest, camera static{"reference_path": "cache/crop/1-1/an aerial view of a snowy road in a forest.jpg", "mask_strategy": "0"}
-a view of a waterfall from a distance{"reference_path": "cache/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
-a view of a waterfall from a distance, camera pans left{"reference_path": "cache/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
-a view of a waterfall from a distance, camera pans right{"reference_path": "cache/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
-a view of a waterfall from a distance, camera tilts up{"reference_path": "cache/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
-a view of a waterfall from a distance, camera tilts down{"reference_path": "cache/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
-a view of a waterfall from a distance, camera zooms in{"reference_path": "cache/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
-a view of a waterfall from a distance, camera zooms out{"reference_path": "cache/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
-a view of a waterfall from a distance, camera static{"reference_path": "cache/crop/1-1/a view of a waterfall from a distance.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a valley{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a valley, camera pans left{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a valley, camera pans right{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a valley, camera tilts up{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a valley, camera tilts down{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a valley, camera zooms in{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a valley, camera zooms out{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a valley, camera static{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a valley.jpg", "mask_strategy": "0"}
-an aerial view of a group of islands in the middle of a lake{"reference_path": "cache/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
-an aerial view of a group of islands in the middle of a lake, camera pans left{"reference_path": "cache/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
-an aerial view of a group of islands in the middle of a lake, camera pans right{"reference_path": "cache/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
-an aerial view of a group of islands in the middle of a lake, camera tilts up{"reference_path": "cache/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
-an aerial view of a group of islands in the middle of a lake, camera tilts down{"reference_path": "cache/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
-an aerial view of a group of islands in the middle of a lake, camera zooms in{"reference_path": "cache/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
-an aerial view of a group of islands in the middle of a lake, camera zooms out{"reference_path": "cache/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
-an aerial view of a group of islands in the middle of a lake, camera static{"reference_path": "cache/crop/1-1/an aerial view of a group of islands in the middle of a lake.jpg", "mask_strategy": "0"}
-an aerial view of a rocky beach in indonesia{"reference_path": "cache/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
-an aerial view of a rocky beach in indonesia, camera pans left{"reference_path": "cache/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
-an aerial view of a rocky beach in indonesia, camera pans right{"reference_path": "cache/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
-an aerial view of a rocky beach in indonesia, camera tilts up{"reference_path": "cache/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
-an aerial view of a rocky beach in indonesia, camera tilts down{"reference_path": "cache/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
-an aerial view of a rocky beach in indonesia, camera zooms in{"reference_path": "cache/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
-an aerial view of a rocky beach in indonesia, camera zooms out{"reference_path": "cache/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
-an aerial view of a rocky beach in indonesia, camera static{"reference_path": "cache/crop/1-1/an aerial view of a rocky beach in indonesia.jpg", "mask_strategy": "0"}
-fireworks in the night sky over a city{"reference_path": "cache/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
-fireworks in the night sky over a city, camera pans left{"reference_path": "cache/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
-fireworks in the night sky over a city, camera pans right{"reference_path": "cache/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
-fireworks in the night sky over a city, camera tilts up{"reference_path": "cache/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
-fireworks in the night sky over a city, camera tilts down{"reference_path": "cache/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
-fireworks in the night sky over a city, camera zooms in{"reference_path": "cache/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
-fireworks in the night sky over a city, camera zooms out{"reference_path": "cache/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
-fireworks in the night sky over a city, camera static{"reference_path": "cache/crop/1-1/fireworks in the night sky over a city.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse on a stormy day{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse on a stormy day, camera pans left{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse on a stormy day, camera pans right{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse on a stormy day, camera tilts up{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse on a stormy day, camera tilts down{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse on a stormy day, camera zooms in{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse on a stormy day, camera zooms out{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
-a large wave crashes into a lighthouse on a stormy day, camera static{"reference_path": "cache/crop/1-1/a large wave crashes into a lighthouse on a stormy day.jpg", "mask_strategy": "0"}
-a mountain range with a sky background{"reference_path": "cache/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
-a mountain range with a sky background, camera pans left{"reference_path": "cache/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
-a mountain range with a sky background, camera pans right{"reference_path": "cache/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
-a mountain range with a sky background, camera tilts up{"reference_path": "cache/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
-a mountain range with a sky background, camera tilts down{"reference_path": "cache/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
-a mountain range with a sky background, camera zooms in{"reference_path": "cache/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
-a mountain range with a sky background, camera zooms out{"reference_path": "cache/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
-a mountain range with a sky background, camera static{"reference_path": "cache/crop/1-1/a mountain range with a sky background.jpg", "mask_strategy": "0"}
-a large bonfire is burning in the night sky{"reference_path": "cache/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
-a large bonfire is burning in the night sky, camera pans left{"reference_path": "cache/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
-a large bonfire is burning in the night sky, camera pans right{"reference_path": "cache/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
-a large bonfire is burning in the night sky, camera tilts up{"reference_path": "cache/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
-a large bonfire is burning in the night sky, camera tilts down{"reference_path": "cache/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
-a large bonfire is burning in the night sky, camera zooms in{"reference_path": "cache/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
-a large bonfire is burning in the night sky, camera zooms out{"reference_path": "cache/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
-a large bonfire is burning in the night sky, camera static{"reference_path": "cache/crop/1-1/a large bonfire is burning in the night sky.jpg", "mask_strategy": "0"}
-a close-up view of the flames of a fireplace{"reference_path": "cache/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
-a close-up view of the flames of a fireplace, camera pans left{"reference_path": "cache/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
-a close-up view of the flames of a fireplace, camera pans right{"reference_path": "cache/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
-a close-up view of the flames of a fireplace, camera tilts up{"reference_path": "cache/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
-a close-up view of the flames of a fireplace, camera tilts down{"reference_path": "cache/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
-a close-up view of the flames of a fireplace, camera zooms in{"reference_path": "cache/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
-a close-up view of the flames of a fireplace, camera zooms out{"reference_path": "cache/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
-a close-up view of the flames of a fireplace, camera static{"reference_path": "cache/crop/1-1/a close-up view of the flames of a fireplace.jpg", "mask_strategy": "0"}
-a farm in the middle of the day{"reference_path": "cache/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
-a farm in the middle of the day, camera pans left{"reference_path": "cache/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
-a farm in the middle of the day, camera pans right{"reference_path": "cache/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
-a farm in the middle of the day, camera tilts up{"reference_path": "cache/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
-a farm in the middle of the day, camera tilts down{"reference_path": "cache/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
-a farm in the middle of the day, camera zooms in{"reference_path": "cache/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
-a farm in the middle of the day, camera zooms out{"reference_path": "cache/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
-a farm in the middle of the day, camera static{"reference_path": "cache/crop/1-1/a farm in the middle of the day.jpg", "mask_strategy": "0"}
-a flock of birds flying over a tree at sunset{"reference_path": "cache/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
-a flock of birds flying over a tree at sunset, camera pans left{"reference_path": "cache/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
-a flock of birds flying over a tree at sunset, camera pans right{"reference_path": "cache/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
-a flock of birds flying over a tree at sunset, camera tilts up{"reference_path": "cache/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
-a flock of birds flying over a tree at sunset, camera tilts down{"reference_path": "cache/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
-a flock of birds flying over a tree at sunset, camera zooms in{"reference_path": "cache/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
-a flock of birds flying over a tree at sunset, camera zooms out{"reference_path": "cache/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
-a flock of birds flying over a tree at sunset, camera static{"reference_path": "cache/crop/1-1/a flock of birds flying over a tree at sunset.jpg", "mask_strategy": "0"}
-a captivating scene featuring a spiral galaxy shining brilliantly in the night sky{"reference_path": "cache/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
-a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera pans left{"reference_path": "cache/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
-a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera pans right{"reference_path": "cache/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
-a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera tilts up{"reference_path": "cache/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
-a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera tilts down{"reference_path": "cache/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
-a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera zooms in{"reference_path": "cache/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
-a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera zooms out{"reference_path": "cache/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
-a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera static{"reference_path": "cache/crop/1-1/a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg", "mask_strategy": "0"}
-a mountain with snow on it{"reference_path": "cache/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
-a mountain with snow on it, camera pans left{"reference_path": "cache/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
-a mountain with snow on it, camera pans right{"reference_path": "cache/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
-a mountain with snow on it, camera tilts up{"reference_path": "cache/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
-a mountain with snow on it, camera tilts down{"reference_path": "cache/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
-a mountain with snow on it, camera zooms in{"reference_path": "cache/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
-a mountain with snow on it, camera zooms out{"reference_path": "cache/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
-a mountain with snow on it, camera static{"reference_path": "cache/crop/1-1/a mountain with snow on it.jpg", "mask_strategy": "0"}
-a bridge that is in the middle of a river{"reference_path": "cache/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
-a bridge that is in the middle of a river, camera pans left{"reference_path": "cache/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
-a bridge that is in the middle of a river, camera pans right{"reference_path": "cache/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
-a bridge that is in the middle of a river, camera tilts up{"reference_path": "cache/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
-a bridge that is in the middle of a river, camera tilts down{"reference_path": "cache/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
-a bridge that is in the middle of a river, camera zooms in{"reference_path": "cache/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
-a bridge that is in the middle of a river, camera zooms out{"reference_path": "cache/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
-a bridge that is in the middle of a river, camera static{"reference_path": "cache/crop/1-1/a bridge that is in the middle of a river.jpg", "mask_strategy": "0"}
-a group of people standing on top of a green hill{"reference_path": "cache/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
-a group of people standing on top of a green hill, camera pans left{"reference_path": "cache/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
-a group of people standing on top of a green hill, camera pans right{"reference_path": "cache/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
-a group of people standing on top of a green hill, camera tilts up{"reference_path": "cache/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
-a group of people standing on top of a green hill, camera tilts down{"reference_path": "cache/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
-a group of people standing on top of a green hill, camera zooms in{"reference_path": "cache/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
-a group of people standing on top of a green hill, camera zooms out{"reference_path": "cache/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
-a group of people standing on top of a green hill, camera static{"reference_path": "cache/crop/1-1/a group of people standing on top of a green hill.jpg", "mask_strategy": "0"}
-a sandy beach with a wooden pier in the water{"reference_path": "cache/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
-a sandy beach with a wooden pier in the water, camera pans left{"reference_path": "cache/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
-a sandy beach with a wooden pier in the water, camera pans right{"reference_path": "cache/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
-a sandy beach with a wooden pier in the water, camera tilts up{"reference_path": "cache/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
-a sandy beach with a wooden pier in the water, camera tilts down{"reference_path": "cache/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
-a sandy beach with a wooden pier in the water, camera zooms in{"reference_path": "cache/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
-a sandy beach with a wooden pier in the water, camera zooms out{"reference_path": "cache/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
-a sandy beach with a wooden pier in the water, camera static{"reference_path": "cache/crop/1-1/a sandy beach with a wooden pier in the water.jpg", "mask_strategy": "0"}
-a lake surrounded by mountains and flowers{"reference_path": "cache/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
-a lake surrounded by mountains and flowers, camera pans left{"reference_path": "cache/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
-a lake surrounded by mountains and flowers, camera pans right{"reference_path": "cache/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
-a lake surrounded by mountains and flowers, camera tilts up{"reference_path": "cache/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
-a lake surrounded by mountains and flowers, camera tilts down{"reference_path": "cache/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
-a lake surrounded by mountains and flowers, camera zooms in{"reference_path": "cache/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
-a lake surrounded by mountains and flowers, camera zooms out{"reference_path": "cache/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
-a lake surrounded by mountains and flowers, camera static{"reference_path": "cache/crop/1-1/a lake surrounded by mountains and flowers.jpg", "mask_strategy": "0"}
-a hot-air balloon flying over a desert landscape{"reference_path": "cache/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
-a hot-air balloon flying over a desert landscape, camera pans left{"reference_path": "cache/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
-a hot-air balloon flying over a desert landscape, camera pans right{"reference_path": "cache/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
-a hot-air balloon flying over a desert landscape, camera tilts up{"reference_path": "cache/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
-a hot-air balloon flying over a desert landscape, camera tilts down{"reference_path": "cache/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
-a hot-air balloon flying over a desert landscape, camera zooms in{"reference_path": "cache/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
-a hot-air balloon flying over a desert landscape, camera zooms out{"reference_path": "cache/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
-a hot-air balloon flying over a desert landscape, camera static{"reference_path": "cache/crop/1-1/a hot-air balloon flying over a desert landscape.jpg", "mask_strategy": "0"}
-several hot air balloons flying over a city{"reference_path": "cache/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
-several hot air balloons flying over a city, camera pans left{"reference_path": "cache/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
-several hot air balloons flying over a city, camera pans right{"reference_path": "cache/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
-several hot air balloons flying over a city, camera tilts up{"reference_path": "cache/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
-several hot air balloons flying over a city, camera tilts down{"reference_path": "cache/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
-several hot air balloons flying over a city, camera zooms in{"reference_path": "cache/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
-several hot air balloons flying over a city, camera zooms out{"reference_path": "cache/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
-several hot air balloons flying over a city, camera static{"reference_path": "cache/crop/1-1/several hot air balloons flying over a city.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a field{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a field, camera pans left{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a field, camera pans right{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a field, camera tilts up{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a field, camera tilts down{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a field, camera zooms in{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a field, camera zooms out{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
-a group of hot air balloons flying over a field, camera static{"reference_path": "cache/crop/1-1/a group of hot air balloons flying over a field.jpg", "mask_strategy": "0"}
-a large wave crashes over a rocky cliff{"reference_path": "cache/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
-a large wave crashes over a rocky cliff, camera pans left{"reference_path": "cache/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
-a large wave crashes over a rocky cliff, camera pans right{"reference_path": "cache/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
-a large wave crashes over a rocky cliff, camera tilts up{"reference_path": "cache/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
-a large wave crashes over a rocky cliff, camera tilts down{"reference_path": "cache/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
-a large wave crashes over a rocky cliff, camera zooms in{"reference_path": "cache/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
-a large wave crashes over a rocky cliff, camera zooms out{"reference_path": "cache/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
-a large wave crashes over a rocky cliff, camera static{"reference_path": "cache/crop/1-1/a large wave crashes over a rocky cliff.jpg", "mask_strategy": "0"}
-the sun is setting over a lake in the mountains{"reference_path": "cache/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
-the sun is setting over a lake in the mountains, camera pans left{"reference_path": "cache/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
-the sun is setting over a lake in the mountains, camera pans right{"reference_path": "cache/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
-the sun is setting over a lake in the mountains, camera tilts up{"reference_path": "cache/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
-the sun is setting over a lake in the mountains, camera tilts down{"reference_path": "cache/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
-the sun is setting over a lake in the mountains, camera zooms in{"reference_path": "cache/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
-the sun is setting over a lake in the mountains, camera zooms out{"reference_path": "cache/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
-the sun is setting over a lake in the mountains, camera static{"reference_path": "cache/crop/1-1/the sun is setting over a lake in the mountains.jpg", "mask_strategy": "0"}
-a mountain range with snow on the ground{"reference_path": "cache/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
-a mountain range with snow on the ground, camera pans left{"reference_path": "cache/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
-a mountain range with snow on the ground, camera pans right{"reference_path": "cache/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
-a mountain range with snow on the ground, camera tilts up{"reference_path": "cache/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
-a mountain range with snow on the ground, camera tilts down{"reference_path": "cache/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
-a mountain range with snow on the ground, camera zooms in{"reference_path": "cache/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
-a mountain range with snow on the ground, camera zooms out{"reference_path": "cache/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
-a mountain range with snow on the ground, camera static{"reference_path": "cache/crop/1-1/a mountain range with snow on the ground.jpg", "mask_strategy": "0"}
-sun rays shining through clouds over a lake{"reference_path": "cache/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
-sun rays shining through clouds over a lake, camera pans left{"reference_path": "cache/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
-sun rays shining through clouds over a lake, camera pans right{"reference_path": "cache/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
-sun rays shining through clouds over a lake, camera tilts up{"reference_path": "cache/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
-sun rays shining through clouds over a lake, camera tilts down{"reference_path": "cache/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
-sun rays shining through clouds over a lake, camera zooms in{"reference_path": "cache/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
-sun rays shining through clouds over a lake, camera zooms out{"reference_path": "cache/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
-sun rays shining through clouds over a lake, camera static{"reference_path": "cache/crop/1-1/sun rays shining through clouds over a lake.jpg", "mask_strategy": "0"}
-a boat sits on the shore of a lake with mt fuji in the background{"reference_path": "cache/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
-a boat sits on the shore of a lake with mt fuji in the background, camera pans left{"reference_path": "cache/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
-a boat sits on the shore of a lake with mt fuji in the background, camera pans right{"reference_path": "cache/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
-a boat sits on the shore of a lake with mt fuji in the background, camera tilts up{"reference_path": "cache/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
-a boat sits on the shore of a lake with mt fuji in the background, camera tilts down{"reference_path": "cache/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
-a boat sits on the shore of a lake with mt fuji in the background, camera zooms in{"reference_path": "cache/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
-a boat sits on the shore of a lake with mt fuji in the background, camera zooms out{"reference_path": "cache/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
-a boat sits on the shore of a lake with mt fuji in the background, camera static{"reference_path": "cache/crop/1-1/a boat sits on the shore of a lake with mt fuji in the background.jpg", "mask_strategy": "0"}
-a foggy road with trees in the distance{"reference_path": "cache/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
-a foggy road with trees in the distance, camera pans left{"reference_path": "cache/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
-a foggy road with trees in the distance, camera pans right{"reference_path": "cache/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
-a foggy road with trees in the distance, camera tilts up{"reference_path": "cache/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
-a foggy road with trees in the distance, camera tilts down{"reference_path": "cache/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
-a foggy road with trees in the distance, camera zooms in{"reference_path": "cache/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
-a foggy road with trees in the distance, camera zooms out{"reference_path": "cache/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
-a foggy road with trees in the distance, camera static{"reference_path": "cache/crop/1-1/a foggy road with trees in the distance.jpg", "mask_strategy": "0"}
-two swans swimming on a lake in the fog{"reference_path": "cache/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
-two swans swimming on a lake in the fog, camera pans left{"reference_path": "cache/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
-two swans swimming on a lake in the fog, camera pans right{"reference_path": "cache/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
-two swans swimming on a lake in the fog, camera tilts up{"reference_path": "cache/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
-two swans swimming on a lake in the fog, camera tilts down{"reference_path": "cache/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
-two swans swimming on a lake in the fog, camera zooms in{"reference_path": "cache/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
-two swans swimming on a lake in the fog, camera zooms out{"reference_path": "cache/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
-two swans swimming on a lake in the fog, camera static{"reference_path": "cache/crop/1-1/two swans swimming on a lake in the fog.jpg", "mask_strategy": "0"}
-the sun is shining through the trees near a waterfall{"reference_path": "cache/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
-the sun is shining through the trees near a waterfall, camera pans left{"reference_path": "cache/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
-the sun is shining through the trees near a waterfall, camera pans right{"reference_path": "cache/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
-the sun is shining through the trees near a waterfall, camera tilts up{"reference_path": "cache/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
-the sun is shining through the trees near a waterfall, camera tilts down{"reference_path": "cache/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
-the sun is shining through the trees near a waterfall, camera zooms in{"reference_path": "cache/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
-the sun is shining through the trees near a waterfall, camera zooms out{"reference_path": "cache/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
-the sun is shining through the trees near a waterfall, camera static{"reference_path": "cache/crop/1-1/the sun is shining through the trees near a waterfall.jpg", "mask_strategy": "0"}
-a sandy beach with palm trees on the shore{"reference_path": "cache/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
-a sandy beach with palm trees on the shore, camera pans left{"reference_path": "cache/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
-a sandy beach with palm trees on the shore, camera pans right{"reference_path": "cache/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
-a sandy beach with palm trees on the shore, camera tilts up{"reference_path": "cache/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
-a sandy beach with palm trees on the shore, camera tilts down{"reference_path": "cache/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
-a sandy beach with palm trees on the shore, camera zooms in{"reference_path": "cache/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
-a sandy beach with palm trees on the shore, camera zooms out{"reference_path": "cache/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
-a sandy beach with palm trees on the shore, camera static{"reference_path": "cache/crop/1-1/a sandy beach with palm trees on the shore.jpg", "mask_strategy": "0"}
-an aerial view of a body of water and a beach{"reference_path": "cache/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
-an aerial view of a body of water and a beach, camera pans left{"reference_path": "cache/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
-an aerial view of a body of water and a beach, camera pans right{"reference_path": "cache/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
-an aerial view of a body of water and a beach, camera tilts up{"reference_path": "cache/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
-an aerial view of a body of water and a beach, camera tilts down{"reference_path": "cache/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
-an aerial view of a body of water and a beach, camera zooms in{"reference_path": "cache/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
-an aerial view of a body of water and a beach, camera zooms out{"reference_path": "cache/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
-an aerial view of a body of water and a beach, camera static{"reference_path": "cache/crop/1-1/an aerial view of a body of water and a beach.jpg", "mask_strategy": "0"}
-a foggy field that has trees in the grass{"reference_path": "cache/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
-a foggy field that has trees in the grass, camera pans left{"reference_path": "cache/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
-a foggy field that has trees in the grass, camera pans right{"reference_path": "cache/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
-a foggy field that has trees in the grass, camera tilts up{"reference_path": "cache/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
-a foggy field that has trees in the grass, camera tilts down{"reference_path": "cache/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
-a foggy field that has trees in the grass, camera zooms in{"reference_path": "cache/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
-a foggy field that has trees in the grass, camera zooms out{"reference_path": "cache/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
-a foggy field that has trees in the grass, camera static{"reference_path": "cache/crop/1-1/a foggy field that has trees in the grass.jpg", "mask_strategy": "0"}
-a foggy landscape with trees and hills in the distance{"reference_path": "cache/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
-a foggy landscape with trees and hills in the distance, camera pans left{"reference_path": "cache/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
-a foggy landscape with trees and hills in the distance, camera pans right{"reference_path": "cache/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
-a foggy landscape with trees and hills in the distance, camera tilts up{"reference_path": "cache/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
-a foggy landscape with trees and hills in the distance, camera tilts down{"reference_path": "cache/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
-a foggy landscape with trees and hills in the distance, camera zooms in{"reference_path": "cache/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
-a foggy landscape with trees and hills in the distance, camera zooms out{"reference_path": "cache/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
-a foggy landscape with trees and hills in the distance, camera static{"reference_path": "cache/crop/1-1/a foggy landscape with trees and hills in the distance.jpg", "mask_strategy": "0"}
-a large wave in the ocean with a lot of spray coming from it{"reference_path": "cache/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
-a large wave in the ocean with a lot of spray coming from it, camera pans left{"reference_path": "cache/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
-a large wave in the ocean with a lot of spray coming from it, camera pans right{"reference_path": "cache/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
-a large wave in the ocean with a lot of spray coming from it, camera tilts up{"reference_path": "cache/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
-a large wave in the ocean with a lot of spray coming from it, camera tilts down{"reference_path": "cache/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
-a large wave in the ocean with a lot of spray coming from it, camera zooms in{"reference_path": "cache/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
-a large wave in the ocean with a lot of spray coming from it, camera zooms out{"reference_path": "cache/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
-a large wave in the ocean with a lot of spray coming from it, camera static{"reference_path": "cache/crop/1-1/a large wave in the ocean with a lot of spray coming from it.jpg", "mask_strategy": "0"}
-a waterfall is shown in the middle of a lush green hillside{"reference_path": "cache/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a waterfall is shown in the middle of a lush green hillside, camera pans left{"reference_path": "cache/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a waterfall is shown in the middle of a lush green hillside, camera pans right{"reference_path": "cache/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a waterfall is shown in the middle of a lush green hillside, camera tilts up{"reference_path": "cache/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a waterfall is shown in the middle of a lush green hillside, camera tilts down{"reference_path": "cache/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a waterfall is shown in the middle of a lush green hillside, camera zooms in{"reference_path": "cache/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a waterfall is shown in the middle of a lush green hillside, camera zooms out{"reference_path": "cache/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a waterfall is shown in the middle of a lush green hillside, camera static{"reference_path": "cache/crop/1-1/a waterfall is shown in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-an aerial view of a curvy road in the middle of a forest{"reference_path": "cache/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
-an aerial view of a curvy road in the middle of a forest, camera pans left{"reference_path": "cache/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
-an aerial view of a curvy road in the middle of a forest, camera pans right{"reference_path": "cache/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
-an aerial view of a curvy road in the middle of a forest, camera tilts up{"reference_path": "cache/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
-an aerial view of a curvy road in the middle of a forest, camera tilts down{"reference_path": "cache/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
-an aerial view of a curvy road in the middle of a forest, camera zooms in{"reference_path": "cache/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
-an aerial view of a curvy road in the middle of a forest, camera zooms out{"reference_path": "cache/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
-an aerial view of a curvy road in the middle of a forest, camera static{"reference_path": "cache/crop/1-1/an aerial view of a curvy road in the middle of a forest.jpg", "mask_strategy": "0"}
-a mountain covered in snow with evergreen trees{"reference_path": "cache/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
-a mountain covered in snow with evergreen trees, camera pans left{"reference_path": "cache/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
-a mountain covered in snow with evergreen trees, camera pans right{"reference_path": "cache/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
-a mountain covered in snow with evergreen trees, camera tilts up{"reference_path": "cache/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
-a mountain covered in snow with evergreen trees, camera tilts down{"reference_path": "cache/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
-a mountain covered in snow with evergreen trees, camera zooms in{"reference_path": "cache/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
-a mountain covered in snow with evergreen trees, camera zooms out{"reference_path": "cache/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
-a mountain covered in snow with evergreen trees, camera static{"reference_path": "cache/crop/1-1/a mountain covered in snow with evergreen trees.jpg", "mask_strategy": "0"}
-a very large waterfall in the middle of the day{"reference_path": "cache/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
-a very large waterfall in the middle of the day, camera pans left{"reference_path": "cache/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
-a very large waterfall in the middle of the day, camera pans right{"reference_path": "cache/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
-a very large waterfall in the middle of the day, camera tilts up{"reference_path": "cache/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
-a very large waterfall in the middle of the day, camera tilts down{"reference_path": "cache/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
-a very large waterfall in the middle of the day, camera zooms in{"reference_path": "cache/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
-a very large waterfall in the middle of the day, camera zooms out{"reference_path": "cache/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
-a very large waterfall in the middle of the day, camera static{"reference_path": "cache/crop/1-1/a very large waterfall in the middle of the day.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a lush green hillside{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a lush green hillside, camera pans left{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a lush green hillside, camera pans right{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a lush green hillside, camera tilts up{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a lush green hillside, camera tilts down{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a lush green hillside, camera zooms in{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a lush green hillside, camera zooms out{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a large waterfall in the middle of a lush green hillside, camera static{"reference_path": "cache/crop/1-1/a large waterfall in the middle of a lush green hillside.jpg", "mask_strategy": "0"}
-a brown bear in the water with a fish in its mouth{"reference_path": "cache/crop/1-1/a brown bear in the water with a fish in its mouth.jpg", "mask_strategy": "0"}
-a close-up of a hippopotamus eating grass in a field{"reference_path": "cache/crop/1-1/a close-up of a hippopotamus eating grass in a field.jpg", "mask_strategy": "0"}
-a sea turtle swimming in the ocean under the water{"reference_path": "cache/crop/1-1/a sea turtle swimming in the ocean under the water.jpg", "mask_strategy": "0"}
-two bees are flying over a lavender plant{"reference_path": "cache/crop/1-1/two bees are flying over a lavender plant.jpg", "mask_strategy": "0"}
-the otter is standing in the water{"reference_path": "cache/crop/1-1/the otter is standing in the water.jpg", "mask_strategy": "0"}
-a dog carrying a soccer ball in its mouth{"reference_path": "cache/crop/1-1/a dog carrying a soccer ball in its mouth.jpg", "mask_strategy": "0"}
-an eagle is flying over a mountain with trees in the background{"reference_path": "cache/crop/1-1/an eagle is flying over a mountain with trees in the background.jpg", "mask_strategy": "0"}
-a couple of horses are running in the dirt{"reference_path": "cache/crop/1-1/a couple of horses are running in the dirt.jpg", "mask_strategy": "0"}
-a highland cow with long horns standing in a field{"reference_path": "cache/crop/1-1/a highland cow with long horns standing in a field.jpg", "mask_strategy": "0"}
-a monkey is holding a banana in its mouth{"reference_path": "cache/crop/1-1/a monkey is holding a banana in its mouth.jpg", "mask_strategy": "0"}
-a large rhino grazing in the grass near a bush{"reference_path": "cache/crop/1-1/a large rhino grazing in the grass near a bush.jpg", "mask_strategy": "0"}
-a butterfly sits on top of a purple flower{"reference_path": "cache/crop/1-1/a butterfly sits on top of a purple flower.jpg", "mask_strategy": "0"}
-an alligator is covered in green plants in the water{"reference_path": "cache/crop/1-1/an alligator is covered in green plants in the water.jpg", "mask_strategy": "0"}
-a red panda eating bamboo in a zoo{"reference_path": "cache/crop/1-1/a red panda eating bamboo in a zoo.jpg", "mask_strategy": "0"}
-a monochromatic video capturing a cat's gaze into the camera{"reference_path": "cache/crop/1-1/a monochromatic video capturing a cat's gaze into the camera.jpg", "mask_strategy": "0"}
-a frog sitting on top of water lily leaves{"reference_path": "cache/crop/1-1/a frog sitting on top of water lily leaves.jpg", "mask_strategy": "0"}
-a lion is roaring in the wild{"reference_path": "cache/crop/1-1/a lion is roaring in the wild.jpg", "mask_strategy": "0"}
-a seagull is flying towards a person's hand{"reference_path": "cache/crop/1-1/a seagull is flying towards a person's hand.jpg", "mask_strategy": "0"}
-a yellow and white jellyfish is floating in the ocean{"reference_path": "cache/crop/1-1/a yellow and white jellyfish is floating in the ocean.jpg", "mask_strategy": "0"}
-a group of jellyfish swimming in an aquarium{"reference_path": "cache/crop/1-1/a group of jellyfish swimming in an aquarium.jpg", "mask_strategy": "0"}
-a clown fish hiding in a purple anemone{"reference_path": "cache/crop/1-1/a clown fish hiding in a purple anemone.jpg", "mask_strategy": "0"}
-a snake sitting on the ground next to a bowl{"reference_path": "cache/crop/1-1/a snake sitting on the ground next to a bowl.jpg", "mask_strategy": "0"}
-a brown and white cow eating hay{"reference_path": "cache/crop/1-1/a brown and white cow eating hay.jpg", "mask_strategy": "0"}
-a seal swimming in the water{"reference_path": "cache/crop/1-1/a seal swimming in the water.jpg", "mask_strategy": "0"}
-a panda bear is eating a piece of bamboo{"reference_path": "cache/crop/1-1/a panda bear is eating a piece of bamboo.jpg", "mask_strategy": "0"}
-a small bird sits on a moss covered branch{"reference_path": "cache/crop/1-1/a small bird sits on a moss covered branch.jpg", "mask_strategy": "0"}
-a bird with a fish in its beak flying over a field{"reference_path": "cache/crop/1-1/a bird with a fish in its beak flying over a field.jpg", "mask_strategy": "0"}
-a large flock of birds flying in the sky{"reference_path": "cache/crop/1-1/a large flock of birds flying in the sky.jpg", "mask_strategy": "0"}
-a bald eagle flying over a tree filled forest{"reference_path": "cache/crop/1-1/a bald eagle flying over a tree filled forest.jpg", "mask_strategy": "0"}
-a giraffe walking in a field{"reference_path": "cache/crop/1-1/a giraffe walking in a field.jpg", "mask_strategy": "0"}
-a lioness yawning in a field{"reference_path": "cache/crop/1-1/a lioness yawning in a field.jpg", "mask_strategy": "0"}
-a little crab scurried on the sandy beach{"reference_path": "cache/crop/1-1/a little crab scurried on the sandy beach.jpg", "mask_strategy": "0"}
-a warthog is walking in the grass{"reference_path": "cache/crop/1-1/a warthog is walking in the grass.jpg", "mask_strategy": "0"}
-a penguin walking on a beach near the water{"reference_path": "cache/crop/1-1/a penguin walking on a beach near the water.jpg", "mask_strategy": "0"}
-a tiger walking through a wooded area{"reference_path": "cache/crop/1-1/a tiger walking through a wooded area.jpg", "mask_strategy": "0"}
-a tiger walking on a dirt path in the woods{"reference_path": "cache/crop/1-1/a tiger walking on a dirt path in the woods.jpg", "mask_strategy": "0"}
-a small monkey holding a piece of food in it's mouth{"reference_path": "cache/crop/1-1/a small monkey holding a piece of food in it's mouth.jpg", "mask_strategy": "0"}
-a squirrel sitting on the ground eating a piece of bread{"reference_path": "cache/crop/1-1/a squirrel sitting on the ground eating a piece of bread.jpg", "mask_strategy": "0"}
-a group of fish swimming over a coral reef{"reference_path": "cache/crop/1-1/a group of fish swimming over a coral reef.jpg", "mask_strategy": "0"}
-a toad is sitting on top of some moss{"reference_path": "cache/crop/1-1/a toad is sitting on top of some moss.jpg", "mask_strategy": "0"}
-a great white shark swimming in the ocean{"reference_path": "cache/crop/1-1/a great white shark swimming in the ocean.jpg", "mask_strategy": "0"}
-a group of camels resting in the desert{"reference_path": "cache/crop/1-1/a group of camels resting in the desert.jpg", "mask_strategy": "0"}
-two sheep grazing in the grass next to a wooden bridge{"reference_path": "cache/crop/1-1/two sheep grazing in the grass next to a wooden bridge.jpg", "mask_strategy": "0"}
-an elephant walking through a forest{"reference_path": "cache/crop/1-1/an elephant walking through a forest.jpg", "mask_strategy": "0"}
-a white rooster standing in a grassy field{"reference_path": "cache/crop/1-1/a white rooster standing in a grassy field.jpg", "mask_strategy": "0"}
-a zebra walking across a dirt road near a field{"reference_path": "cache/crop/1-1/a zebra walking across a dirt road near a field.jpg", "mask_strategy": "0"}
-cars are driving down a street lined with tall trees{"reference_path": "cache/crop/1-1/cars are driving down a street lined with tall trees.jpg", "mask_strategy": "0"}
-the cars on the street are waiting for the traffic lights{"reference_path": "cache/crop/1-1/the cars on the street are waiting for the traffic lights.jpg", "mask_strategy": "0"}
-a bicycle leaning against a fence in the snow{"reference_path": "cache/crop/1-1/a bicycle leaning against a fence in the snow.jpg", "mask_strategy": "0"}
-a blue fishing boat is navigating in the ocean next to a cruise ship{"reference_path": "cache/crop/1-1/a blue fishing boat is navigating in the ocean next to a cruise ship.jpg", "mask_strategy": "0"}
-a blue car driving down a dirt road near train tracks{"reference_path": "cache/crop/1-1/a blue car driving down a dirt road near train tracks.jpg", "mask_strategy": "0"}
-a sailboat is drifting on the ocean{"reference_path": "cache/crop/1-1/a sailboat is drifting on the ocean.jpg", "mask_strategy": "0"}
-a couple of boats floating on a body of water{"reference_path": "cache/crop/1-1/a couple of boats floating on a body of water.jpg", "mask_strategy": "0"}
-a city street with cars driving in the rain{"reference_path": "cache/crop/1-1/a city street with cars driving in the rain.jpg", "mask_strategy": "0"}
-a red and white tram traveling down a snowy street{"reference_path": "cache/crop/1-1/a red and white tram traveling down a snowy street.jpg", "mask_strategy": "0"}
-a city bus driving down a snowy street at night{"reference_path": "cache/crop/1-1/a city bus driving down a snowy street at night.jpg", "mask_strategy": "0"}
-a green toy car is sitting on the ground{"reference_path": "cache/crop/1-1/a green toy car is sitting on the ground.jpg", "mask_strategy": "0"}
-a train traveling down tracks through the woods with leaves on the ground{"reference_path": "cache/crop/1-1/a train traveling down tracks through the woods with leaves on the ground.jpg", "mask_strategy": "0"}
-a man in a small boat fishing in the ocean{"reference_path": "cache/crop/1-1/a man in a small boat fishing in the ocean.jpg", "mask_strategy": "0"}
-an airplane is flying through the sky at sunset{"reference_path": "cache/crop/1-1/an airplane is flying through the sky at sunset.jpg", "mask_strategy": "0"}
-an old rusty car sits in the middle of a field{"reference_path": "cache/crop/1-1/an old rusty car sits in the middle of a field.jpg", "mask_strategy": "0"}
-a motorcycle driving down a road{"reference_path": "cache/crop/1-1/a motorcycle driving down a road.jpg", "mask_strategy": "0"}
-a blue train traveling through a lush green area{"reference_path": "cache/crop/1-1/a blue train traveling through a lush green area.jpg", "mask_strategy": "0"}
-a white car is swiftly driving on a dirt road near a bush, kicking up dust{"reference_path": "cache/crop/1-1/a white car is swiftly driving on a dirt road near a bush, kicking up dust.jpg", "mask_strategy": "0"}
-a large cargo ship sailing in the water{"reference_path": "cache/crop/1-1/a large cargo ship sailing in the water.jpg", "mask_strategy": "0"}
-the red Alfa sports car is speeding down the road{"reference_path": "cache/crop/1-1/the red Alfa sports car is speeding down the road.jpg", "mask_strategy": "0"}
-two cars that have been involved in a violent collision{"reference_path": "cache/crop/1-1/two cars that have been involved in a violent collision.jpg", "mask_strategy": "0"}
-a red double decker bus driving down a street{"reference_path": "cache/crop/1-1/a red double decker bus driving down a street.jpg", "mask_strategy": "0"}
-A red sports car driving through sand, kicking up a large amount of dust{"reference_path": "cache/crop/1-1/A red sports car driving through sand, kicking up a large amount of dust.jpg", "mask_strategy": "0"}
-a yellow toy car parked on a rock near the water{"reference_path": "cache/crop/1-1/a yellow toy car parked on a rock near the water.jpg", "mask_strategy": "0"}
-a space shuttle taking off into the sky{"reference_path": "cache/crop/1-1/a space shuttle taking off into the sky.jpg", "mask_strategy": "0"}
-a steam train traveling through the woods{"reference_path": "cache/crop/1-1/a steam train traveling through the woods.jpg", "mask_strategy": "0"}
-a group of buses parked at a bus station{"reference_path": "cache/crop/1-1/a group of buses parked at a bus station.jpg", "mask_strategy": "0"}
-A bunch of cars are driving on a highway{"reference_path": "cache/crop/1-1/A bunch of cars are driving on a highway.jpg", "mask_strategy": "0"}
-a white and blue airplane flying in the sky{"reference_path": "cache/crop/1-1/a white and blue airplane flying in the sky.jpg", "mask_strategy": "0"}
-A space station orbited above the Earth{"reference_path": "cache/crop/1-1/A space station orbited above the Earth.jpg", "mask_strategy": "0"}
-A yellow boat is cruising in front of a bridge{"reference_path": "cache/crop/1-1/A yellow boat is cruising in front of a bridge.jpg", "mask_strategy": "0"}
-tangerines in a metal bowl on a table{"reference_path": "cache/crop/1-1/tangerines in a metal bowl on a table.jpg", "mask_strategy": "0"}
-a shadow of a hand reaching for a leaf{"reference_path": "cache/crop/1-1/a shadow of a hand reaching for a leaf.jpg", "mask_strategy": "0"}
-A teddy bear is climbing over a wooden fence{"reference_path": "cache/crop/1-1/A teddy bear is climbing over a wooden fence.jpg", "mask_strategy": "0"}
-a book on fire with flames coming out of it{"reference_path": "cache/crop/1-1/a book on fire with flames coming out of it.jpg", "mask_strategy": "0"}
-a close-up of a pink rose with water droplets on it{"reference_path": "cache/crop/1-1/a close-up of a pink rose with water droplets on it.jpg", "mask_strategy": "0"}
-a person is cooking meat on a grill with flames{"reference_path": "cache/crop/1-1/a person is cooking meat on a grill with flames.jpg", "mask_strategy": "0"}
-a snowman wearing a santa hat and scarf{"reference_path": "cache/crop/1-1/a snowman wearing a santa hat and scarf.jpg", "mask_strategy": "0"}
-a person holding a sparkler in their hand{"reference_path": "cache/crop/1-1/a person holding a sparkler in their hand.jpg", "mask_strategy": "0"}
-a teddy bear sitting on a moss covered ground{"reference_path": "cache/crop/1-1/a teddy bear sitting on a moss covered ground.jpg", "mask_strategy": "0"}
-a statue of a lion is sitting on a pedestal{"reference_path": "cache/crop/1-1/a statue of a lion is sitting on a pedestal.jpg", "mask_strategy": "0"}
-metal balls are suspended in the air{"reference_path": "cache/crop/1-1/metal balls are suspended in the air.jpg", "mask_strategy": "0"}
-a close up of a bunch of green grapes{"reference_path": "cache/crop/1-1/a close up of a bunch of green grapes.jpg", "mask_strategy": "0"}
-a close-up view of a green plant with unfurled fronds{"reference_path": "cache/crop/1-1/a close-up view of a green plant with unfurled fronds.jpg", "mask_strategy": "0"}
-an orange mushroom sitting on top of a tree stump in the woods{"reference_path": "cache/crop/1-1/an orange mushroom sitting on top of a tree stump in the woods.jpg", "mask_strategy": "0"}
-a stack of pancakes covered in syrup and fruit{"reference_path": "cache/crop/1-1/a stack of pancakes covered in syrup and fruit.jpg", "mask_strategy": "0"}
-a plate of spaghetti with spinach and tomatoes{"reference_path": "cache/crop/1-1/a plate of spaghetti with spinach and tomatoes.jpg", "mask_strategy": "0"}
-a pink lotus flower in the middle of a pond{"reference_path": "cache/crop/1-1/a pink lotus flower in the middle of a pond.jpg", "mask_strategy": "0"}
-a person holding a sparkler in front of a sunset{"reference_path": "cache/crop/1-1/a person holding a sparkler in front of a sunset.jpg", "mask_strategy": "0"}
-a pink rose is blooming in a garden{"reference_path": "cache/crop/1-1/a pink rose is blooming in a garden.jpg", "mask_strategy": "0"}
-a snow man holding a lantern in the snow{"reference_path": "cache/crop/1-1/a snow man holding a lantern in the snow.jpg", "mask_strategy": "0"}
-a stack of chocolate cookies with a bite taken out of it{"reference_path": "cache/crop/1-1/a stack of chocolate cookies with a bite taken out of it.jpg", "mask_strategy": "0"}
-a white plate topped with eggs, toast, tomatoes, and a sausage{"reference_path": "cache/crop/1-1/a white plate topped with eggs, toast, tomatoes, and a sausage.jpg", "mask_strategy": "0"}
-a yellow water lily is floating in a pond{"reference_path": "cache/crop/1-1/a yellow water lily is floating in a pond.jpg", "mask_strategy": "0"}
-an astronaut floating in space with the earth in the background{"reference_path": "cache/crop/1-1/an astronaut floating in space with the earth in the background.jpg", "mask_strategy": "0"}
-A little girl, lost in thought, is quietly sitting on the bus{"reference_path": "cache/crop/1-1/A little girl, lost in thought, is quietly sitting on the bus.jpg", "mask_strategy": "0"}
-a man holding a tray in front of a brick wall{"reference_path": "cache/crop/1-1/a man holding a tray in front of a brick wall.jpg", "mask_strategy": "0"}
-an older man playing a saxophone on the street{"reference_path": "cache/crop/1-1/an older man playing a saxophone on the street.jpg", "mask_strategy": "0"}
-an older man jogging by the water{"reference_path": "cache/crop/1-1/an older man jogging by the water.jpg", "mask_strategy": "0"}
-a person riding a skateboard on a concrete floor{"reference_path": "cache/crop/1-1/a person riding a skateboard on a concrete floor.jpg", "mask_strategy": "0"}
-a woman with long black hair is posing for a picture{"reference_path": "cache/crop/1-1/a woman with long black hair is posing for a picture.jpg", "mask_strategy": "0"}
-a woman sitting on the ground in front of a guitar{"reference_path": "cache/crop/1-1/a woman sitting on the ground in front of a guitar.jpg", "mask_strategy": "0"}
-a little girl wearing a purple helmet riding a blue bike{"reference_path": "cache/crop/1-1/a little girl wearing a purple helmet riding a blue bike.jpg", "mask_strategy": "0"}
-a young boy is jumping in the mud{"reference_path": "cache/crop/1-1/a young boy is jumping in the mud.jpg", "mask_strategy": "0"}
-a man sitting in the driver's seat of a car wearing sunglasses{"reference_path": "cache/crop/1-1/a man sitting in the driver's seat of a car wearing sunglasses.jpg", "mask_strategy": "0"}
-a little boy jumping in the air over a puddle of water{"reference_path": "cache/crop/1-1/a little boy jumping in the air over a puddle of water.jpg", "mask_strategy": "0"}
-a woman with afro hair is smiling while wearing earphones{"reference_path": "cache/crop/1-1/a woman with afro hair is smiling while wearing earphones.jpg", "mask_strategy": "0"}
-a smiling woman with her hands clasped{"reference_path": "cache/crop/1-1/a smiling woman with her hands clasped.jpg", "mask_strategy": "0"}
-a young boy standing in a field with horses in the background{"reference_path": "cache/crop/1-1/a young boy standing in a field with horses in the background.jpg", "mask_strategy": "0"}
-a young man is covered in colored powder{"reference_path": "cache/crop/1-1/a young man is covered in colored powder.jpg", "mask_strategy": "0"}
-a woman with curly hair is drinking a beer{"reference_path": "cache/crop/1-1/a woman with curly hair is drinking a beer.jpg", "mask_strategy": "0"}
-an old man standing in the middle of a field holding a bunch of plants{"reference_path": "cache/crop/1-1/an old man standing in the middle of a field holding a bunch of plants.jpg", "mask_strategy": "0"}
-a man standing on a boat with a net{"reference_path": "cache/crop/1-1/a man standing on a boat with a net.jpg", "mask_strategy": "0"}
-a woman in a hat is putting salt into a basket{"reference_path": "cache/crop/1-1/a woman in a hat is putting salt into a basket.jpg", "mask_strategy": "0"}
-a young girl smelling a pink flower{"reference_path": "cache/crop/1-1/a young girl smelling a pink flower.jpg", "mask_strategy": "0"}
-a young boy leaning on a wooden pole{"reference_path": "cache/crop/1-1/a young boy leaning on a wooden pole.jpg", "mask_strategy": "0"}
-a man in a hat sitting in front of a brick oven{"reference_path": "cache/crop/1-1/a man in a hat sitting in front of a brick oven.jpg", "mask_strategy": "0"}
-a man in a mexican outfit holding an acoustic guitar{"reference_path": "cache/crop/1-1/a man in a mexican outfit holding an acoustic guitar.jpg", "mask_strategy": "0"}
-a snowboarder is in the air doing a trick{"reference_path": "cache/crop/1-1/a snowboarder is in the air doing a trick.jpg", "mask_strategy": "0"}
-a man riding a horse with a spear in his hand{"reference_path": "cache/crop/1-1/a man riding a horse with a spear in his hand.jpg", "mask_strategy": "0"}
-a woman carrying a bundle of plants over their head{"reference_path": "cache/crop/1-1/a woman carrying a bundle of plants over their head.jpg", "mask_strategy": "0"}
-a person jumping in the air over a fence{"reference_path": "cache/crop/1-1/a person jumping in the air over a fence.jpg", "mask_strategy": "0"}
-a man on a surfboard riding a wave in the ocean{"reference_path": "cache/crop/1-1/a man on a surfboard riding a wave in the ocean.jpg", "mask_strategy": "0"}
-a man sitting on steps playing an acoustic guitar{"reference_path": "cache/crop/1-1/a man sitting on steps playing an acoustic guitar.jpg", "mask_strategy": "0"}
-a man swinging a tennis racquet at a tennis ball{"reference_path": "cache/crop/1-1/a man swinging a tennis racquet at a tennis ball.jpg", "mask_strategy": "0"}
-a man riding a mountain bike on top of a rocky hill{"reference_path": "cache/crop/1-1/a man riding a mountain bike on top of a rocky hill.jpg", "mask_strategy": "0"}
-a man riding a bike down a street{"reference_path": "cache/crop/1-1/a man riding a bike down a street.jpg", "mask_strategy": "0"}
-a man is running on a dirt road{"reference_path": "cache/crop/1-1/a man is running on a dirt road.jpg", "mask_strategy": "0"}
-A man in a black suit and a sombrero, shouting loudly{"reference_path": "cache/crop/1-1/A man in a black suit and a sombrero, shouting loudly.jpg", "mask_strategy": "0"}
-a man standing on top of a sand dune in the desert{"reference_path": "cache/crop/1-1/a man standing on top of a sand dune in the desert.jpg", "mask_strategy": "0"}
-a person riding a motorcycle down a road{"reference_path": "cache/crop/1-1/a person riding a motorcycle down a road.jpg", "mask_strategy": "0"}
-a man standing on top of a mountain with a backpack{"reference_path": "cache/crop/1-1/a man standing on top of a mountain with a backpack.jpg", "mask_strategy": "0"}
-a man with a skull face paint smoking a cigar and holding a guitar{"reference_path": "cache/crop/1-1/a man with a skull face paint smoking a cigar and holding a guitar.jpg", "mask_strategy": "0"}
-a man in sunglasses laying on a wooden bench{"reference_path": "cache/crop/1-1/a man in sunglasses laying on a wooden bench.jpg", "mask_strategy": "0"}
-an older woman sitting in a room with a cigarette in her hand{"reference_path": "cache/crop/1-1/an older woman sitting in a room with a cigarette in her hand.jpg", "mask_strategy": "0"}
-a man sitting on the ground playing a musical instrument{"reference_path": "cache/crop/1-1/a man sitting on the ground playing a musical instrument.jpg", "mask_strategy": "0"}
-a person riding a horse in a polo match{"reference_path": "cache/crop/1-1/a person riding a horse in a polo match.jpg", "mask_strategy": "0"}
-a woman in a kimono holding an umbrella{"reference_path": "cache/crop/1-1/a woman in a kimono holding an umbrella.jpg", "mask_strategy": "0"}
-a person riding a dirt bike{"reference_path": "cache/crop/1-1/a person riding a dirt bike.jpg", "mask_strategy": "0"}
-a person riding an atv on a dirt track{"reference_path": "cache/crop/1-1/a person riding an atv on a dirt track.jpg", "mask_strategy": "0"}
-a person riding a wave on a surfboard{"reference_path": "cache/crop/1-1/a person riding a wave on a surfboard.jpg", "mask_strategy": "0"}
-a woman in a wetsuit is swimming in the ocean{"reference_path": "cache/crop/1-1/a woman in a wetsuit is swimming in the ocean.jpg", "mask_strategy": "0"}
-a man snorkling in the ocean{"reference_path": "cache/crop/1-1/a man snorkling in the ocean.jpg", "mask_strategy": "0"}
-a beautiful woman in a blue sari posing in front of a wall{"reference_path": "cache/crop/1-1/a beautiful woman in a blue sari posing in front of a wall.jpg", "mask_strategy": "0"}
-a woman wearing a shawl in front of a mountain{"reference_path": "cache/crop/1-1/a woman wearing a shawl in front of a mountain.jpg", "mask_strategy": "0"}
-a woman is making bread in an oven{"reference_path": "cache/crop/1-1/a woman is making bread in an oven.jpg", "mask_strategy": "0"}
-a woman smiles while holding a yellow flower{"reference_path": "cache/crop/1-1/a woman smiles while holding a yellow flower.jpg", "mask_strategy": "0"}
-A young boy is lifting a bundle of dry grass wrapped in waterproof fabric over his head{"reference_path": "cache/crop/1-1/A young boy is lifting a bundle of dry grass wrapped in waterproof fabric over his head.jpg", "mask_strategy": "0"}
-two people performing a sword fight in front of a forest{"reference_path": "cache/crop/1-1/two people performing a sword fight in front of a forest.jpg", "mask_strategy": "0"}
-a woman in a colorful shirt is cooking food{"reference_path": "cache/crop/1-1/a woman in a colorful shirt is cooking food.jpg", "mask_strategy": "0"}
-an older woman is drinking a bottle of water{"reference_path": "cache/crop/1-1/an older woman is drinking a bottle of water.jpg", "mask_strategy": "0"}
-a smiling woman sitting at a table with food and drinks{"reference_path": "cache/crop/1-1/a smiling woman sitting at a table with food and drinks.jpg", "mask_strategy": "0"}
-a woman wearing a hijab reading a book on the beach{"reference_path": "cache/crop/1-1/a woman wearing a hijab reading a book on the beach.jpg", "mask_strategy": "0"}
-a woman wearing a headscarf is reaching for an olive tree{"reference_path": "cache/crop/1-1/a woman wearing a headscarf is reaching for an olive tree.jpg", "mask_strategy": "0"}
-a woman in a white dress jumping in the air in a field of pink flowers{"reference_path": "cache/crop/1-1/a woman in a white dress jumping in the air in a field of pink flowers.jpg", "mask_strategy": "0"}
-a woman wearing a conical hat sits on a boat{"reference_path": "cache/crop/1-1/a woman wearing a conical hat sits on a boat.jpg", "mask_strategy": "0"}
-an older woman sitting in front of an old building{"reference_path": "cache/crop/1-1/an older woman sitting in front of an old building.jpg", "mask_strategy": "0"}
-a woman is praying in front of a buddhist temple{"reference_path": "cache/crop/1-1/a woman is praying in front of a buddhist temple.jpg", "mask_strategy": "0"}
-a woman with green hair smiling for the camera{"reference_path": "cache/crop/1-1/a woman with green hair smiling for the camera.jpg", "mask_strategy": "0"}
-A group of people in a yellow raft is rowing through turbulent waters{"reference_path": "cache/crop/1-1/A group of people in a yellow raft is rowing through turbulent waters.jpg", "mask_strategy": "0"}
-a man carrying a woman on his back in a field{"reference_path": "cache/crop/1-1/a man carrying a woman on his back in a field.jpg", "mask_strategy": "0"}
-an indian police officer talking to an old woman{"reference_path": "cache/crop/1-1/an indian police officer talking to an old woman.jpg", "mask_strategy": "0"}
-two people scuba diving in the ocean{"reference_path": "cache/crop/1-1/two people scuba diving in the ocean.jpg", "mask_strategy": "0"}
-A man and woman dressed as sugar skulls in a field of flowers, sharing a loving gaze with each other{"reference_path": "cache/crop/1-1/A man and woman dressed as sugar skulls in a field of flowers, sharing a loving gaze with each other.jpg", "mask_strategy": "0"}
-a group of people watching a cow race{"reference_path": "cache/crop/1-1/a group of people watching a cow race.jpg", "mask_strategy": "0"}
-a man and a child riding bumper cars in an amusement park{"reference_path": "cache/crop/1-1/a man and a child riding bumper cars in an amusement park.jpg", "mask_strategy": "0"}
-a group of motorcyclists racing on a dirt track{"reference_path": "cache/crop/1-1/a group of motorcyclists racing on a dirt track.jpg", "mask_strategy": "0"}
-a man and a woman are boxing in a boxing ring{"reference_path": "cache/crop/1-1/a man and a woman are boxing in a boxing ring.jpg", "mask_strategy": "0"}
-a man holding a baby in his arms{"reference_path": "cache/crop/1-1/a man holding a baby in his arms.jpg", "mask_strategy": "0"}
-a man and a woman sitting on a bench playing instruments{"reference_path": "cache/crop/1-1/a man and a woman sitting on a bench playing instruments.jpg", "mask_strategy": "0"}
-two men are standing next to each other with a bicycle{"reference_path": "cache/crop/1-1/two men are standing next to each other with a bicycle.jpg", "mask_strategy": "0"}
-a man and a boy sitting on a beach near the ocean{"reference_path": "cache/crop/1-1/a man and a boy sitting on a beach near the ocean.jpg", "mask_strategy": "0"}
-two men in white clothing standing next to each other{"reference_path": "cache/crop/1-1/two men in white clothing standing next to each other.jpg", "mask_strategy": "0"}
-a group of men riding horses in a dusty arena{"reference_path": "cache/crop/1-1/a group of men riding horses in a dusty arena.jpg", "mask_strategy": "0"}
-a soccer player in a yellow and black shirt is chasing a soccer ball{"reference_path": "cache/crop/1-1/a soccer player in a yellow and black shirt is chasing a soccer ball.jpg", "mask_strategy": "0"}
-a group of women sitting on the steps of a building{"reference_path": "cache/crop/1-1/a group of women sitting on the steps of a building.jpg", "mask_strategy": "0"}
-a group of people gathered around a red checkered blanket{"reference_path": "cache/crop/1-1/a group of people gathered around a red checkered blanket.jpg", "mask_strategy": "0"}
-a group of people in orange jumpsuits running along a river{"reference_path": "cache/crop/1-1/a group of people in orange jumpsuits running along a river.jpg", "mask_strategy": "0"}
-a woman walking down a sidewalk with a bag{"reference_path": "cache/crop/1-1/a woman walking down a sidewalk with a bag.jpg", "mask_strategy": "0"}
-a busy street with cars and people on motorcycles{"reference_path": "cache/crop/1-1/a busy street with cars and people on motorcycles.jpg", "mask_strategy": "0"}
-a man in a mask is walking through a crowd of people{"reference_path": "cache/crop/1-1/a man in a mask is walking through a crowd of people.jpg", "mask_strategy": "0"}
-a man and a woman walking under an umbrella next to a brick wall{"reference_path": "cache/crop/1-1/a man and a woman walking under an umbrella next to a brick wall.jpg", "mask_strategy": "0"}
-a group of people riding bikes down a street{"reference_path": "cache/crop/1-1/a group of people riding bikes down a street.jpg", "mask_strategy": "0"}
-An old person is holding a cup on the street, and people around are curiously looking at him{"reference_path": "cache/crop/1-1/An old person is holding a cup on the street, and people around are curiously looking at him.jpg", "mask_strategy": "0"}
-two young girls playing with leaves in the woods{"reference_path": "cache/crop/1-1/two young girls playing with leaves in the woods.jpg", "mask_strategy": "0"}
-One person is riding on the back of a horse led by another person{"reference_path": "cache/crop/1-1/One person is riding on the back of a horse led by another person.jpg", "mask_strategy": "0"}
-an older woman and a young girl are knitting together{"reference_path": "cache/crop/1-1/an older woman and a young girl are knitting together.jpg", "mask_strategy": "0"}
-three geishas walking down the street in traditional clothing{"reference_path": "cache/crop/1-1/three geishas walking down the street in traditional clothing.jpg", "mask_strategy": "0"}
-two men riding bikes down a road near a forest{"reference_path": "cache/crop/1-1/two men riding bikes down a road near a forest.jpg", "mask_strategy": "0"}
-two women carrying bowls on their heads{"reference_path": "cache/crop/1-1/two women carrying bowls on their heads.jpg", "mask_strategy": "0"}
-two women eating pizza at a restaurant{"reference_path": "cache/crop/1-1/two women eating pizza at a restaurant.jpg", "mask_strategy": "0"}
-two young women studying in a library{"reference_path": "cache/crop/1-1/two young women studying in a library.jpg", "mask_strategy": "0"}
-pink water lilies in a pond with leaves{"reference_path": "cache/crop/1-1/pink water lilies in a pond with leaves.jpg", "mask_strategy": "0"}
-a group of succulents in a rock garden{"reference_path": "cache/crop/1-1/a group of succulents in a rock garden.jpg", "mask_strategy": "0"}
-a close up view of a bunch of snowdrop flowers{"reference_path": "cache/crop/1-1/a close up view of a bunch of snowdrop flowers.jpg", "mask_strategy": "0"}
-a close up of leaves with water droplets on them{"reference_path": "cache/crop/1-1/a close up of leaves with water droplets on them.jpg", "mask_strategy": "0"}
-a close-up of a sea anemone in the water{"reference_path": "cache/crop/1-1/a close-up of a sea anemone in the water.jpg", "mask_strategy": "0"}
-a plant with water droplets on it{"reference_path": "cache/crop/1-1/a plant with water droplets on it.jpg", "mask_strategy": "0"}
-a group of cactus plants in the desert{"reference_path": "cache/crop/1-1/a group of cactus plants in the desert.jpg", "mask_strategy": "0"}
-a close-up view of a plant with spiky leaves{"reference_path": "cache/crop/1-1/a close-up view of a plant with spiky leaves.jpg", "mask_strategy": "0"}
-A budding and blossoming flower bud seedling{"reference_path": "cache/crop/1-1/A budding and blossoming flower bud seedling.jpg", "mask_strategy": "0"}
-a field of orange flowers near the ocean'{"reference_path": "cache/crop/1-1/a field of orange flowers near the ocean'.jpg", "mask_strategy": "0"}
-a close-up view of a bunch of pink flowers{"reference_path": "cache/crop/1-1/a close-up view of a bunch of pink flowers.jpg", "mask_strategy": "0"}
-pink water lilies in a pond{"reference_path": "cache/crop/1-1/pink water lilies in a pond.jpg", "mask_strategy": "0"}
-reeds blowing in the wind against a cloudy sky{"reference_path": "cache/crop/1-1/reeds blowing in the wind against a cloudy sky.jpg", "mask_strategy": "0"}
-two tall cacti in the middle of the desert{"reference_path": "cache/crop/1-1/two tall cacti in the middle of the desert.jpg", "mask_strategy": "0"}
-a sea anemone on a coral reef{"reference_path": "cache/crop/1-1/a sea anemone on a coral reef.jpg", "mask_strategy": "0"}
-a dandelion blowing in the wind{"reference_path": "cache/crop/1-1/a dandelion blowing in the wind.jpg", "mask_strategy": "0"}
-A boiling pot cooking vegetables{"reference_path": "cache/crop/1-1/A boiling pot cooking vegetables.jpg", "mask_strategy": "0"}
-a woman stirring food in a pan on the stove{"reference_path": "cache/crop/1-1/a woman stirring food in a pan on the stove.jpg", "mask_strategy": "0"}
-two eggs are fried in a frying pan on the stove{"reference_path": "cache/crop/1-1/two eggs are fried in a frying pan on the stove.jpg", "mask_strategy": "0"}
-fried onion rings in a basket{"reference_path": "cache/crop/1-1/fried onion rings in a basket.jpg", "mask_strategy": "0"}
-a pot is sitting on top of a campfire{"reference_path": "cache/crop/1-1/a pot is sitting on top of a campfire.jpg", "mask_strategy": "0"}
-a chef is preparing a dish with mushrooms on a wooden board{"reference_path": "cache/crop/1-1/a chef is preparing a dish with mushrooms on a wooden board.jpg", "mask_strategy": "0"}
-a hand holding a slice of pizza{"reference_path": "cache/crop/1-1/a hand holding a slice of pizza.jpg", "mask_strategy": "0"}
-A person is using tongs to pick up meat from a plate{"reference_path": "cache/crop/1-1/A person is using tongs to pick up meat from a plate.jpg", "mask_strategy": "0"}
-The meat is picked up from the grill with tongs{"reference_path": "cache/crop/1-1/The meat is picked up from the grill with tongs.jpg", "mask_strategy": "0"}
-A person is whisking eggs, and the egg whites and yolks are gently streaming out{"reference_path": "cache/crop/1-1/A person is whisking eggs, and the egg whites and yolks are gently streaming out.jpg", "mask_strategy": "0"}
-a person is putting sauce on a burger{"reference_path": "cache/crop/1-1/a person is putting sauce on a burger.jpg", "mask_strategy": "0"}
-A person is making dumplings{"reference_path": "cache/crop/1-1/A person is making dumplings.jpg", "mask_strategy": "0"}
-a pan filled with fried food{"reference_path": "cache/crop/1-1/a pan filled with fried food.jpg", "mask_strategy": "0"}
-Chopsticks are slowly picking up the buns from the plastic container{"reference_path": "cache/crop/1-1/Chopsticks are slowly picking up the buns from the plastic container.jpg", "mask_strategy": "0"}
-a basket of french fries in a fryer{"reference_path": "cache/crop/1-1/a basket of french fries in a fryer.jpg", "mask_strategy": "0"}
-a table with lobsters and drinks on it{"reference_path": "cache/crop/1-1/a table with lobsters and drinks on it.jpg", "mask_strategy": "0"}
-a person pouring coffee into a pot on a stove{"reference_path": "cache/crop/1-1/a person pouring coffee into a pot on a stove.jpg", "mask_strategy": "0"}
-a kettle is sitting on top of a campfire{"reference_path": "cache/crop/1-1/a kettle is sitting on top of a campfire.jpg", "mask_strategy": "0"}
-Chopsticks are picking up noodles from the bowl{"reference_path": "cache/crop/1-1/Chopsticks are picking up noodles from the bowl.jpg", "mask_strategy": "0"}
-a person is cooking eggs on an outdoor grill{"reference_path": "cache/crop/1-1/a person is cooking eggs on an outdoor grill.jpg", "mask_strategy": "0"}
-a person is cooking food in a wok on a stove{"reference_path": "cache/crop/1-1/a person is cooking food in a wok on a stove.jpg", "mask_strategy": "0"}
-a person is holding up a burger with his hands{"reference_path": "cache/crop/1-1/a person is holding up a burger with his hands.jpg", "mask_strategy": "0"}
-A person is pouring water into a teacup{"reference_path": "cache/crop/1-1/A person is pouring water into a teacup.jpg", "mask_strategy": "0"}
-a person pouring seasoning into a pot of food{"reference_path": "cache/crop/1-1/a person pouring seasoning into a pot of food.jpg", "mask_strategy": "0"}
-a person holding a taco in their hand{"reference_path": "cache/crop/1-1/a person holding a taco in their hand.jpg", "mask_strategy": "0"}
-a person slicing salmon on a cutting board{"reference_path": "cache/crop/1-1/a person slicing salmon on a cutting board.jpg", "mask_strategy": "0"}
-a bunch of food is cooking on a grill over an open fire{"reference_path": "cache/crop/1-1/a bunch of food is cooking on a grill over an open fire.jpg", "mask_strategy": "0"}
-a close up of a piece of sushi on chopsticks{"reference_path": "cache/crop/1-1/a close up of a piece of sushi on chopsticks.jpg", "mask_strategy": "0"}
-a group of pots on a stove with flames in the background{"reference_path": "cache/crop/1-1/a group of pots on a stove with flames in the background.jpg", "mask_strategy": "0"}
-a person cooking vegetables in a pan on a stove{"reference_path": "cache/crop/1-1/a person cooking vegetables in a pan on a stove.jpg", "mask_strategy": "0"}
-a large pot of soup filled with vegetables and meat{"reference_path": "cache/crop/1-1/a large pot of soup filled with vegetables and meat.jpg", "mask_strategy": "0"}
-a person holding chopsticks over a bowl of food{"reference_path": "cache/crop/1-1/a person holding chopsticks over a bowl of food.jpg", "mask_strategy": "0"}
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/animal.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/animal.txt
deleted file mode 100644
index 775f4c4d6f5190ce860ab61358a3896e7762f104..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/animal.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-a black dog wearing halloween costume
-spider making a web
-bat eating fruits while hanging
-a snake crawling on a wooden flooring
-a close up video of a dragonfly
-macro shot of ladybug on green leaf plant
-chameleon eating ant
-a bee feeding on nectars
-bird nests on a tree captured with moving camera
-a squirrel eating nuts
-close up video of snail
-top view of a hermit crab crawling on a wooden surface
-cat licking another cat
-red dragonfly perched on green leaf
-close up view of a brown caterpillar crawling on green leaf
-ants eating dead spider
-an eagle on a tree branch
-a frog eating an ant
-white rabbit near the fence
-a gorilla eating a carrot
-close up of wolf
-a meerkat looking around
-a hyena in a zoo
-lemur eating grass leaves
-an owl being trained by a man
-a lizard on a bamboo
-brown chicken hunting for its food
-video of parrots perched on bird stand
-underwater footage of an octopus in a coral reef
-a cute pomeranian dog playing with a soccer ball
-white fox on rock
-close up footage of a horse figurine
-giraffe feeding on a tree in a savannah
-curious cat sitting and looking around
-hummingbird hawk moth flying near pink flowers
-close up of a scorpion on a rock
-close up on fish in net
-koala eating leaves from a branch
-a pod of dolphins swirling in the sea catching forage fish
-low angle view of a hawk perched on a tree branch
-a lion standing on wild grass
-deer grazing in the field
-elephant herd in a savanna
-close up on lobster under water
-hedgehog crossing road in forest
-a sheep eating yellow flowers from behind a wire fence
-twin sisters and a turtle
-a pig wallowing in mud
-flock of goose eating on the lake water
-cow in a field irritated with flies
-a close up shot of a fly
-cheetah lying on the grass
-close up of a lemur
-close up shot of a kangaroo itching in the sand
-a tortoise covered with algae
-turkey in cage
-a great blue heron bird in the lakeside
-crab with shell in aquarium
-a seagull walking on shore
-an american crocodile
-a tiger walking inside a cage
-alligator in the nature
-a raccoon climbing a tree
-wild rabbit in a green meadow
-group of ring tailed lemurs
-a clouded leopard on a tree branch
-duck grooming its feathers
-an african penguin walking on a beach
-a video of a peacock
-close up shot of a wild bear
-baby rhino plays with mom
-porcupine climbs tree branches
-close up of a natterjack toad on a rock
-a sleeping orangutan
-mother whale swimming with babies
-a bear wearing red jersey
-pink jellyfish swimming underwater in a blue sea
-beautiful clown fish swimming
-animation of disposable objects shaped as a whale
-paper cut out of a pair of hands a whale and a heart
-vertical video of camel roaming in the field during daytime
-a still video of mosquito biting human
-a curious sloth hanging from a tree branch
-a plastic flamingo bird stumbles from the wind
-a wolf in its natural habitat
-a monkey sitting in the stone and scratching his head
-bat hanging upside down
-a red panda eating leaves
-snake on ground
-a harbour seal swimming near the shore
-shark swimming in the sea
-otter on branch while eating
-goat standing over a rock
-a troop of monkey on top of a mountain
-a zebra eating grass on the field
-a colorful butterfly perching on a bud
-a snail crawling on a leaf
-zookeeper showering a baby elephant
-a beetle emerging from the sand
-a nine banded armadillo searching for food
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/architecture.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/architecture.txt
deleted file mode 100644
index 599b76baafda9b85e6a13cb56978522fdecfe2f2..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/architecture.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-an apartment building with balcony
-asian garden and medieval castle
-illuminated tower in berlin
-a wooden house overseeing the lake
-a crowd of people in a plaza in front of a government building
-a church interior
-jewish friends posing with hanukkah menorah in a cabin house
-a destroyed building after a missile attack in ukraine
-abandoned building in the woods
-drone video of an abandoned school building in pripyat ukraine
-elegant university building
-architecture and designs of buildings in central london
-a pancake tower with chocolate syrup and strawberries on top
-an ancient white building
-friends hanging out at a coffee house
-house front door with christmas decorations
-city night dark building
-a bird house hanging on a tree branch
-sacred sculpture in a temple
-high angle shot of a clock tower
-modern wooden house interior
-the interior of an abandoned building
-opera house overlooking sea
-a concrete structure near the green trees
-dome like building in scotland
-low angle shot of a building
-tower on hill
-a miniature house
-eiffel tower from the seine river
-low angle footage of an apartment building
-island with pier and antique building
-asian historic architecture
-drone footage of a beautiful mansion
-mosque in the middle east
-building a tent and hammock in the forest camping site
-top view of a high rise building
-house covered in snow
-skyscraper at night
-house in village
-a casino with people outside the building
-silhouette of a building
-a woman climbing a tree house
-drone view of house near lake during golden hour
-an under construction concrete house
-a watch tower by the sea
-exterior view of arabic style building
-video of a hotel building
-red paper lantern decorations hanging outside a building
-house on seashore
-aerial footage of the palace of culture and science building in warsaw poland
-aerial video of stuttgart tv tower in germany
-aerial view of the highway and building in a city
-drone shot of a skyscraper san francisco california usa
-waterfall and house
-view of the sky through a building
-drone footage of a house on top of the mountain
-abandoned house in the nature
-clouds hovering over a mansion
-light house on the ocean
-buddhist temple at sunrise
-people walking by a graveyard near a mosque at sunset
-view of lifeguard tower on the beach
-scenic view of a house in the mountains
-the landscape in front of a government building
-aerial footage of a building and its surrounding landscape in winter
-time lapse of a cloudy sky behind a transmission tower
-blue ocean near the brown castle
-fog over temple
-house in countryside top view
-building under construction
-turkish flag waving on old tower
-the georgian building
-close up shot of a steel structure
-the atrium and interior design of a multi floor building
-city view reflected on a glass building
-aerial view of a luxurious house with pool
-an unpaved road leading to the house
-drone footage of a lookout tower in mountain landscape
-wind turbines on hill behind building
-time lapse footage of the sun light in front of a small house porch
-a building built with lots of stairways
-overcast over house on seashore
-the view of the sydney opera house from the other side of the harbor
-candle on a jar and a house figurine on a surface
-video of a farm and house
-a dilapidated building made of bricks
-a view of a unique building from a moving vehicle
-aerial footage of a tall building in cambodia
-push in shot of a huge house
-a beach house built over a seawall protected from the sea waves
-exotic house surrounded by trees
-drone video of a house surrounded by tropical vegetation
-drone footage of a building beside a pond
-observation tower on hill in forest
-a tree house in the woods
-a video of vessel structure during daytime
-fire in front of illuminated building at night
-a footage of a wooden house on a wheat field
-tilt shot of a solar panel below a light tower
-water tower on the desert
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/food.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/food.txt
deleted file mode 100644
index 032aed96d61b209fa9e58befe3bf5e4afe9dd20a..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/food.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-freshly baked finger looking cookies
-video of fake blood in wine glass
-halloween food art
-a person slicing a vegetable
-a serving of pumpkin dish in a plate
-close up view of green leafy vegetable
-a birthday cake in the plate
-video of a slice papaya fruit
-a muffin with a burning candle and a love sign by a ceramic mug
-a jack o lantern designed cookie
-baked bread with chocolate
-a broccoli soup on wooden table
-a freshly brewed coffee on a pink mug
-grabbing sourdough neapolitan style pizza slices
-person cooking mushrooms in frying pan
-rice grains placed on a reusable cloth bag
-slices of kiwi fruit
-grilling a steak on a pan grill
-close up of bread popping out of a toaster
-man eating noodle
-preparing a cocktail drink
-close up pasta with bacon on plate
-milk and cinnamon rolls
-boy getting a dumpling using chopsticks
-a mother preparing food with her kids
-man using his phone while eating
-fresh salmon salad on a plate
-cutting cucumbers into long thin slices as ingredient for sushi roll
-a steaming cup of tea by the window
-a glass filled with beer
-a kid eating popcorn while watching tv
-close up shot of fried fish on the plate
-a man eating a donut
-person making a vegetarian dish
-spreading cheese on bagel
-close up view of a man drinking red wine
-a couple having breakfast in a restaurant
-a student eating her sandwich
-girl peeling a banana
-red rice in a small bowl
-pancake with blueberry on the top
-green apple fruit on white wooden table
-a man eating a taco by the bar
-making of a burrito
-squeezing lemon into salad
-a chef cutting sushi rolls
-video of a delicious dessert
-deep frying a crab on a wok in high fire
-close up video of a orange juice
-video of a cooked chicken breast
-woman holding a pineapple
-a woman eating a bar of chocolate
-decorating christmas cookie
-squeezing a slice of fruit
-tuna sashimi on a plate
-a strawberry fruit mixed in an alcoholic drink
-preparing hot dogs in a grill
-a woman cutting a tomato
-an orange fruit cut in half
-a coconut fruit with drinking straw
-woman holding a dragon fruit
-a woman pouring hot beverage on a cup
-waffles with whipped cream and fruit
-focus shot of an insect at the bottom of a fruit
-preparing a healthy broccoli dish
-man eating snack at picnic
-close up video of a grilled shrimp skewer
-a woman mixing a smoothie drinks
-close up video of woman having a bite of jelly
-businessman drinking whiskey at the bar counter of a hotel lounge
-cutting an onion with a knife over a wooden chopping board
-fresh lemonade in bottles
-grilling a meat on a charcoal grill
-people enjoying asian cuisine
-close up footage of a hot dish on a clay pot
-pork ribs dish
-waffle with strawberry and syrup for breakfast
-tofu dish with rose garnish
-uncooked pork meat
-egg yolk being dumped over gourmet dish
-tasty brunch dish close up
-little boy pretending to eat the watermelon
-slicing roasted beef
-close up of a chef adding teriyaki sauce to a dish
-flat lay mexican dish
-a person placing an octopus dish on a marble surface
-close up of tea leaves brewing in a glass kettle
-adding fresh herbs to soup dish
-a scoop of roasted coffee beans
-fresh dim sum set up on a bamboo steam tray for cooking
-a girl putting ketchup on food at the kitchen
-cooking on electric stove
-a woman with a slice of a pie
-grapes and wine on a wooden board
-man taking picture of his food
-hamburger and fries on restaurant table
-close up video of japanese food
-a cracker sandwich with cheese filling for snack
-barista preparing matcha tea
-close up of onion rings being deep fried
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/human.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/human.txt
deleted file mode 100644
index 88de93a4815801b42c80d24d282c296aae827dd8..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/human.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-people carving a pumpkin
-people sitting on a sofa
-a man with a muertos face painting
-man walking in the dark
-men in front of their computer editing photos
-men loading christmas tree on tow truck
-woman washing the dishes
-woman adding honey to the cinnamon rolls
-two women kissing and smiling
-three women looking at watercolor paintings
-a family wearing paper bag masks
-a family posing for the camera
-a boy covering a rose flower with a dome glass
-boy sitting on grass petting a dog
-a girl in her tennis sportswear
-a girl coloring the cardboard
-silhouette of the couple during sunset
-couple dancing with body paint
-a child playing with water
-a woman with her child sitting on a couch in the living room
-a group of friend place doing hand gestures of agreement
-friends having a group selfie
-friends talking while on the basketball court
-group of people protesting
-a group of campers with a cute dog
-a group of photographers taking pictures at the north western gardens in llandudno north wales
-a group of students laughing and talking
-a group of martial artist warming up
-a person playing golf
-a person walking on a wet wooden bridge
-person doing a leg exercise
-ice hockey athlete on rink
-a young athlete training in swimming
-chess player dusting a chessboard
-baseball player holding his bat
-a bearded man putting a vinyl record on a vinyl player
-an orchestra finishes a performance
-people applauding the performance of the kids
-band performance at the recording studio
-father and his children playing jenga game
-people playing a board game
-man playing a video game
-a man video recording the movie in theater
-man and a woman eating while watching a movie
-movie crew talking together
-a director explaining the movie scene
-man and woman listening to music on car
-man playing music
-couple dancing slow dance with sun glare
-a ballerina practicing in the dance studio
-father and son holding hands
-father and daughter talking together
-a mother and her kids engaged in a video call
-mother and daughter reading a book together
-a mother teaching her daughter playing a violin
-kid in a halloween costume
-a happy kid playing the ukulele
-a chef slicing a cucumber
-chef wearing his gloves properly
-brother and sister using hammock
-girl applying sunblock to her brother
-a girl pushing the chair while her sister is on the chair
-colleagues talking in office building
-fighter practice kicking
-a woman fighter in her cosplay costume
-an engineer holding blueprints while talking with her colleague
-a young woman looking at vr controllers with her friend
-workmates teasing a colleague in the work
-a male police officer talking on the radio
-teacher holding a marker while talking
-teacher writing on her notebook
-a young student attending her online classes
-a student showing his classmates his wand
-a male vendor selling fruits
-a shirtless male climber
-a sound engineer listening to music
-female talking to a psychiatrist in a therapy session
-young female activist posing with flag
-a man in a hoodie and woman with a red bandana talking to each other and smiling
-a medium close up of women wearing kimonos
-a male interviewer listening to a person talking
-a social worker having a conversation with the foster parents
-a farm worker harvesting onions
-worker packing street food
-worker and client at barber shop
-elderly man lifting kettlebell
-mom assisting son in riding a bicycle
-dad watching her daughter eat
-young guy with vr headset
-pregnant woman exercising with trainer
-a fortune teller talking to a client
-wizard doing a ritual on a woman
-a footage of an actor on a movie scene
-a man holding a best actor trophy
-a singer of a music band
-a young singer performing on stage
-young dancer practicing at home
-seller showing room to a couple
-cab driver talking to passenger
-a policeman talking to the car driver
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/lifestyle.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/lifestyle.txt
deleted file mode 100644
index 78c8be0b7e7ed7dcf10356aad0f254ae8f73301b..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/lifestyle.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-kids celebrating halloween at home
-little boy helping mother in kitchen
-video of a indoor green plant
-a girl arranges a christmas garland hanging by the kitchen cabinet
-candle burning in dark room
-couple having fun and goofing around the bedroom
-girls jumping up and down in the bedroom
-woman and man in pajamas working from home
-a muslim family sitting and talking in the living room
-family enjoying snack time while sitting in the living room
-woman holding an animal puppet and a little girl playing together at the living room
-kids playing in the indoor tent
-young people celebrating new year at the office
-a woman writing on the sticky note in the office
-a woman exercising at home over a yoga mat
-girls preparing easter decorations at home
-dog on floor in room
-turning on a fluorescent light inside a room
-colleagues talking to each other near the office windows
-a woman recording herself while exercising at home
-music room
-different kind of tools kept in a utility room
-sofa beds and other furniture
-a girl finding her brother reading a book in the bedroom
-an elegant ceramic plant pot and hanging plant on indoor
-furniture inside a bedroom
-interior design of the bar section
-living room with party decoration
-firewood burning in dark room
-a young woman playing the ukulele at home
-woman painting at home
-a woman in a locker room
-video of a bathroom interior
-the interior design of a jewish synagogue
-a woman in protective suit disinfecting the kitchen
-modern minimalist home interior
-modern interior design of a coffee shop
-person arranging minimalist furniture
-aerial shot of interior of the warehouse
-a room of a manufacturing facility
-interior of catholic
-interior design of a restaurant
-a female model in a changing room looking herself in mirror
-men walking in the office hallway
-people sitting in a conference room
-the interior design of a shopping mall
-chandeliers in room
-lucerne railway station interior
-a female fencer posing in a foggy room
-a toolbox and a paint roller beside a huge package in a room
-bedroom in hotel
-a woman lying in the operating room
-a chef holding and checking kitchen utensils
-a couple singing in the shower room together
-a woman cleaning mess in the living room
-an empty meeting room with natural light
-person dancing in a dark room
-close up on blood in hospital room
-a couple resting on their home floor
-a young female staff at courier office
-a man entering the gym locker room
-a bored man sitting by the tv at home
-woman dancing in indoor garden
-rubble in the interior of an abandoned house
-indoor farm in a greenhouse
-man doing handstand in indoor garden
-an abandoned indoor swimming pool
-home decorations on top of a cabinet
-graffiti art on the interior walls of an abandoned mansion
-indoor wall climbing activity
-sunlight inside a room
-teenage girl roller skating at indoor rink
-home deco with lighted
-baby in the shower room
-men enjoying office christmas party
-a bedroom with a brick wall
-actors prepping in the dressing room
-kids playing at an indoor playground
-a person sanitizing an office space using smoke machine
-mother and daughter choosing clothes at home
-a woman sitting by the indoor fire pit
-man standing on the corner of the room while looking around
-person assembling furniture
-a family stacking cardboard boxes in a room
-family having fun in the dining room
-person disinfecting a room
-a woman washing strawberries in the kitchen sink
-modern office waiting room
-close up view of a person slicing with a kitchen knife
-boiling coffee on a stove in the kitchen
-modern equipment used in a home studio
-interior of a recording studio
-people working in a call center office
-band performing at a home concert
-a group of people watching a concert in a room
-people packing their furniture
-young employees in office holding a certificate
-a criminal inside a dark room handcuffed in a table
-couple browsing and looking for furniture in the store
-workspace at home
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/plant.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/plant.txt
deleted file mode 100644
index 7d04fc74e26f830191a74cf3b36b9b275b10f55f..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/plant.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-video of a indoor green plant
-close up view of a plant
-close up shot of a burning plant
-plucking leaves from plant
-a plant on gold pot with glass lid
-a branch of a tree and a plant
-a leafless tree
-close up shot of fern leaf
-close up video of strawberry plant
-plant with blooming flowers
-close up video of flower petals
-watering yellow plant
-beautiful flower decoration
-cannabis flower in a jar
-a footage of the tree leaves
-a red leaf plant
-close up view of a white christmas tree
-snow pouring on a tree
-close up shot of white flowers on the tree
-leaves in the trees daytime
-a dead tree lying on a grass field
-tree branches in a flowing river
-purple flowers with leaves
-a coconut tree by the house
-close up on flower in winter
-bamboo leaves backlit by the sun
-close up video of a wet flower
-a man putting a flower in a box
-dropping flower petals on a wooden bowl
-a close up shot of gypsophila flower
-variety of succulent plants on a garden
-variety of trees and plants in a botanical garden
-forest of deciduous trees
-a stack of dried leaves burning in a forest
-tall forest trees on a misty morning
-close up view of dewdrops on a leaf
-close up view of white petaled flower
-removing a pineapple leaf
-a dragonfly perched on a leaf
-butterfly pollinating flower
-person visiting and checking a corn plant
-woman picking beans from a plant
-woman plucking mint leaves
-single tree in the middle of farmland
-a plant on a soil
-drone footage of a tree on farm field
-a tractor harvesting lavender flower
-people putting christmas ornaments on a christmas tree
-jack o lantern hanging on a tree
-tree with halloween decoration
-flower field near the waterfall
-truck carrying the tree logs
-raindrops falling on leaves
-shot of a palm tree swaying with the wind
-squirrels on a tree branch
-person holding a flower
-a fallen tree trunk
-tree with golden leaves
-cherry tree
-wind blows through leaves of the tree in autumn
-a leaf on a glass
-the long trunks of tall trees in the forest
-trees in the forest during sunny day
-close up video of tree bark
-reflection of tree branches
-trunks of many trees in the forest
-tree leaves providing shades from the sun
-leaves swaying in the wind
-low angle shot of baobab tree
-bare trees in forest
-a plant surrounded by fallen leaves
-a couple preparing food and pruning a plant
-a man cutting a tree bark
-oranges on a tree branch
-plant connected on the stones
-video of a sawmill machine cutting tree log
-women drying flower petals
-macro view of an agave plant
-a video of a person tying a plant on a string
-green moss in forest nature
-coconut tree near sea under blue sky
-the canopy of a coconut tree
-a man leaning on a tree at the beach
-a full grown plant on a pot
-candle wax dripping on flower petals
-close up of leaves in autumn
-a woman opening a book with a flower inside
-a man holding leaves looking at the camera
-a shadow of a swaying plant
-a tree and concrete structure under a blue and cloudy sky
-trimming excess leaves on a potted plant
-the changing color of the tree leaves during autumn season
-a gooseberry tree swayed by the wind
-forest trees and a medieval castle at sunset
-woman cut down tree
-an old oak tree in a park across the street from a hotel
-wild flowers growing in a forest ground
-a mossy fountain and green plants in a botanical garden
-mansion with beautiful garden
-ants on a dragon fruit flower
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/scenery.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/scenery.txt
deleted file mode 100644
index 5d77e3fcc8528f0fdbdd9c342209cb2a824e4490..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/scenery.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-scenery of desert landscape
-landscape agriculture farm tractor
-burning slash piles in the forest
-graveyard at sunset
-view of a jack o lantern with pumpkins in a smoky garden
-sun view through a spider web
-view of the sea from an abandoned building
-close up view of a full moon
-close up view of lighted candles
-close up view of swaying white flowers and leaves
-scenery of a relaxing beach
-selective focus video of grass during sunny day
-aerial view of brown dry landscape
-fireworks display in the sky at night
-a bonfire near river
-mountain view
-waterfalls in between mountain
-a picturesque view of nature
-exotic view of a riverfront city
-tall trees in the forest under the clear sky
-snow on branches in forest
-stream in the nature
-an airplane flying above the sea of clouds
-scenic video of sunset
-view of houses with bush fence under a blue and cloudy sky
-scenic view from wooden pathway
-scenic view of a tropical beach
-drone footage of waves crashing on beach shore
-a scenic view of the golden hour at norway
-time lapse video of foggy mountain forest
-brown mountain during fall season
-video of ocean during daytime
-boat sailing in the ocean
-top view of yachts
-beautiful scenery of flowing waterfalls and river
-wild ducks paddling on the lake surface
-a relaxing scenery of beach view under cloudy sky
-natural rock formations on beach under cloudy sky
-a palm tree against blue sky
-video of sailboat on a lake during sunset
-aerial view of snow piles
-time lapse of a sunset sky in the countryside
-aerial footage of a statue
-time lapse video of a farm during sunset
-clouds formation in the sky at sunset
-aerial shot of a village
-drone shot of a beautiful sunrise at the mountains
-time lapse video of foggy morning during sunrise
-sun shining between tree leaves at sunrise
-video of lake during dawn
-vehicles traveling on roadway under cloudy sky
-view of golden domed church
-a monument under the blue sky
-firecrackers in the sky
-view of fruit signage in the farm
-a dark clouds over shadowing the full moon
-view of the amazon river
-a big river swamp in a dense forest
-a blooming cherry blossom tree under a blue sky with white clouds
-a river waterfall cascading down the plunge basin
-flooded landscape with palm trees
-a blurry waterfall background
-waterfall in the mountains
-aerial footage of a city at night
-pond by small waterfall in forest
-aerial view of farmlands at the bay of lake
-rice terraces in the countryside
-a highway built across an agricultural area in the countryside
-gloomy morning in the countryside
-drone shot of an abandoned coliseum on a snowy mountain top
-boat sailing in the middle of ocean
-drone shot of the grass field
-natural landscape of mountain and sea with islets developed into a community
-aerial view of zaporizhia in ukraine
-aerial footage of a herd
-an aerial footage of a red sky
-grass and plants growing in the remains of an abandoned house
-view from hill on city
-aerial view on orthodox church
-aerial view of bay in croatia
-a footage of a frozen river
-overlooking view of a city at daylight
-view outside the cemetery
-clear sky with moon over meadow
-clouds over railway
-aerial footage of moving vehicles on the road at night
-aerial view of town and park
-top view of skyscrapers
-top view of the empire state building in manhattan
-top view of the central park in new york city
-sheep running in a grass field
-clear sky over factory
-smoke and fire in birds eye view
-view of a pathway with snow melting on its side
-ferry under bridge on river near city in malaysia
-mountain slopes covered in green vegetation
-panoramic view of a town surrounded by snow covered mountains
-aerial view of a palace
-top view of vehicles driving on the intersection
-a graveyard by a church in a mountain landscape
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/vehicles.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/vehicles.txt
deleted file mode 100644
index 5fd5fcad2c20d277aa58b6366fa195e48d21e6dc..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_category/vehicles.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-a modern railway station in malaysia use for public transportation
-drone footage of amsterdam metro station
-train arriving at a station
-red vehicle driving on field
-close up view of flashing emergency vehicle lighting
-vehicle with fertilizer on field
-a highway built across an agricultural area in the countryside
-drone footage of motorcycles driving on country road between agricultural fields
-a road in the woods under fog
-footage of a car driving through a wheat field
-vehicle stops for an ambulance passing through city traffic
-emergency vehicle parked outside the casino
-zombies attacking a woman and a boy inside a car
-woman seating inside the car while chewing
-video of passengers riding a double decker bus during night
-traffic in london street at night
-elderly couple checking engine of automobile
-a green vintage automobile with an open hood parked in a parking area
-close up of a prototype automobile with exposed engine on the back seat of the car
-aerial view of road in forest
-train departing from station
-aerial view of a train passing by a bridge
-video of a train tracks
-video footage of a subway
-video of blinking traffic lights
-couple walking out on the subway
-time lapse of a subway tunnel
-monitor board inside the subway
-metro train at night
-zoom in video of a tram passing by city
-young man using laptop in the tram
-man reading a book at bus stop
-close up shot of a moving taxi
-night travel in london street on a public bus
-red bus in a rainy city
-flow of traffic in the city
-close up shot of a yellow taxi turning left
-two women calling for a taxi
-drone view of an illuminated bridge across a river
-policeman in police car talking on radio
-airplane taking off at night
-view through window in airplane
-an airplane in the sky
-helicopter landing on the street
-a pilot getting out of a helicopter
-a helicopter flying under blue sky
-boat sailing in the middle of the ocean
-girl playing with a toy boat
-silhouette of a boat on sea during golden hour
-a boat travelling around the lake
-road on mountain ridge
-ship sailing on danube river
-slow motion video of a ship water trail in the sea
-drone footage of a wreck ship on shore
-a white yacht traveling on a river and passing under the bridge
-female teenagers drinking champagne in the yacht
-video of yacht sailing in the ocean
-red combine harvester on road on field
-a woman sitting on a bicycle while using a mobile phone
-a woman sitting on a motorcycle looking around
-three teenagers fixing a bicycle
-a woman in a halloween costume posing on a motorcycle
-a parked motorcycle on a foggy roadside
-cable car near sea shore
-a truck travelling in the road
-footage of the road without any traffic
-a road sign
-love padlocks on a bridge
-camera moving at highway construction site
-vehicles driving on highway
-a motorbike on highway at timelapse mode
-point of view of a car driving through a tunnel
-time lapse of heavy traffic on an avenue
-ferry boat on city canal
-black vintage car in museum
-a zigzag road across a forest
-people crossing the road
-video of a kayak boat in a river
-a person paddling a wooden boat in a lake
-a car charging in the parking area
-cars parked on the road
-footage of the street with people and vehicle passing by in the rain
-traffic on busy city street
-a woman getting out of the car to walk with their dog
-yacht sailing through the ocean
-people in queue to military ship
-man wearing motorcycle helmet looking at the camera
-empty seats in the bus
-empty boat on the water
-cargo train traveling on the mountainside
-cruise ship in harbor
-counting down at traffic lights
-pressing the car ignition
-fire truck driving on the road
-a footage of a broken bicycle
-drone footage of an ambulance on the road
-slow motion footage of a racing car
-ship sailing on sea against sunset
-big cargo ship passing on the shore
-back view of man and woman walking on unpaved road
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/appearance_style.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/appearance_style.txt
deleted file mode 100644
index 4382de9e79fd19e9b1ca18497efb2e1cc9e4ca91..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/appearance_style.txt
+++ /dev/null
@@ -1,90 +0,0 @@
-A beautiful coastal beach in spring, waves lapping on sand, Van Gogh style
-A beautiful coastal beach in spring, waves lapping on sand, oil painting
-A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo
-A beautiful coastal beach in spring, waves lapping on sand, black and white
-A beautiful coastal beach in spring, waves lapping on sand, pixel art
-A beautiful coastal beach in spring, waves lapping on sand, in cyberpunk style
-A beautiful coastal beach in spring, waves lapping on sand, animated style
-A beautiful coastal beach in spring, waves lapping on sand, watercolor painting
-A beautiful coastal beach in spring, waves lapping on sand, surrealism style
-The bund Shanghai, Van Gogh style
-The bund Shanghai, oil painting
-The bund Shanghai by Hokusai, in the style of Ukiyo
-The bund Shanghai, black and white
-The bund Shanghai, pixel art
-The bund Shanghai, in cyberpunk style
-The bund Shanghai, animated style
-The bund Shanghai, watercolor painting
-The bund Shanghai, surrealism style
-a shark is swimming in the ocean, Van Gogh style
-a shark is swimming in the ocean, oil painting
-a shark is swimming in the ocean by Hokusai, in the style of Ukiyo
-a shark is swimming in the ocean, black and white
-a shark is swimming in the ocean, pixel art
-a shark is swimming in the ocean, in cyberpunk style
-a shark is swimming in the ocean, animated style
-a shark is swimming in the ocean, watercolor painting
-a shark is swimming in the ocean, surrealism style
-A panda drinking coffee in a cafe in Paris, Van Gogh style
-A panda drinking coffee in a cafe in Paris, oil painting
-A panda drinking coffee in a cafe in Paris by Hokusai, in the style of Ukiyo
-A panda drinking coffee in a cafe in Paris, black and white
-A panda drinking coffee in a cafe in Paris, pixel art
-A panda drinking coffee in a cafe in Paris, in cyberpunk style
-A panda drinking coffee in a cafe in Paris, animated style
-A panda drinking coffee in a cafe in Paris, watercolor painting
-A panda drinking coffee in a cafe in Paris, surrealism style
-A cute happy Corgi playing in park, sunset, Van Gogh style
-A cute happy Corgi playing in park, sunset, oil painting
-A cute happy Corgi playing in park, sunset by Hokusai, in the style of Ukiyo
-A cute happy Corgi playing in park, sunset, black and white
-A cute happy Corgi playing in park, sunset, pixel art
-A cute happy Corgi playing in park, sunset, in cyberpunk style
-A cute happy Corgi playing in park, sunset, animated style
-A cute happy Corgi playing in park, sunset, watercolor painting
-A cute happy Corgi playing in park, sunset, surrealism style
-Gwen Stacy reading a book, Van Gogh style
-Gwen Stacy reading a book, oil painting
-Gwen Stacy reading a book by Hokusai, in the style of Ukiyo
-Gwen Stacy reading a book, black and white
-Gwen Stacy reading a book, pixel art
-Gwen Stacy reading a book, in cyberpunk style
-Gwen Stacy reading a book, animated style
-Gwen Stacy reading a book, watercolor painting
-Gwen Stacy reading a book, surrealism style
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, Van Gogh style
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, oil painting
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Hokusai, in the style of Ukiyo
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, black and white
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pixel art
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in cyberpunk style
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, animated style
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, watercolor painting
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, surrealism style
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, Van Gogh style
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, oil painting
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas by Hokusai, in the style of Ukiyo
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, black and white
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pixel art
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in cyberpunk style
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, animated style
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, watercolor painting
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, surrealism style
-An astronaut flying in space, Van Gogh style
-An astronaut flying in space, oil painting
-An astronaut flying in space by Hokusai, in the style of Ukiyo
-An astronaut flying in space, black and white
-An astronaut flying in space, pixel art
-An astronaut flying in space, in cyberpunk style
-An astronaut flying in space, animated style
-An astronaut flying in space, watercolor painting
-An astronaut flying in space, surrealism style
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, Van Gogh style
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, oil painting
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks by Hokusai, in the style of Ukiyo
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, black and white
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pixel art
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in cyberpunk style
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, animated style
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, watercolor painting
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, surrealism style
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/color.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/color.txt
deleted file mode 100644
index 42e135adfd64950df6c3111d4587b08b02727cbe..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/color.txt
+++ /dev/null
@@ -1,85 +0,0 @@
-a red bicycle
-a green bicycle
-a blue bicycle
-a yellow bicycle
-an orange bicycle
-a purple bicycle
-a pink bicycle
-a black bicycle
-a white bicycle
-a red car
-a green car
-a blue car
-a yellow car
-an orange car
-a purple car
-a pink car
-a black car
-a white car
-a red bird
-a green bird
-a blue bird
-a yellow bird
-an orange bird
-a purple bird
-a pink bird
-a black bird
-a white bird
-a black cat
-a white cat
-an orange cat
-a yellow cat
-a red umbrella
-a green umbrella
-a blue umbrella
-a yellow umbrella
-an orange umbrella
-a purple umbrella
-a pink umbrella
-a black umbrella
-a white umbrella
-a red suitcase
-a green suitcase
-a blue suitcase
-a yellow suitcase
-an orange suitcase
-a purple suitcase
-a pink suitcase
-a black suitcase
-a white suitcase
-a red bowl
-a green bowl
-a blue bowl
-a yellow bowl
-an orange bowl
-a purple bowl
-a pink bowl
-a black bowl
-a white bowl
-a red chair
-a green chair
-a blue chair
-a yellow chair
-an orange chair
-a purple chair
-a pink chair
-a black chair
-a white chair
-a red clock
-a green clock
-a blue clock
-a yellow clock
-an orange clock
-a purple clock
-a pink clock
-a black clock
-a white clock
-a red vase
-a green vase
-a blue vase
-a yellow vase
-an orange vase
-a purple vase
-a pink vase
-a black vase
-a white vase
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/human_action.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/human_action.txt
deleted file mode 100644
index 2abd9a25a447e296d999df0d24920acd441f6a4d..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/human_action.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-A person is riding a bike
-A person is marching
-A person is roller skating
-A person is tasting beer
-A person is clapping
-A person is drawing
-A person is petting animal (not cat)
-A person is eating watermelon
-A person is playing harp
-A person is wrestling
-A person is riding scooter
-A person is sweeping floor
-A person is skateboarding
-A person is dunking basketball
-A person is playing flute
-A person is stretching leg
-A person is tying tie
-A person is skydiving
-A person is shooting goal (soccer)
-A person is playing piano
-A person is finger snapping
-A person is canoeing or kayaking
-A person is laughing
-A person is digging
-A person is clay pottery making
-A person is shooting basketball
-A person is bending back
-A person is shaking hands
-A person is bandaging
-A person is push up
-A person is catching or throwing frisbee
-A person is playing trumpet
-A person is flying kite
-A person is filling eyebrows
-A person is shuffling cards
-A person is folding clothes
-A person is smoking
-A person is tai chi
-A person is squat
-A person is playing controller
-A person is throwing axe
-A person is giving or receiving award
-A person is air drumming
-A person is taking a shower
-A person is planting trees
-A person is sharpening knives
-A person is robot dancing
-A person is rock climbing
-A person is hula hooping
-A person is writing
-A person is bungee jumping
-A person is pushing cart
-A person is cleaning windows
-A person is cutting watermelon
-A person is cheerleading
-A person is washing hands
-A person is ironing
-A person is cutting nails
-A person is hugging
-A person is trimming or shaving beard
-A person is jogging
-A person is making bed
-A person is washing dishes
-A person is grooming dog
-A person is doing laundry
-A person is knitting
-A person is reading book
-A person is baby waking up
-A person is massaging legs
-A person is brushing teeth
-A person is crawling baby
-A person is motorcycling
-A person is driving car
-A person is sticking tongue out
-A person is shaking head
-A person is sword fighting
-A person is doing aerobics
-A person is strumming guitar
-A person is riding or walking with horse
-A person is archery
-A person is catching or throwing baseball
-A person is playing chess
-A person is rock scissors paper
-A person is using computer
-A person is arranging flowers
-A person is bending metal
-A person is ice skating
-A person is climbing a rope
-A person is crying
-A person is dancing ballet
-A person is getting a haircut
-A person is running on treadmill
-A person is kissing
-A person is counting money
-A person is barbequing
-A person is peeling apples
-A person is milking cow
-A person is shining shoes
-A person is making snowman
-A person is sailing
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/multiple_objects.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/multiple_objects.txt
deleted file mode 100644
index 1a1211eeff1021efc49cab7e4c8e91867862949e..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/multiple_objects.txt
+++ /dev/null
@@ -1,82 +0,0 @@
-a bird and a cat
-a cat and a dog
-a dog and a horse
-a horse and a sheep
-a sheep and a cow
-a cow and an elephant
-an elephant and a bear
-a bear and a zebra
-a zebra and a giraffe
-a giraffe and a bird
-a chair and a couch
-a couch and a potted plant
-a potted plant and a tv
-a tv and a laptop
-a laptop and a remote
-a remote and a keyboard
-a keyboard and a cell phone
-a cell phone and a book
-a book and a clock
-a clock and a backpack
-a backpack and an umbrella
-an umbrella and a handbag
-a handbag and a tie
-a tie and a suitcase
-a suitcase and a vase
-a vase and scissors
-scissors and a teddy bear
-a teddy bear and a frisbee
-a frisbee and skis
-skis and a snowboard
-a snowboard and a sports ball
-a sports ball and a kite
-a kite and a baseball bat
-a baseball bat and a baseball glove
-a baseball glove and a skateboard
-a skateboard and a surfboard
-a surfboard and a tennis racket
-a tennis racket and a bottle
-a bottle and a chair
-an airplane and a train
-a train and a boat
-a boat and an airplane
-a bicycle and a car
-a car and a motorcycle
-a motorcycle and a bus
-a bus and a traffic light
-a traffic light and a fire hydrant
-a fire hydrant and a stop sign
-a stop sign and a parking meter
-a parking meter and a truck
-a truck and a bicycle
-a toilet and a hair drier
-a hair drier and a toothbrush
-a toothbrush and a sink
-a sink and a toilet
-a wine glass and a chair
-a cup and a couch
-a fork and a potted plant
-a knife and a tv
-a spoon and a laptop
-a bowl and a remote
-a banana and a keyboard
-an apple and a cell phone
-a sandwich and a book
-an orange and a clock
-broccoli and a backpack
-a carrot and an umbrella
-a hot dog and a handbag
-a pizza and a tie
-a donut and a suitcase
-a cake and a vase
-an oven and scissors
-a toaster and a teddy bear
-a microwave and a frisbee
-a refrigerator and skis
-a bicycle and an airplane
-a car and a train
-a motorcycle and a boat
-a person and a toilet
-a person and a hair drier
-a person and a toothbrush
-a person and a sink
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/object_class.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/object_class.txt
deleted file mode 100644
index 11aa72dd0e757e7272c1c734f804d482abe98c5f..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/object_class.txt
+++ /dev/null
@@ -1,79 +0,0 @@
-a person
-a bicycle
-a car
-a motorcycle
-an airplane
-a bus
-a train
-a truck
-a boat
-a traffic light
-a fire hydrant
-a stop sign
-a parking meter
-a bench
-a bird
-a cat
-a dog
-a horse
-a sheep
-a cow
-an elephant
-a bear
-a zebra
-a giraffe
-a backpack
-an umbrella
-a handbag
-a tie
-a suitcase
-a frisbee
-skis
-a snowboard
-a sports ball
-a kite
-a baseball bat
-a baseball glove
-a skateboard
-a surfboard
-a tennis racket
-a bottle
-a wine glass
-a cup
-a fork
-a knife
-a spoon
-a bowl
-a banana
-an apple
-a sandwich
-an orange
-broccoli
-a carrot
-a hot dog
-a pizza
-a donut
-a cake
-a chair
-a couch
-a potted plant
-a bed
-a dining table
-a toilet
-a tv
-a laptop
-a remote
-a keyboard
-a cell phone
-a microwave
-an oven
-a toaster
-a sink
-a refrigerator
-a book
-a clock
-a vase
-scissors
-a teddy bear
-a hair drier
-a toothbrush
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/overall_consistency.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/overall_consistency.txt
deleted file mode 100644
index 360c2673b95fbd898222b0ea3a0fec98779f4b25..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/overall_consistency.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-Close up of grapes on a rotating table.
-Turtle swimming in ocean.
-A storm trooper vacuuming the beach.
-A panda standing on a surfboard in the ocean in sunset.
-An astronaut feeding ducks on a sunny afternoon, reflection from the water.
-Two pandas discussing an academic paper.
-Sunset time lapse at the beach with moving clouds and colors in the sky.
-A fat rabbit wearing a purple robe walking through a fantasy landscape.
-A koala bear playing piano in the forest.
-An astronaut flying in space.
-Fireworks.
-An animated painting of fluffy white clouds moving in sky.
-Flying through fantasy landscapes.
-A bigfoot walking in the snowstorm.
-A squirrel eating a burger.
-A cat wearing sunglasses and working as a lifeguard at a pool.
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks.
-Splash of turquoise water in extreme slow motion, alpha channel included.
-an ice cream is melting on the table.
-a drone flying over a snowy forest.
-a shark is swimming in the ocean.
-Aerial panoramic video from a drone of a fantasy land.
-a teddy bear is swimming in the ocean.
-time lapse of sunrise on mars.
-golden fish swimming in the ocean.
-An artist brush painting on a canvas close up.
-A drone view of celebration with Christmas tree and fireworks, starry sky - background.
-happy dog wearing a yellow turtleneck, studio, portrait, facing camera, dark background
-Origami dancers in white paper, 3D render, on white background, studio shot, dancing modern dance.
-Campfire at night in a snowy forest with starry sky in the background.
-a fantasy landscape
-A 3D model of a 1800s victorian house.
-this is how I do makeup in the morning.
-A raccoon that looks like a turtle, digital art.
-Robot dancing in Times Square.
-Busy freeway at night.
-Balloon full of water exploding in extreme slow motion.
-An astronaut is riding a horse in the space in a photorealistic style.
-Macro slo-mo. Slow motion cropped closeup of roasted coffee beans falling into an empty bowl.
-Sewing machine, old sewing machine working.
-Motion colour drop in water, ink swirling in water, colourful ink in water, abstraction fancy dream cloud of ink.
-Few big purple plums rotating on the turntable. water drops appear on the skin during rotation. isolated on the white background. close-up. macro.
-Vampire makeup face of beautiful girl, red contact lenses.
-Ashtray full of butts on table, smoke flowing on black background, close-up
-Pacific coast, carmel by the sea ocean and waves.
-A teddy bear is playing drum kit in NYC Times Square.
-A corgi is playing drum kit.
-An Iron man is playing the electronic guitar, high electronic guitar.
-A raccoon is playing the electronic guitar.
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Vincent van Gogh
-A corgi's head depicted as an explosion of a nebula
-A fantasy landscape
-A future where humans have achieved teleportation technology
-A jellyfish floating through the ocean, with bioluminescent tentacles
-A Mars rover moving on Mars
-A panda drinking coffee in a cafe in Paris
-A space shuttle launching into orbit, with flames and smoke billowing out from the engines
-A steam train moving on a mountainside
-A super cool giant robot in Cyberpunk Beijing
-A tropical beach at sunrise, with palm trees and crystal-clear water in the foreground
-Cinematic shot of Van Gogh's selfie, Van Gogh style
-Gwen Stacy reading a book
-Iron Man flying in the sky
-The bund Shanghai, oil painting
-Yoda playing guitar on the stage
-A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo
-A beautiful coastal beach in spring, waves lapping on sand by Vincent van Gogh
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background
-A car moving slowly on an empty street, rainy evening
-A cat eating food out of a bowl
-A cat wearing sunglasses at a pool
-A confused panda in calculus class
-A cute fluffy panda eating Chinese food in a restaurant
-A cute happy Corgi playing in park, sunset
-A cute raccoon playing guitar in a boat on the ocean
-A happy fuzzy panda playing guitar nearby a campfire, snow mountain in the background
-A lightning striking atop of eiffel tower, dark clouds in the sky
-A modern art museum, with colorful paintings
-A panda cooking in the kitchen
-A panda playing on a swing set
-A polar bear is playing guitar
-A raccoon dressed in suit playing the trumpet, stage background
-A robot DJ is playing the turntable, in heavy raining futuristic tokyo rooftop cyberpunk night, sci-fi, fantasy
-A shark swimming in clear Caribbean ocean
-A super robot protecting city
-A teddy bear washing the dishes
-An epic tornado attacking above a glowing city at night, the tornado is made of smoke
-An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with umbrellas
-Clown fish swimming through the coral reef
-Hyper-realistic spaceship landing on Mars
-The bund Shanghai, vibrant color
-Vincent van Gogh is painting in the room
-Yellow flowers swing in the wind
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/scene.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/scene.txt
deleted file mode 100644
index 1ec0366233eba9957eb5d2772064c4b5d0e9b05d..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/scene.txt
+++ /dev/null
@@ -1,86 +0,0 @@
-alley
-amusement park
-aquarium
-arch
-art gallery
-bathroom
-bakery shop
-ballroom
-bar
-barn
-basement
-beach
-bedroom
-bridge
-botanical garden
-cafeteria
-campsite
-campus
-carrousel
-castle
-cemetery
-classroom
-cliff
-crosswalk
-construction site
-corridor
-courtyard
-desert
-downtown
-driveway
-farm
-food court
-football field
-forest road
-fountain
-gas station
-glacier
-golf course
-indoor gymnasium
-harbor
-highway
-hospital
-house
-iceberg
-industrial area
-jail cell
-junkyard
-kitchen
-indoor library
-lighthouse
-laboratory
-mansion
-marsh
-mountain
-indoor movie theater
-indoor museum
-music studio
-nursery
-ocean
-office
-palace
-parking lot
-pharmacy
-phone booth
-raceway
-restaurant
-river
-science museum
-shower
-ski slope
-sky
-skyscraper
-baseball stadium
-staircase
-street
-supermarket
-indoor swimming pool
-tower
-outdoor track
-train railway
-train station platform
-underwater coral reef
-valley
-volcano
-waterfall
-windmill
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/spatial_relationship.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/spatial_relationship.txt
deleted file mode 100644
index 25fe959fd8dc27674a2d52b4349ff3aacbe8d66c..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/spatial_relationship.txt
+++ /dev/null
@@ -1,84 +0,0 @@
-a bicycle on the left of a car, front view
-a car on the right of a motorcycle, front view
-a motorcycle on the left of a bus, front view
-a bus on the right of a traffic light, front view
-a traffic light on the left of a fire hydrant, front view
-a fire hydrant on the right of a stop sign, front view
-a stop sign on the left of a parking meter, front view
-a parking meter on the right of a bench, front view
-a bench on the left of a truck, front view
-a truck on the right of a bicycle, front view
-a bird on the left of a cat, front view
-a cat on the right of a dog, front view
-a dog on the left of a horse, front view
-a horse on the right of a sheep, front view
-a sheep on the left of a cow, front view
-a cow on the right of an elephant, front view
-an elephant on the left of a bear, front view
-a bear on the right of a zebra, front view
-a zebra on the left of a giraffe, front view
-a giraffe on the right of a bird, front view
-a bottle on the left of a wine glass, front view
-a wine glass on the right of a cup, front view
-a cup on the left of a fork, front view
-a fork on the right of a knife, front view
-a knife on the left of a spoon, front view
-a spoon on the right of a bowl, front view
-a bowl on the left of a bottle, front view
-a potted plant on the left of a remote, front view
-a remote on the right of a clock, front view
-a clock on the left of a vase, front view
-a vase on the right of scissors, front view
-scissors on the left of a teddy bear, front view
-a teddy bear on the right of a potted plant, front view
-a frisbee on the left of a sports ball, front view
-a sports ball on the right of a baseball bat, front view
-a baseball bat on the left of a baseball glove, front view
-a baseball glove on the right of a tennis racket, front view
-a tennis racket on the left of a frisbee, front view
-a toilet on the left of a hair drier, front view
-a hair drier on the right of a toothbrush, front view
-a toothbrush on the left of a sink, front view
-a sink on the right of a toilet, front view
-a chair on the left of a couch, front view
-a couch on the right of a bed, front view
-a bed on the left of a tv, front view
-a tv on the right of a dining table, front view
-a dining table on the left of a chair, front view
-an airplane on the left of a train, front view
-a train on the right of a boat, front view
-a boat on the left of an airplane, front view
-an oven on the top of a toaster, front view
-an oven on the bottom of a toaster, front view
-a toaster on the top of a microwave, front view
-a toaster on the bottom of a microwave, front view
-a microwave on the top of an oven, front view
-a microwave on the bottom of an oven, front view
-a banana on the top of an apple, front view
-a banana on the bottom of an apple, front view
-an apple on the top of a sandwich, front view
-an apple on the bottom of a sandwich, front view
-a sandwich on the top of an orange, front view
-a sandwich on the bottom of an orange, front view
-an orange on the top of a carrot, front view
-an orange on the bottom of a carrot, front view
-a carrot on the top of a hot dog, front view
-a carrot on the bottom of a hot dog, front view
-a hot dog on the top of a pizza, front view
-a hot dog on the bottom of a pizza, front view
-a pizza on the top of a donut, front view
-a pizza on the bottom of a donut, front view
-a donut on the top of broccoli, front view
-a donut on the bottom of broccoli, front view
-broccoli on the top of a banana, front view
-broccoli on the bottom of a banana, front view
-skis on the top of a snowboard, front view
-skis on the bottom of a snowboard, front view
-a snowboard on the top of a kite, front view
-a snowboard on the bottom of a kite, front view
-a kite on the top of a skateboard, front view
-a kite on the bottom of a skateboard, front view
-a skateboard on the top of a surfboard, front view
-a skateboard on the bottom of a surfboard, front view
-a surfboard on the top of skis, front view
-a surfboard on the bottom of skis, front view
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/subject_consistency.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/subject_consistency.txt
deleted file mode 100644
index 97cb77e5efe48b1fc3730eb9d6144df55baadcb5..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/subject_consistency.txt
+++ /dev/null
@@ -1,72 +0,0 @@
-a person swimming in ocean
-a person giving a presentation to a room full of colleagues
-a person washing the dishes
-a person eating a burger
-a person walking in the snowstorm
-a person drinking coffee in a cafe
-a person playing guitar
-a bicycle leaning against a tree
-a bicycle gliding through a snowy field
-a bicycle slowing down to stop
-a bicycle accelerating to gain speed
-a car stuck in traffic during rush hour
-a car turning a corner
-a car slowing down to stop
-a car accelerating to gain speed
-a motorcycle cruising along a coastal highway
-a motorcycle turning a corner
-a motorcycle slowing down to stop
-a motorcycle gliding through a snowy field
-a motorcycle accelerating to gain speed
-an airplane soaring through a clear blue sky
-an airplane taking off
-an airplane landing smoothly on a runway
-an airplane accelerating to gain speed
-a bus turning a corner
-a bus stuck in traffic during rush hour
-a bus accelerating to gain speed
-a train speeding down the tracks
-a train crossing over a tall bridge
-a train accelerating to gain speed
-a truck turning a corner
-a truck anchored in a tranquil bay
-a truck stuck in traffic during rush hour
-a truck slowing down to stop
-a truck accelerating to gain speed
-a boat sailing smoothly on a calm lake
-a boat slowing down to stop
-a boat accelerating to gain speed
-a bird soaring gracefully in the sky
-a bird building a nest from twigs and leaves
-a bird flying over a snowy forest
-a cat grooming itself meticulously with its tongue
-a cat playing in park
-a cat drinking water
-a cat running happily
-a dog enjoying a peaceful walk
-a dog playing in park
-a dog drinking water
-a dog running happily
-a horse bending down to drink water from a river
-a horse galloping across an open field
-a horse taking a peaceful walk
-a horse running to join a herd of its kind
-a sheep bending down to drink water from a river
-a sheep taking a peaceful walk
-a sheep running to join a herd of its kind
-a cow bending down to drink water from a river
-a cow chewing cud while resting in a tranquil barn
-a cow running to join a herd of its kind
-an elephant spraying itself with water using its trunk to cool down
-an elephant taking a peaceful walk
-an elephant running to join a herd of its kind
-a bear catching a salmon in its powerful jaws
-a bear sniffing the air for scents of food
-a bear climbing a tree
-a bear hunting for prey
-a zebra bending down to drink water from a river
-a zebra running to join a herd of its kind
-a zebra taking a peaceful walk
-a giraffe bending down to drink water from a river
-a giraffe taking a peaceful walk
-a giraffe running to join a herd of its kind
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/temporal_flickering.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/temporal_flickering.txt
deleted file mode 100644
index 9fb5cad2d0c6b7a9cdeea5e0c084c2140fcd4152..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/temporal_flickering.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-In a still frame, a stop sign
-a toilet, frozen in time
-a laptop, frozen in time
-A tranquil tableau of alley
-A tranquil tableau of bar
-A tranquil tableau of barn
-A tranquil tableau of bathroom
-A tranquil tableau of bedroom
-A tranquil tableau of cliff
-In a still frame, courtyard
-In a still frame, gas station
-A tranquil tableau of house
-indoor gymnasium, frozen in time
-A tranquil tableau of indoor library
-A tranquil tableau of kitchen
-A tranquil tableau of palace
-In a still frame, parking lot
-In a still frame, phone booth
-A tranquil tableau of restaurant
-A tranquil tableau of tower
-A tranquil tableau of a bowl
-A tranquil tableau of an apple
-A tranquil tableau of a bench
-A tranquil tableau of a bed
-A tranquil tableau of a chair
-A tranquil tableau of a cup
-A tranquil tableau of a dining table
-In a still frame, a pear
-A tranquil tableau of a bunch of grapes
-A tranquil tableau of a bowl on the kitchen counter
-A tranquil tableau of a beautiful, handcrafted ceramic bowl
-A tranquil tableau of an antique bowl
-A tranquil tableau of an exquisite mahogany dining table
-A tranquil tableau of a wooden bench in the park
-A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers
-In a still frame, a park bench with a view of the lake
-A tranquil tableau of a vintage rocking chair was placed on the porch
-A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars
-A tranquil tableau of the phone booth was tucked away in a quiet alley
-a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time
-A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside
-A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow
-In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water
-In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape
-In a still frame, the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens
-In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels
-A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility
-In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity
-static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water
-A tranquil tableau of an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night
-A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water
-In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square
-In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner
-A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy
-A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins
-A tranquil tableau of in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes
-A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved façades
-In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall
-A tranquil tableau of in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels
-A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour
-In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting
-In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light
-A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon
-A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon
-A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space
-In a still frame, amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk
-In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier
-A tranquil tableau of a country estate's library featured elegant wooden shelves
-A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently
-A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm
-A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden
-In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface
-In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation
-A tranquil tableau of a peaceful orchid garden showcased a variety of delicate blooms
-A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/temporal_style.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/temporal_style.txt
deleted file mode 100644
index fea23cbd7f248d93978ee44fe1860f75e6978117..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/VBench/prompts_per_dimension/temporal_style.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-A beautiful coastal beach in spring, waves lapping on sand, in super slow motion
-A beautiful coastal beach in spring, waves lapping on sand, zoom in
-A beautiful coastal beach in spring, waves lapping on sand, zoom out
-A beautiful coastal beach in spring, waves lapping on sand, pan left
-A beautiful coastal beach in spring, waves lapping on sand, pan right
-A beautiful coastal beach in spring, waves lapping on sand, tilt up
-A beautiful coastal beach in spring, waves lapping on sand, tilt down
-A beautiful coastal beach in spring, waves lapping on sand, with an intense shaking effect
-A beautiful coastal beach in spring, waves lapping on sand, featuring a steady and smooth perspective
-A beautiful coastal beach in spring, waves lapping on sand, racking focus
-The bund Shanghai, in super slow motion
-The bund Shanghai, zoom in
-The bund Shanghai, zoom out
-The bund Shanghai, pan left
-The bund Shanghai, pan right
-The bund Shanghai, tilt up
-The bund Shanghai, tilt down
-The bund Shanghai, with an intense shaking effect
-The bund Shanghai, featuring a steady and smooth perspective
-The bund Shanghai, racking focus
-a shark is swimming in the ocean, in super slow motion
-a shark is swimming in the ocean, zoom in
-a shark is swimming in the ocean, zoom out
-a shark is swimming in the ocean, pan left
-a shark is swimming in the ocean, pan right
-a shark is swimming in the ocean, tilt up
-a shark is swimming in the ocean, tilt down
-a shark is swimming in the ocean, with an intense shaking effect
-a shark is swimming in the ocean, featuring a steady and smooth perspective
-a shark is swimming in the ocean, racking focus
-A panda drinking coffee in a cafe in Paris, in super slow motion
-A panda drinking coffee in a cafe in Paris, zoom in
-A panda drinking coffee in a cafe in Paris, zoom out
-A panda drinking coffee in a cafe in Paris, pan left
-A panda drinking coffee in a cafe in Paris, pan right
-A panda drinking coffee in a cafe in Paris, tilt up
-A panda drinking coffee in a cafe in Paris, tilt down
-A panda drinking coffee in a cafe in Paris, with an intense shaking effect
-A panda drinking coffee in a cafe in Paris, featuring a steady and smooth perspective
-A panda drinking coffee in a cafe in Paris, racking focus
-A cute happy Corgi playing in park, sunset, in super slow motion
-A cute happy Corgi playing in park, sunset, zoom in
-A cute happy Corgi playing in park, sunset, zoom out
-A cute happy Corgi playing in park, sunset, pan left
-A cute happy Corgi playing in park, sunset, pan right
-A cute happy Corgi playing in park, sunset, tilt up
-A cute happy Corgi playing in park, sunset, tilt down
-A cute happy Corgi playing in park, sunset, with an intense shaking effect
-A cute happy Corgi playing in park, sunset, featuring a steady and smooth perspective
-A cute happy Corgi playing in park, sunset, racking focus
-Gwen Stacy reading a book, in super slow motion
-Gwen Stacy reading a book, zoom in
-Gwen Stacy reading a book, zoom out
-Gwen Stacy reading a book, pan left
-Gwen Stacy reading a book, pan right
-Gwen Stacy reading a book, tilt up
-Gwen Stacy reading a book, tilt down
-Gwen Stacy reading a book, with an intense shaking effect
-Gwen Stacy reading a book, featuring a steady and smooth perspective
-Gwen Stacy reading a book, racking focus
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in super slow motion
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom in
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom out
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan left
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan right
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt up
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt down
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, with an intense shaking effect
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, featuring a steady and smooth perspective
-A boat sailing leisurely along the Seine River with the Eiffel Tower in background, racking focus
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in super slow motion
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom in
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom out
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan left
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan right
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt up
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt down
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, with an intense shaking effect
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, featuring a steady and smooth perspective
-A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, racking focus
-An astronaut flying in space, in super slow motion
-An astronaut flying in space, zoom in
-An astronaut flying in space, zoom out
-An astronaut flying in space, pan left
-An astronaut flying in space, pan right
-An astronaut flying in space, tilt up
-An astronaut flying in space, tilt down
-An astronaut flying in space, with an intense shaking effect
-An astronaut flying in space, featuring a steady and smooth perspective
-An astronaut flying in space, racking focus
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in super slow motion
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom in
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom out
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan left
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan right
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt up
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt down
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, with an intense shaking effect
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, featuring a steady and smooth perspective
-Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, racking focus
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/imagenet_id.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/imagenet_id.txt
deleted file mode 100644
index 9085aa0034c05cc60e40b1f14be1bb4a2a171d2f..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/imagenet_id.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-207
-360
-387
-974
-88
-979
-417
-279
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/imagenet_labels.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/imagenet_labels.txt
deleted file mode 100644
index 6493fdbf907465063a2cee904fe3994a90d420cd..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/imagenet_labels.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-golden retriever
-otter
-lesser panda
-geyser
-macaw
-valley
-balloon
-golden panda
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2i_samples.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2i_samples.txt
deleted file mode 100644
index 9b729527cee2d4da1d28415e42c52c6627217d10..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2i_samples.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-A small cactus with a happy face in the Sahara desert.
-Bright scene, aerial view, ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens.
-Nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph.
-Poster of a mechanical cat, technical schematics viewed from the front.
-Luffy from ONEPIECE, handsome face, fantasy.
-Real beautiful woman.
-An alpaca made of colorful building blocks, cyberpunk.
-artistic
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_car.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_car.txt
deleted file mode 100644
index f9bd226fa90d7554b73ba8d7011a25afa970eadc..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_car.txt
+++ /dev/null
@@ -1 +0,0 @@
-|0|A car driving in the forest.|2|A car driving in the desert.|4|A car driving near the coast.|6|A car driving in the city.|8|A car driving near a mountain.|10|A car driving on the surface of a river.|12|A car driving on the surface of the earth.|14|A car driving in the universe.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,16,0.4"}
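The single prompt line above is a "looped" text-to-video prompt: alternating loop indices and prompt strings separated by `|`, followed by a trailing JSON object carrying extra generation options (`reference_path`, `mask_strategy`). Below is a minimal Python sketch of how such a line could be split into (index, prompt) pairs plus an options dict; the function name and parsing rules are illustrative assumptions, not the project's own loader.

```python
import json

def parse_looped_prompt(line: str):
    """Illustrative parser (an assumption, not OpenSora's own loader) for lines of
    the form '|0|prompt A|2|prompt B|...{json options}'."""
    options = {}
    brace = line.find("{")
    if brace != -1:
        # Trailing JSON object with extra options such as reference_path / mask_strategy.
        options = json.loads(line[brace:])
        line = line[:brace]
    fields = [f for f in line.split("|") if f]
    # Alternating loop indices and prompt strings.
    pairs = [(int(fields[i]), fields[i + 1]) for i in range(0, len(fields), 2)]
    return pairs, options

pairs, opts = parse_looped_prompt(
    '|0|A car driving in the forest.|2|A car driving in the desert.'
    '{"reference_path": "ref.mp4", "mask_strategy": "0,0,0,0,16,0.4"}'
)
print(pairs)  # [(0, 'A car driving in the forest.'), (2, 'A car driving in the desert.')]
print(opts)   # {'reference_path': 'ref.mp4', 'mask_strategy': '0,0,0,0,16,0.4'}
```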
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_latte.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_latte.txt
deleted file mode 100644
index a61359ca41325db817d8eb2c6a255d997f0382ca..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_latte.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-Yellow and black tropical fish dart through the sea.
-An epic tornado attacking above a glowing city at night.
-Slow pan upward of blazing oak fire in an indoor fireplace.
-a cat wearing sunglasses and working as a lifeguard at a pool.
-Sunset over the sea.
-A dog in astronaut suit and sunglasses floating in space.
-An astronaut flying in space, 4k, high resolution
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_ref.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_ref.txt
deleted file mode 100644
index c0debe5c177192ae57e65f06fc1bc7491ff08b27..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_ref.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.
-In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave.
-Pirate ship in a cosmic maelstrom nebula.
-Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.
-A sad small cactus in the Sahara desert becomes happy.
-A car driving on a road in the middle of a desert.
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_samples.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_samples.txt
deleted file mode 100644
index 7953f3752f0a9648bbb716c1cea9cbf99f5237b4..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_samples.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures.
-A majestic beauty of a waterfall cascading down a cliff into a serene lake. The waterfall, with its powerful flow, is the central focus of the video. The surrounding landscape is lush and green, with trees and foliage adding to the natural beauty of the scene. The camera angle provides a bird's eye view of the waterfall, allowing viewers to appreciate the full height and grandeur of the waterfall. The video is a stunning representation of nature's power and beauty.
-A vibrant scene of a snowy mountain landscape. The sky is filled with a multitude of colorful hot air balloons, each floating at different heights, creating a dynamic and lively atmosphere. The balloons are scattered across the sky, some closer to the viewer, others further away, adding depth to the scene. Below, the mountainous terrain is blanketed in a thick layer of snow, with a few patches of bare earth visible here and there. The snow-covered mountains provide a stark contrast to the colorful balloons, enhancing the visual appeal of the scene. In the foreground, a few cars can be seen driving along a winding road that cuts through the mountains. The cars are small compared to the vastness of the landscape, emphasizing the grandeur of the surroundings. The overall style of the video is a mix of adventure and tranquility, with the hot air balloons adding a touch of whimsy to the otherwise serene mountain landscape. The video is likely shot during the day, as the lighting is bright and even, casting soft shadows on the snow-covered mountains.
-The vibrant beauty of a sunflower field. The sunflowers, with their bright yellow petals and dark brown centers, are in full bloom, creating a stunning contrast against the green leaves and stems. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. The sun is shining brightly, casting a warm glow on the flowers and highlighting their intricate details. The video is shot from a low angle, looking up at the sunflowers, which adds a sense of grandeur and awe to the scene. The sunflowers are the main focus of the video, with no other objects or people present. The video is a celebration of nature's beauty and the simple joy of a sunny day in the countryside.
-A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell, is the main focus of the video, swimming gracefully towards the right side of the frame. The coral reef, teeming with life, is visible in the background, providing a vibrant and colorful backdrop to the turtle's journey. Several small fish, darting around the turtle, add a sense of movement and dynamism to the scene. The video is shot from a slightly elevated angle, providing a comprehensive view of the turtle's surroundings. The overall style of the video is calm and peaceful, capturing the beauty and tranquility of the underwater world.
-A vibrant underwater scene. A group of blue fish, with yellow fins, are swimming around a coral reef. The coral reef is a mix of brown and green, providing a natural habitat for the fish. The water is a deep blue, indicating a depth of around 30 feet. The fish are swimming in a circular pattern around the coral reef, indicating a sense of motion and activity. The overall scene is a beautiful representation of marine life.
-A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. The scene is a blur of motion, with cars speeding by and pedestrians navigating the crosswalks. The cityscape is a mix of towering buildings and illuminated signs, creating a vibrant and dynamic atmosphere. The perspective of the video is from a high angle, providing a bird's eye view of the street and its surroundings. The overall style of the video is dynamic and energetic, capturing the essence of urban life at night.
-A snowy forest landscape with a dirt road running through it. The road is flanked by trees covered in snow, and the ground is also covered in snow. The sun is shining, creating a bright and serene atmosphere. The road appears to be empty, and there are no people or animals visible in the video. The style of the video is a natural landscape shot, with a focus on the beauty of the snowy forest and the peacefulness of the road.
-The dynamic movement of tall, wispy grasses swaying in the wind. The sky above is filled with clouds, creating a dramatic backdrop. The sunlight pierces through the clouds, casting a warm glow on the scene. The grasses are a mix of green and brown, indicating a change in seasons. The overall style of the video is naturalistic, capturing the beauty of the landscape in a realistic manner. The focus is on the grasses and their movement, with the sky serving as a secondary element. The video does not contain any human or animal elements.
-A serene night scene in a forested area. The first frame shows a tranquil lake reflecting the star-filled sky above. The second frame reveals a beautiful sunset, casting a warm glow over the landscape. The third frame showcases the night sky, filled with stars and a vibrant Milky Way galaxy. The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop. The style of the video is naturalistic, emphasizing the beauty of the night sky and the peacefulness of the forest.
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_short.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_short.txt
deleted file mode 100644
index 002864828da40e46dc4e19d7d580ccedfa1be7d9..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_short.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-A fat rabbit wearing a purple robe walking through a fantasy landscape
-Waves crashing against a lone lighthouse, ominous lighting
-A mystical forest showcasing the adventures of travelers who enter
-A blue-haired mage singing
-A surreal landscape with floating islands and waterfalls in the sky craft
-A blue bird standing in water
-A young man walks alone by the seaside
-Pink rose on a glass surface with droplets, close-up
-Drone viewpoint, a subway train coming out of a tunnel
-Space with all planets green and pink color with background of bright white stars
-A city floating in an astral space, with stars and nebulae
-Sunrise on top of a high-rise building
-Pink and cyan powder explosions
-Deers in the woods gaze into the camera under the sunlight
-In a flash of lightning, a wizard appeared from thin air, his long robes billowing in the wind
-A futuristic cyberpunk cityscape at night with towering neon-lit skyscrapers
-A scene where the trees, flowers, and animals come together to create a symphony of nature
-A ghostly ship sailing through the clouds, navigating through a sea under a moonlit sky
-A sunset with beautiful beach
-A young man walking alone in the forest
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_sora.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_sora.txt
deleted file mode 100644
index eeb887b1863e590e45054fb766694c1275cee987..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/t2v_sora.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.
-Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.
-A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors.
-Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.
-Animated scene features a close-up of a short fluffy monster kneeling beside a melting red candle. The art style is 3D and realistic, with a focus on lighting and texture. The mood of the painting is one of wonder and curiosity, as the monster gazes at the flame with wide eyes and open mouth. Its pose and expression convey a sense of innocence and playfulness, as if it is exploring the world around it for the first time. The use of warm colors and dramatic lighting further enhances the cozy atmosphere of the image.
-A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures.
-This close-up shot of a Victoria crowned pigeon showcases its striking blue plumage and red chest. Its crest is made of delicate, lacy feathers, while its eye is a striking red color. The bird’s head is tilted slightly to the side, giving the impression of it looking regal and majestic. The background is blurred, drawing attention to the bird’s striking appearance.
-Photorealistic closeup video of two pirate ships battling each other as they sail inside a cup of coffee.
-A young man at his 20s is sitting on a piece of cloud in the sky, reading a book.
-Historical footage of California during the gold rush.
-A close up view of a glass sphere that has a zen garden within it. There is a small dwarf in the sphere who is raking the zen garden and creating patterns in the sand.
-Extreme close up of a 24 year old woman’s eye blinking, standing in Marrakech during magic hour, cinematic film shot in 70mm, depth of field, vivid colors, cinematic
-A cartoon kangaroo disco dances.
-A beautiful homemade video showing the people of Lagos, Nigeria in the year 2056. Shot with a mobile phone camera.
-A petri dish with a bamboo forest growing within it that has tiny red pandas running around.
-The camera rotates around a large stack of vintage televisions all showing different programs — 1950s sci-fi movies, horror movies, news, static, a 1970s sitcom, etc, set inside a large New York museum gallery.
-3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. The creature stops to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. The creature looks up in awe at a large, glowing tree that seems to be the heart of the forest.
-The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from it’s tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds.
-Reflections in the window of a train traveling through the Tokyo suburbs.
-A drone camera circles around a beautiful historic church built on a rocky outcropping along the Amalfi Coast, the view showcases historic and magnificent architectural details and tiered pathways and patios, waves are seen crashing against the rocks below as the view overlooks the horizon of the coastal waters and hilly landscapes of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a magical and romantic feeling to the scene, the view is stunning captured with beautiful photography.
-A large orange octopus is seen resting on the bottom of the ocean floor, blending in with the sandy and rocky terrain. Its tentacles are spread out around its body, and its eyes are closed. The octopus is unaware of a king crab that is crawling towards it from behind a rock, its claws raised and ready to attack. The crab is brown and spiny, with long legs and antennae. The scene is captured from a wide angle, showing the vastness and depth of the ocean. The water is clear and blue, with rays of sunlight filtering through. The shot is sharp and crisp, with a high dynamic range. The octopus and the crab are in focus, while the background is slightly blurred, creating a depth of field effect.
-A flock of paper airplanes flutters through a dense jungle, weaving around trees as if they were migrating birds.
-A cat waking up its sleeping owner demanding breakfast. The owner tries to ignore the cat, but the cat tries new tactics and finally the owner pulls out a secret stash of treats from under the pillow to hold the cat off a little longer.
-Borneo wildlife on the Kinabatangan River
-A Chinese Lunar New Year celebration video with Chinese Dragon.
-Tour of an art gallery with many beautiful works of art in different styles.
-Beautiful, snowy Tokyo city is bustling. The camera moves through the bustling city street, following several people enjoying the beautiful snowy weather and shopping at nearby stalls. Gorgeous sakura petals are flying through the wind along with snowflakes.
-A stop motion animation of a flower growing out of the windowsill of a suburban house.
-The story of a robot’s life in a cyberpunk setting.
-An extreme close-up of a gray-haired man with a beard in his 60s, he is deep in thought pondering the history of the universe as he sits at a cafe in Paris, his eyes focus on people offscreen as they walk as he sits mostly motionless, he is dressed in a wool suit coat with a button-down shirt, he wears a brown beret and glasses and has a very professorial appearance, and at the end he offers a subtle closed-mouth smile as if he found the answer to the mystery of life, the lighting is very cinematic with the golden light and the Parisian streets and city in the background, depth of field, cinematic 35mm film.
-A beautiful silhouette animation shows a wolf howling at the moon, feeling lonely, until it finds its pack.
-New York City submerged like Atlantis. Fish, whales, sea turtles and sharks swim through the streets of New York.
-A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in snow.
-Step-printing scene of a person running, cinematic film shot in 35mm.
-Five gray wolf pups frolicking and chasing each other around a remote gravel road, surrounded by grass. The pups run and leap, chasing each other, and nipping at each other, playing.
-Basketball through hoop then explodes.
-Archeologists discover a generic plastic chair in the desert, excavating and dusting it with great care.
-A grandmother with neatly combed grey hair stands behind a colorful birthday cake with numerous candles at a wood dining room table, expression is one of pure joy and happiness, with a happy glow in her eye. She leans forward and blows out the candles with a gentle puff, the cake has pink frosting and sprinkles and the candles cease to flicker, the grandmother wears a light blue blouse adorned with floral patterns, several happy friends and family sitting at the table can be seen celebrating, out of focus. The scene is beautifully captured, cinematic, showing a 3/4 view of the grandmother and the dining room. Warm color tones and soft lighting enhance the mood.
-The camera directly faces colorful buildings in Burano, Italy. An adorable dalmatian looks through a window on a building on the ground floor. Many people are walking and cycling along the canal streets in front of the buildings.
-An adorable happy otter confidently stands on a surfboard wearing a yellow lifejacket, riding along turquoise tropical waters near lush tropical islands, 3D digital render art style.
-This close-up shot of a chameleon showcases its striking color changing capabilities. The background is blurred, drawing attention to the animal’s striking appearance.
-A corgi vlogging itself in tropical Maui.
-A white and orange tabby cat is seen happily darting through a dense garden, as if chasing something. Its eyes are wide and happy as it jogs forward, scanning the branches, flowers, and leaves as it walks. The path is narrow as it makes its way between all the plants. The scene is captured from a ground-level angle, following the cat closely, giving a low and intimate perspective. The image is cinematic with warm tones and a grainy texture. The scattered daylight between the leaves and plants above creates a warm contrast, accentuating the cat’s orange fur. The shot is clear and sharp, with a shallow depth of field.
-Aerial view of Santorini during the blue hour, showcasing the stunning architecture of white Cycladic buildings with blue domes. The caldera views are breathtaking, and the lighting creates a beautiful, serene atmosphere.
-Tiltshift of a construction site filled with workers, equipment, and heavy machinery.
-A giant, towering cloud in the shape of a man looms over the earth. The cloud man shoots lighting bolts down to the earth.
-A Samoyed and a Golden Retriever dog are playfully romping through a futuristic neon city at night. The neon lights emitted from the nearby buildings glisten off their fur.
-The Glenfinnan Viaduct is a historic railway bridge in Scotland, UK, that crosses over the west highland line between the towns of Mallaig and Fort William. It is a stunning sight as a steam train leaves the bridge, traveling over the arch-covered viaduct. The landscape is dotted with lush greenery and rocky mountains, creating a picturesque backdrop for the train journey. The sky is blue and the sun is shining, making for a beautiful day to explore this majestic spot.
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/ucf101_id.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/ucf101_id.txt
deleted file mode 100644
index e8371f00609f33a59378dd2f6bb4385a7df8bd63..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/ucf101_id.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-0
-1
-2
-3
-4
-5
diff --git a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/ucf101_labels.txt b/PyTorch/built-in/mm/OpenSora1.1/assets/texts/ucf101_labels.txt
deleted file mode 100644
index 264dbfd8837a4b89b81d05b06c48b567dfa1d150..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/assets/texts/ucf101_labels.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-Apply Eye Makeup
-Apply Lipstick
-Archery
-Baby Crawling
-Balance Beam
-Band Marching
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/dit/inference/16x256x256.py b/PyTorch/built-in/mm/OpenSora1.1/configs/dit/inference/16x256x256.py
deleted file mode 100644
index 44818fe095f5f16f960d5e7d0c7f974076aaeaa7..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/dit/inference/16x256x256.py
+++ /dev/null
@@ -1,31 +0,0 @@
-num_frames = 16
-fps = 8
-image_size = (256, 256)
-
-# Define model
-model = dict(
- type="DiT-XL/2",
- condition="text",
- from_pretrained="PRETRAINED_MODEL",
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="clip",
- from_pretrained="openai/clip-vit-base-patch32",
- model_max_length=77,
-)
-scheduler = dict(
- type="dpm-solver",
- num_sampling_steps=20,
- cfg_scale=4.0,
-)
-dtype = "bf16"
-
-# Others
-batch_size = 2
-seed = 42
-prompt_path = "./assets/texts/ucf101_labels.txt"
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/dit/inference/1x256x256-class.py b/PyTorch/built-in/mm/OpenSora1.1/configs/dit/inference/1x256x256-class.py
deleted file mode 100644
index bebaa11e286db0ea7968723909482e18f28a12c3..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/dit/inference/1x256x256-class.py
+++ /dev/null
@@ -1,31 +0,0 @@
-num_frames = 1
-fps = 1
-image_size = (256, 256)
-
-# Define model
-model = dict(
- type="DiT-XL/2",
- no_temporal_pos_emb=True,
- condition="label_1000",
- from_pretrained="DiT-XL-2-256x256.pt",
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="classes",
- num_classes=1000,
-)
-scheduler = dict(
- type="dpm-solver",
- num_sampling_steps=20,
- cfg_scale=4.0,
-)
-dtype = "bf16"
-
-# Others
-batch_size = 2
-seed = 42
-prompt_path = "./assets/texts/imagenet_id.txt"
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/dit/inference/1x256x256.py b/PyTorch/built-in/mm/OpenSora1.1/configs/dit/inference/1x256x256.py
deleted file mode 100644
index e7cb9a2d20e6ae3a19e468f493f0e125cbb0a33f..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/dit/inference/1x256x256.py
+++ /dev/null
@@ -1,32 +0,0 @@
-num_frames = 1
-fps = 1
-image_size = (256, 256)
-
-# Define model
-model = dict(
- type="DiT-XL/2",
- no_temporal_pos_emb=True,
- condition="text",
- from_pretrained="PRETRAINED_MODEL",
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="clip",
- from_pretrained="openai/clip-vit-base-patch32",
- model_max_length=77,
-)
-scheduler = dict(
- type="dpm-solver",
- num_sampling_steps=20,
- cfg_scale=4.0,
-)
-dtype = "bf16"
-
-# Others
-batch_size = 2
-seed = 42
-prompt_path = "./assets/texts/imagenet_labels.txt"
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/dit/train/16x256x256.py b/PyTorch/built-in/mm/OpenSora1.1/configs/dit/train/16x256x256.py
deleted file mode 100644
index 42845dec8dc39b3c211942610001c41a5edc23ba..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/dit/train/16x256x256.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VideoTextDataset",
- data_path=None,
- num_frames=16,
- frame_interval=3,
- image_size=(256, 256),
-)
-
-# Define acceleration
-num_workers = 4
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="DiT-XL/2",
- from_pretrained="DiT-XL-2-256x256.pt",
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="clip",
- from_pretrained="openai/clip-vit-base-patch32",
- model_max_length=77,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 1000
-load = None
-
-batch_size = 8
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/dit/train/1x256x256.py b/PyTorch/built-in/mm/OpenSora1.1/configs/dit/train/1x256x256.py
deleted file mode 100644
index c423b24b208b473093f4b1663f99d352249be9d1..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/dit/train/1x256x256.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VideoTextDataset",
- data_path=None,
- num_frames=1,
- frame_interval=1,
- image_size=(256, 256),
- transform_name="center",
-)
-
-# Define acceleration
-num_workers = 4
-dtype = "bf16"
-grad_checkpoint = False
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="DiT-XL/2",
- no_temporal_pos_emb=True,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="clip",
- from_pretrained="openai/clip-vit-base-patch32",
- model_max_length=77,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 1000
-load = None
-
-batch_size = 128
-lr = 1e-4 # according to DiT repo
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/latte/inference/16x256x256-class.py b/PyTorch/built-in/mm/OpenSora1.1/configs/latte/inference/16x256x256-class.py
deleted file mode 100644
index 8ccf6d43604240e724f0e78f2de3aefa85449277..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/latte/inference/16x256x256-class.py
+++ /dev/null
@@ -1,30 +0,0 @@
-num_frames = 16
-fps = 8
-image_size = (256, 256)
-
-# Define model
-model = dict(
- type="Latte-XL/2",
- condition="label_101",
- from_pretrained="Latte-XL-2-256x256-ucf101.pt",
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="classes",
- num_classes=101,
-)
-scheduler = dict(
- type="dpm-solver",
- num_sampling_steps=20,
- cfg_scale=4.0,
-)
-dtype = "bf16"
-
-# Others
-batch_size = 2
-seed = 42
-prompt_path = "./assets/texts/ucf101_id.txt"
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/latte/inference/16x256x256.py b/PyTorch/built-in/mm/OpenSora1.1/configs/latte/inference/16x256x256.py
deleted file mode 100644
index 6bdd58fad5f81bcca29c2d975fd2dd89a4bf7c58..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/latte/inference/16x256x256.py
+++ /dev/null
@@ -1,31 +0,0 @@
-num_frames = 16
-fps = 8
-image_size = (256, 256)
-
-# Define model
-model = dict(
- type="Latte-XL/2",
- condition="text",
- from_pretrained="PRETRAINED_MODEL",
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="clip",
- from_pretrained="openai/clip-vit-base-patch32",
- model_max_length=77,
-)
-scheduler = dict(
- type="dpm-solver",
- num_sampling_steps=20,
- cfg_scale=4.0,
-)
-dtype = "bf16"
-
-# Others
-batch_size = 2
-seed = 42
-prompt_path = "./assets/texts/ucf101_labels.txt"
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/latte/train/16x256x256.py b/PyTorch/built-in/mm/OpenSora1.1/configs/latte/train/16x256x256.py
deleted file mode 100644
index 41573a05c5744f6ceac4453adac5fd96366ad2d1..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/latte/train/16x256x256.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VideoTextDataset",
- data_path=None,
- num_frames=16,
- frame_interval=3,
- image_size=(256, 256),
-)
-
-# Define acceleration
-num_workers = 4
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="Latte-XL/2",
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="clip",
- from_pretrained="openai/clip-vit-base-patch32",
- model_max_length=77,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 1000
-load = None
-
-batch_size = 8
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/inference/sample-ref.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/inference/sample-ref.py
deleted file mode 100644
index 735c01baddca52af5134f656a5f93b6b3546ab9d..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/inference/sample-ref.py
+++ /dev/null
@@ -1,70 +0,0 @@
-num_frames = 16
-frame_interval = 3
-fps = 24
-image_size = (240, 426)
-multi_resolution = "STDiT2"
-
-# Condition
-prompt_path = None
-prompt = [
- "A car driving on the ocean.",
- 'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff\'s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff\'s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.{"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}',
- "In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave.",
-]
-
-loop = 2
-condition_frame_length = 4
-# (
-# loop id, [the loop index of the condition image or video]
-# reference id, [the index of the condition image or video in the reference_path]
-# reference start, [the start frame of the condition image or video]
-# target start, [the location to insert]
-# length, [the number of frames to insert]
-# edit_ratio [the edit rate of the condition image or video]
-# )
-# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/config.md#advanced-inference-config for more details
-# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/commands.md#inference-with-open-sora-11 for more examples
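-# Example: the first entry below, "0,0,0,0,8,0.3", reads as: in loop 0, take reference 0
-# starting from its frame 0, insert it at target frame 0 for 8 frames, with an edit ratio
-# of 0.3 (interpretation follows the field order documented above).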
-mask_strategy = [
- "0,0,0,0,8,0.3",
- None,
- "0",
-]
-reference_path = [
- "https://cdn.openai.com/tmp/s/interp/d0.mp4",
- None,
- "assets/images/condition/wave.png",
-]
-
-# Define model
-model = dict(
- type="STDiT2-XL/2",
- from_pretrained=None,
- input_sq_size=512,
- qk_norm=True,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- cache_dir=None, # "/mnt/hdd/cached_models",
- micro_batch_size=4,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- cache_dir=None, # "/mnt/hdd/cached_models",
- model_max_length=200,
-)
-scheduler = dict(
- type="iddpm",
- num_sampling_steps=100,
- cfg_scale=7.0,
- cfg_channel=3, # or None
-)
-dtype = "bf16"
-
-# Others
-batch_size = 1
-seed = 42
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/inference/sample.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/inference/sample.py
deleted file mode 100644
index cec80736ef2019c256148b91b1563919ca09f7b0..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/inference/sample.py
+++ /dev/null
@@ -1,43 +0,0 @@
-num_frames = 16
-frame_interval = 3
-fps = 24
-image_size = (240, 426)
-multi_resolution = "STDiT2"
-
-# Define model
-model = dict(
- type="STDiT2-XL/2",
- from_pretrained=None,
- input_sq_size=512,
- qk_norm=True,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- cache_dir=None, # "/mnt/hdd/cached_models",
- micro_batch_size=4,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- cache_dir=None, # "/mnt/hdd/cached_models",
- model_max_length=200,
-)
-scheduler = dict(
- type="iddpm",
- num_sampling_steps=100,
- cfg_scale=7.0,
- cfg_channel=3, # or None
-)
-dtype = "bf16"
-
-# Condition
-prompt_path = "./assets/texts/t2v_samples.txt"
-prompt = None # prompt has higher priority than prompt_path
-
-# Others
-batch_size = 1
-seed = 42
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/benchmark.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/benchmark.py
deleted file mode 100644
index 5310b43f3642f0dba80ed8afa991f91b58344343..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/benchmark.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# this file is only for batch size search and is not used for training
-
-# Define dataset
-dataset = dict(
- type="VariableVideoTextDataset",
- data_path=None,
- num_frames=None,
- frame_interval=3,
- image_size=(None, None),
- transform_name="resize_crop",
-)
-
-# bucket config format:
-# 1. { resolution: {num_frames: (prob, batch_size)} }, in this case batch_size is ignored when searching
-# 2. { resolution: {num_frames: (prob, (max_batch_size, ))} }, batch_size is searched in the range [batch_size_start, max_batch_size), batch_size_start is configured via CLI
-# 3. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size)
-# 4. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size, step_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size) with step_size (grid search)
-# 5. { resolution: {num_frames: (0.0, None)} }, this bucket will not be used
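-# e.g. (illustrative only, not used by this file):
-#   "240p": {16: (1.0, 16)}          # format 1: batch_size ignored when searching
-#   "240p": {16: (1.0, (32,))}       # format 2: search batch_size up to 32
-#   "480p": {16: (0.5, (2, 32))}     # format 3: search batch_size in [2, 32)
-#   "480p": {16: (0.5, (2, 32, 2))}  # format 4: grid search with step size 2
-#   "720p": {32: (0.0, None)}        # format 5: bucket not used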
-
-bucket_config = {
- # == manual search ==
- # "240p": {128: (1.0, 2)}, # 4.28s/it
- # "240p": {64: (1.0, 4)},
- # "240p": {32: (1.0, 8)}, # 4.6s/it
- # "240p": {16: (1.0, 16)}, # 4.6s/it
- # "480p": {16: (1.0, 4)}, # 4.6s/it
- # "720p": {16: (1.0, 2)}, # 5.89s/it
- # "256": {1: (1.0, 256)}, # 4.5s/it
- # "512": {1: (1.0, 96)}, # 4.7s/it
- # "512": {1: (1.0, 128)}, # 6.3s/it
- # "480p": {1: (1.0, 50)}, # 4.0s/it
- # "1024": {1: (1.0, 32)}, # 6.8s/it
- # "1024": {1: (1.0, 20)}, # 4.3s/it
- # "1080p": {1: (1.0, 16)}, # 8.6s/it
- # "1080p": {1: (1.0, 8)}, # 4.4s/it
- # == stage 2 ==
- # "240p": {
- # 16: (1.0, (2, 32)),
- # 32: (1.0, (2, 16)),
- # 64: (1.0, (2, 8)),
- # 128: (1.0, (2, 6)),
- # },
- # "256": {1: (1.0, (128, 300))},
- # "512": {1: (0.5, (64, 128))},
- # "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
- # "720p": {16: (0.1, (2, 16)), 32: (0.0, None)}, # No examples now
- # "1024": {1: (0.3, (8, 64))},
- # "1080p": {1: (0.3, (2, 32))},
- # == stage 3 ==
- "720p": {1: (20, 40), 32: (0.5, (2, 4)), 64: (0.5, (1, 1))},
-}
-
-
-# Define acceleration
-num_workers = 4
-num_bucket_build_workers = 16
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="STDiT2-XL/2",
- from_pretrained=None,
- input_sq_size=512, # pretrained model is trained on 512x512
- qk_norm=True,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=4,
- local_files_only=True,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=200,
- shardformer=True,
- local_files_only=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 1000
-load = None
-
-batch_size = None
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/image.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/image.py
deleted file mode 100644
index 45748b77edaf4447830d97c20507697787be644b..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/image.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VariableVideoTextDataset",
- data_path=None,
- num_frames=None,
- frame_interval=3,
- image_size=(None, None),
- transform_name="resize_crop",
-)
-bucket_config = { # 6s/it
- "256": {1: (1.0, 256)},
- "512": {1: (1.0, 80)},
- "480p": {1: (1.0, 52)},
- "1024": {1: (1.0, 20)},
- "1080p": {1: (1.0, 8)},
-}
-
-# Define acceleration
-num_workers = 4
-num_bucket_build_workers = 16
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="STDiT2-XL/2",
- from_pretrained=None,
- input_sq_size=512, # pretrained model is trained on 512x512
- qk_norm=True,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=4,
- local_files_only=True,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=200,
- shardformer=True,
- local_files_only=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 500
-load = None
-
-batch_size = 10 # only for logging
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/stage1.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/stage1.py
deleted file mode 100644
index ff927daf3949fcc236e3f1ed6e2f708f0826af4c..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/stage1.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VariableVideoTextDataset",
- data_path=None,
- num_frames=None,
- frame_interval=3,
- image_size=(None, None),
- transform_name="resize_crop",
-)
-# IMG: 1024 (20%) 512 (30%) 256 (50%) drop (50%)
-bucket_config = { # 1s/it
- "144p": {1: (0.5, 48), 16: (1.0, 6), 32: (1.0, 3), 96: (1.0, 1)},
- "256": {1: (0.5, 24), 16: (0.5, 3), 48: (0.5, 1), 64: (0.0, None)},
- "240p": {16: (0.3, 2), 32: (0.3, 1), 64: (0.0, None)},
- "512": {1: (0.4, 12)},
- "1024": {1: (0.3, 3)},
-}
-mask_ratios = {
- "mask_no": 0.75,
- "mask_quarter_random": 0.025,
- "mask_quarter_head": 0.025,
- "mask_quarter_tail": 0.025,
- "mask_quarter_head_tail": 0.05,
- "mask_image_random": 0.025,
- "mask_image_head": 0.025,
- "mask_image_tail": 0.025,
- "mask_image_head_tail": 0.05,
-}
-
-# Define acceleration
-num_workers = 8
-num_bucket_build_workers = 16
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="STDiT2-XL/2",
- from_pretrained=None,
- input_sq_size=512, # pretrained model is trained on 512x512
- qk_norm=True,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=4,
- local_files_only=True,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=200,
- shardformer=True,
- local_files_only=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 500
-load = None
-
-batch_size = None
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/stage2.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/stage2.py
deleted file mode 100644
index fb7e6d57318622ae1cfc964158235ca2eef1842c..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/stage2.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VariableVideoTextDataset",
- data_path=None,
- num_frames=None,
- frame_interval=3,
- image_size=(None, None),
- transform_name="resize_crop",
-)
-bucket_config = { # 7s/it
- "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 1)},
- "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)},
- "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)},
- "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 2), 64: (0.2, 1), 128: (0.0, None)},
- "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)},
- "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)},
- "1024": {1: (0.3, 20)},
- "1080p": {1: (0.4, 8)},
-}
-mask_ratios = {
- "mask_no": 0.75,
- "mask_quarter_random": 0.025,
- "mask_quarter_head": 0.025,
- "mask_quarter_tail": 0.025,
- "mask_quarter_head_tail": 0.05,
- "mask_image_random": 0.025,
- "mask_image_head": 0.025,
- "mask_image_tail": 0.025,
- "mask_image_head_tail": 0.05,
-}
-
-# Define acceleration
-num_workers = 8
-num_bucket_build_workers = 16
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="STDiT2-XL/2",
- from_pretrained=None,
- input_sq_size=512, # pretrained model is trained on 512x512
- qk_norm=True,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=4,
- local_files_only=True,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=200,
- shardformer=True,
- local_files_only=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 500
-load = None
-
-batch_size = None
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/stage3.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/stage3.py
deleted file mode 100644
index 84857621330e4b3deefca8cef7df557c6d95673f..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/stage3.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VariableVideoTextDataset",
- data_path=None,
- num_frames=None,
- frame_interval=3,
- image_size=(None, None),
- transform_name="resize_crop",
-)
-bucket_config = { # 13s/it
- "144p": {1: (1.0, 200), 16: (1.0, 36), 32: (1.0, 18), 64: (1.0, 9), 128: (1.0, 4)},
- "256": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 11), 64: (0.5, 6), 128: (0.8, 4)},
- "240p": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 10), 64: (0.5, 6), 128: (0.5, 3)},
- "360p": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.5, 1)},
- "512": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.8, 1)},
- "480p": {1: (0.4, 80), 16: (0.6, 6), 32: (0.6, 3), 64: (0.6, 1), 128: (0.0, None)},
- "720p": {1: (0.4, 40), 16: (0.6, 3), 32: (0.6, 1), 96: (0.0, None)},
- "1024": {1: (0.3, 40)},
-}
-mask_ratios = {
- "mask_no": 0.75,
- "mask_quarter_random": 0.025,
- "mask_quarter_head": 0.025,
- "mask_quarter_tail": 0.025,
- "mask_quarter_head_tail": 0.05,
- "mask_image_random": 0.025,
- "mask_image_head": 0.025,
- "mask_image_tail": 0.025,
- "mask_image_head_tail": 0.05,
-}
-
-# Define acceleration
-num_workers = 8
-num_bucket_build_workers = 16
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="STDiT2-XL/2",
- from_pretrained=None,
- input_sq_size=512, # pretrained model is trained on 512x512
- qk_norm=True,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=4,
- local_files_only=True,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=200,
- shardformer=True,
- local_files_only=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 500
-load = None
-
-batch_size = None
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/video.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/video.py
deleted file mode 100644
index ef574f2ecdd911ed9dd441ff1a2e372711034baf..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora-v1-1/train/video.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VariableVideoTextDataset",
- data_path=None,
- num_frames=None,
- frame_interval=3,
- image_size=(None, None),
- transform_name="resize_crop",
-)
-bucket_config = { # 6s/it
- "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
- "256": {1: (1.0, 256)},
- "512": {1: (0.5, 80)},
- "480p": {1: (0.4, 52), 16: (0.4, 4), 32: (0.0, None)},
- "720p": {16: (0.1, 2), 32: (0.0, None)}, # No examples now
- "1024": {1: (0.3, 20)},
- "1080p": {1: (0.3, 8)},
-}
-
-# Define acceleration
-num_workers = 4
-num_bucket_build_workers = 16
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="STDiT2-XL/2",
- from_pretrained=None,
- input_sq_size=512, # pretrained model is trained on 512x512
- qk_norm=True,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=4,
- local_files_only=True,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=200,
- shardformer=True,
- local_files_only=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 500
-load = None
-
-batch_size = 10 # only for logging
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/inference/16x256x256.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/inference/16x256x256.py
deleted file mode 100644
index 50ead832a61c481632a821b330341505776a384e..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/inference/16x256x256.py
+++ /dev/null
@@ -1,39 +0,0 @@
-num_frames = 16
-fps = 24 // 3
-image_size = (256, 256)
-
-# Define model
-model = dict(
- type="STDiT-XL/2",
- space_scale=0.5,
- time_scale=1.0,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
- from_pretrained="PRETRAINED_MODEL",
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=4,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
-)
-scheduler = dict(
- type="iddpm",
- num_sampling_steps=100,
- cfg_scale=7.0,
- cfg_channel=3, # or None
-)
-dtype = "bf16"
-
-# Condition
-prompt_path = "./assets/texts/t2v_samples.txt"
-prompt = None # prompt has higher priority than prompt_path
-
-# Others
-batch_size = 1
-seed = 42
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/inference/16x512x512.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/inference/16x512x512.py
deleted file mode 100644
index 58d82437b762fbf0935eb97e86665f4eda5329cb..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/inference/16x512x512.py
+++ /dev/null
@@ -1,35 +0,0 @@
-num_frames = 16
-fps = 24 // 3
-image_size = (512, 512)
-
-# Define model
-model = dict(
- type="STDiT-XL/2",
- space_scale=1.0,
- time_scale=1.0,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
- from_pretrained="PRETRAINED_MODEL",
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=2,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
-)
-scheduler = dict(
- type="iddpm",
- num_sampling_steps=100,
- cfg_scale=7.0,
-)
-dtype = "bf16"
-
-# Others
-batch_size = 2
-seed = 42
-prompt_path = "./assets/texts/t2v_samples.txt"
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/inference/64x512x512.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/inference/64x512x512.py
deleted file mode 100644
index dbbe2409823dbdb7e8628f705e64e9847172ddf2..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/inference/64x512x512.py
+++ /dev/null
@@ -1,35 +0,0 @@
-num_frames = 64
-fps = 24 // 2
-image_size = (512, 512)
-
-# Define model
-model = dict(
- type="STDiT-XL/2",
- space_scale=1.0,
- time_scale=2 / 3,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
- from_pretrained="PRETRAINED_MODEL",
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=128,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
-)
-scheduler = dict(
- type="iddpm",
- num_sampling_steps=100,
- cfg_scale=7.0,
-)
-dtype = "bf16"
-
-# Others
-batch_size = 1
-seed = 42
-prompt_path = "./assets/texts/t2v_samples.txt"
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/16x256x256-mask.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/16x256x256-mask.py
deleted file mode 100644
index 0e478e5fd7c728fcbfa7791e6153dac789c7999f..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/16x256x256-mask.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VideoTextDataset",
- data_path=None,
- num_frames=16,
- frame_interval=3,
- image_size=(256, 256),
-)
-
-# Define acceleration
-num_workers = 4
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="STDiT-XL/2",
- space_scale=0.5,
- time_scale=1.0,
- from_pretrained="PixArt-XL-2-512x512.pth",
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-mask_ratios = {
- "mask_no": 0.7,
- "mask_random": 0.15,
- "mask_head": 0.05,
- "mask_tail": 0.05,
- "mask_head_tail": 0.05,
-}
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
- shardformer=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 1000
-load = None
-
-batch_size = 8
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/16x256x256-spee.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/16x256x256-spee.py
deleted file mode 100644
index b46c749dcbd009d02428183b73ede6b6bdb96737..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/16x256x256-spee.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VideoTextDataset",
- data_path=None,
- num_frames=16,
- frame_interval=3,
- image_size=(256, 256),
-)
-
-# Define acceleration
-num_workers = 4
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="STDiT-XL/2",
- space_scale=0.5,
- time_scale=1.0,
- from_pretrained="PixArt-XL-2-512x512.pth",
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-mask_ratios = {
- "mask_no": 0.5,
- "mask_random": 0.29,
- "mask_head": 0.07,
- "mask_tail": 0.07,
- "mask_head_tail": 0.07,
-}
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
- shardformer=True,
-)
-scheduler = dict(
- type="iddpm-speed",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 1000
-load = None
-
-batch_size = 8
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/16x256x256.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/16x256x256.py
deleted file mode 100644
index 144adee4d5bccc2a4ee1cda603ee8a6abf9c99c8..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/16x256x256.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VideoTextDataset",
- data_path=None,
- num_frames=16,
- frame_interval=3,
- image_size=(256, 256),
-)
-
-# Define acceleration
-num_workers = 4
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="STDiT-XL/2",
- space_scale=0.5,
- time_scale=1.0,
- from_pretrained="PixArt-XL-2-512x512.pth",
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
- shardformer=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 1000
-load = None
-
-batch_size = 8
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/16x512x512.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/16x512x512.py
deleted file mode 100644
index a2a87b15d08243cc9993b378735f07bf616cc5e8..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/16x512x512.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VideoTextDataset",
- data_path=None,
- num_frames=16,
- frame_interval=3,
- image_size=(512, 512),
-)
-
-# Define acceleration
-num_workers = 4
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="STDiT-XL/2",
- space_scale=1.0,
- time_scale=1.0,
- from_pretrained=None,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=128,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
- shardformer=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 500
-load = None
-
-batch_size = 8
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/360x512x512.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/360x512x512.py
deleted file mode 100644
index 7cb4f561475f1125d04691938de566c7356ddd5a..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/360x512x512.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VideoTextDataset",
- data_path=None,
- num_frames=360,
- frame_interval=3,
- image_size=(512, 512),
-)
-
-# Define acceleration
-num_workers = 4
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define acceleration
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2-seq"
-sp_size = 2
-
-# Define model
-model = dict(
- type="STDiT-XL/2",
- space_scale=1.0,
- time_scale=2 / 3,
- from_pretrained=None,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-    enable_sequence_parallelism=True,  # enable sequence parallelism here
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=128,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
- shardformer=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 250
-load = None
-
-batch_size = 1
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/64x512x512-sp.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/64x512x512-sp.py
deleted file mode 100644
index 880adb25b83a9ed80ee0dffcdb4b5380c0dab558..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/64x512x512-sp.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VideoTextDataset",
- data_path=None,
- num_frames=16,
- frame_interval=3,
- image_size=(512, 512),
-)
-
-# Define acceleration
-num_workers = 4
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 2
-
-# Define model
-model = dict(
- type="STDiT-XL/2",
- space_scale=1.0,
- time_scale=2 / 3,
- from_pretrained=None,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-    enable_sequence_parallelism=True,  # enable sequence parallelism here
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
- shardformer=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 1000
-load = None
-
-batch_size = 1
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/64x512x512.py b/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/64x512x512.py
deleted file mode 100644
index b65a41955f81f583c0b371e983afd4184508182b..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/opensora/train/64x512x512.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VideoTextDataset",
- data_path=None,
- num_frames=64,
- frame_interval=3,
- image_size=(512, 512),
-)
-
-# Define acceleration
-num_workers = 4
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="STDiT-XL/2",
- space_scale=1.0,
- time_scale=2 / 3,
- from_pretrained=None,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=64,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
- shardformer=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 250
-load = None
-
-batch_size = 4
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/inference/16x256x256.py b/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/inference/16x256x256.py
deleted file mode 100644
index 5013c08739f54e174ab9394353f6055cca409e96..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/inference/16x256x256.py
+++ /dev/null
@@ -1,32 +0,0 @@
-num_frames = 16
-fps = 8
-image_size = (256, 256)
-
-# Define model
-model = dict(
- type="PixArt-XL/2",
- space_scale=0.5,
- time_scale=1.0,
- from_pretrained="outputs/098-F16S3-PixArt-XL-2/epoch7-global_step30000/model_ckpt.pt",
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
-)
-scheduler = dict(
- type="dpm-solver",
- num_sampling_steps=20,
- cfg_scale=7.0,
-)
-dtype = "bf16"
-
-# Others
-batch_size = 2
-seed = 42
-prompt_path = "./assets/texts/t2v_samples.txt"
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/inference/1x1024MS.py b/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/inference/1x1024MS.py
deleted file mode 100644
index e6af8c6773b2dde38be7203a98bfa2f59cde8901..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/inference/1x1024MS.py
+++ /dev/null
@@ -1,34 +0,0 @@
-num_frames = 1
-fps = 1
-image_size = (1920, 512)
-multi_resolution = "PixArtMS"
-
-# Define model
-model = dict(
- type="PixArtMS-XL/2",
- space_scale=2.0,
- time_scale=1.0,
- no_temporal_pos_emb=True,
- from_pretrained="PixArt-XL-2-1024-MS.pth",
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
-)
-scheduler = dict(
- type="dpm-solver",
- num_sampling_steps=20,
- cfg_scale=7.0,
-)
-dtype = "bf16"
-
-# Others
-batch_size = 2
-seed = 42
-prompt_path = "./assets/texts/t2i_samples.txt"
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/inference/1x256x256.py b/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/inference/1x256x256.py
deleted file mode 100644
index 16f92602b6fab414726aad3a2cd3b79b0ee5abed..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/inference/1x256x256.py
+++ /dev/null
@@ -1,33 +0,0 @@
-num_frames = 1
-fps = 1
-image_size = (256, 256)
-
-# Define model
-model = dict(
- type="PixArt-XL/2",
- space_scale=1.0,
- time_scale=1.0,
- no_temporal_pos_emb=True,
- from_pretrained="PixArt-XL-2-256x256.pth",
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
-)
-scheduler = dict(
- type="dpm-solver",
- num_sampling_steps=20,
- cfg_scale=7.0,
-)
-dtype = "bf16"
-
-# Others
-batch_size = 2
-seed = 42
-prompt_path = "./assets/texts/t2i_samples.txt"
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/inference/1x512x512.py b/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/inference/1x512x512.py
deleted file mode 100644
index dbc90df5f51bff532b3309cb9f7140b267a00945..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/inference/1x512x512.py
+++ /dev/null
@@ -1,39 +0,0 @@
-num_frames = 1
-fps = 1
-image_size = (512, 512)
-
-# Define model
-model = dict(
- type="PixArt-XL/2",
- space_scale=1.0,
- time_scale=1.0,
- no_temporal_pos_emb=True,
- from_pretrained="PixArt-XL-2-512x512.pth",
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
-)
-scheduler = dict(
- type="dpm-solver",
- num_sampling_steps=20,
- cfg_scale=7.0,
-)
-dtype = "bf16"
-
-# prompt_path = "./assets/texts/t2i_samples.txt"
-prompt = [
- "Pirate ship trapped in a cosmic maelstrom nebula.",
- "A small cactus with a happy face in the Sahara desert.",
- "A small cactus with a sad face in the Sahara desert.",
-]
-
-# Others
-batch_size = 2
-seed = 42
-save_dir = "./samples/samples/"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/train/16x256x256.py b/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/train/16x256x256.py
deleted file mode 100644
index 701b9da32edc688a5d54cafa64fd76f233236157..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/train/16x256x256.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VideoTextDataset",
- data_path=None,
- num_frames=16,
- frame_interval=3,
- image_size=(256, 256),
-)
-
-# Define acceleration
-num_workers = 4
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="PixArt-XL/2",
- space_scale=0.5,
- time_scale=1.0,
- from_pretrained="PixArt-XL-2-512x512.pth",
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
- shardformer=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 1000
-load = None
-
-batch_size = 8
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/train/1x512x512.py b/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/train/1x512x512.py
deleted file mode 100644
index 8b2b5d4aa0a50ef25a49d693e865dfd8e5644810..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/train/1x512x512.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VideoTextDataset",
- data_path=None,
- num_frames=1,
- frame_interval=3,
- image_size=(512, 512),
-)
-
-# Define acceleration
-num_workers = 4
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-# Define model
-model = dict(
- type="PixArt-XL/2",
- space_scale=1.0,
- time_scale=1.0,
- no_temporal_pos_emb=True,
- from_pretrained="PixArt-XL-2-512x512.pth",
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
- shardformer=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 1000
-load = None
-
-batch_size = 32
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/train/64x512x512.py b/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/train/64x512x512.py
deleted file mode 100644
index 3448ce7c3a88da4d061336df8836ee94ae8b1116..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/configs/pixart/train/64x512x512.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Define dataset
-dataset = dict(
- type="VideoTextDataset",
- data_path=None,
- num_frames=64,
- frame_interval=3,
- image_size=(256, 256),
-)
-
-# Define acceleration
-num_workers = 4
-dtype = "bf16"
-grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
-
-
-# Define model
-model = dict(
- type="PixArt-XL/2",
- space_scale=1.0,
- time_scale=2 / 3,
- from_pretrained=None,
- enable_flashattn=True,
- enable_layernorm_kernel=True,
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=128,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
- shardformer=True,
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="",
-)
-
-# Others
-seed = 42
-outputs = "outputs"
-wandb = False
-
-epochs = 1000
-log_every = 10
-ckpt_every = 250
-load = None
-
-batch_size = 4
-lr = 2e-5
-grad_clip = 1.0
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/acceleration.md b/PyTorch/built-in/mm/OpenSora1.1/docs/acceleration.md
deleted file mode 100644
index 6a9842739f7caa71bb02b134146072c270981169..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/acceleration.md
+++ /dev/null
@@ -1,57 +0,0 @@
-# Acceleration
-
-Open-Sora aims to provide a high-speed training framework for diffusion models. We achieve a **55%** training speed-up when training on **64-frame 512x512 videos**, and our framework supports training **1min 1080p videos**.
-
-## Accelerated Transformer
-
-Open-Sora boosts the training speed by:
-
-- Kernel optimizations, including [flash attention](https://github.com/Dao-AILab/flash-attention), a fused layernorm kernel, and kernels compiled by ColossalAI.
-- Hybrid parallelism, including ZeRO.
-- Gradient checkpointing for larger batch sizes (see the config excerpt below).
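-
-These switches appear directly in the training configs shipped in this repo; a minimal excerpt (as in the Open-Sora `16x256x256` training config) looks like:
-
-```python
-# Acceleration-related fields as they appear in the training configs
-dtype = "bf16"          # mixed-precision training
-grad_checkpoint = True  # gradient checkpointing for larger batch sizes
-plugin = "zero2"        # ZeRO stage-2 via ColossalAI
-model = dict(
-    type="STDiT-XL/2",
-    enable_flashattn=True,           # flash attention kernel
-    enable_layernorm_kernel=True,    # fused layernorm kernel
-)
-```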
-
-Our training speed on images is comparable to [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT), a project to accelerate DiT training. The training speed is measured on 8 H800 GPUs with batch size 128, image size 256x256.
-
-| Model | Throughput (img/s/GPU) | Throughput (tokens/s/GPU) |
-| -------- | ---------------------- | ------------------------- |
-| DiT | 100 | 26k |
-| OpenDiT | 175 | 45k |
-| OpenSora | 175 | 45k |
-
-## Efficient STDiT
-
-Our STDiT adopts spatial-temporal attention to model video data. Compared with directly applying full attention as in DiT, our STDiT becomes more efficient as the number of frames increases. Our current framework only supports sequence parallelism for very long sequences.
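-
-For such long sequences, the sequence-parallel training config in this repo (`configs/opensora/train/360x512x512.py`) turns on the relevant fields; a minimal excerpt:
-
-```python
-# Sequence-parallel settings as used in configs/opensora/train/360x512x512.py
-plugin = "zero2-seq"  # ZeRO-2 combined with sequence parallelism
-sp_size = 2           # sequence parallel group size
-model = dict(
-    type="STDiT-XL/2",
-    enable_sequence_parallelism=True,
-)
-```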
-
-The training speed is measured on 8 H800 GPUs with the acceleration techniques applied; GC means gradient checkpointing. Both models use T5 conditioning as in PixArt.
-
-| Model | Setting | Throughput (sample/s/GPU) | Throughput (tokens/s/GPU) |
-| ---------------- | -------------- | ------------------------- | ------------------------- |
-| DiT | 16x256 (4k) | 7.20 | 29k |
-| STDiT | 16x256 (4k) | 7.00 | 28k |
-| DiT | 16x512 (16k) | 0.85 | 14k |
-| STDiT | 16x512 (16k) | 1.45 | 23k |
-| DiT (GC) | 64x512 (65k) | 0.08 | 5k |
-| STDiT (GC) | 64x512 (65k) | 0.40 | 25k |
-| STDiT (GC, sp=2) | 360x512 (370k) | 0.10 | 18k |
-
-With 4x temporal downsampling by the Video-VAE, a 24fps video has 450 frames. The gap between the speed of STDiT (28k tokens/s) and DiT on images (up to 45k tokens/s) mainly comes from T5 and VAE encoding, and from temporal attention.
-
-## Accelerated Encoder (T5, VAE)
-
-During training, texts are encoded by T5, and videos are encoded by VAE. Typically there are two ways to accelerate the training:
-
-1. Preprocess text and video data in advance and save them to disk.
-2. Encode text and video data during training, and accelerate the encoding process.
-
-For option 1, the 120 text tokens of one sample require about 1 MB of disk space, and a 64x64x64 latent requires about 4 MB. For a training dataset of 10M video clips, this amounts to roughly 10M x 5 MB ≈ 50 TB of disk space. Our storage system is not yet ready for data at this scale.
-
-For option 2, we boost T5's speed and reduce its memory requirement following [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT). We also find that the VAE consumes a large amount of GPU memory, so we split the batch into smaller micro-batches for VAE encoding. With both techniques, we can greatly accelerate the training speed.
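-
-A rough sketch of the micro-batching idea (not the exact Open-Sora implementation; `vae`, `videos`, and `vae.encode` are placeholders for whatever the VAE wrapper exposes, and the default chunk size of 4 mirrors `micro_batch_size=4` in the configs):
-
-```python
-import torch
-
-def encode_in_micro_batches(vae, videos: torch.Tensor, micro_batch_size: int = 4) -> torch.Tensor:
-    """Encode a batch of videos with the VAE in small chunks to cap peak GPU memory."""
-    latents = []
-    with torch.no_grad():
-        for start in range(0, videos.shape[0], micro_batch_size):
-            # Encode one chunk at a time instead of the whole batch at once.
-            latents.append(vae.encode(videos[start : start + micro_batch_size]))
-    return torch.cat(latents, dim=0)
-```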
-
-The training speed is measured on 8 H800 GPUs with STDiT.
-
-| Acceleration | Setting | Throughput (img/s/GPU) | Throughput (tokens/s/GPU) |
-| ------------ | ------------- | ---------------------- | ------------------------- |
-| Baseline | 16x256 (4k) | 6.16 | 25k |
-| w. faster T5 | 16x256 (4k) | 7.00 | 29k |
-| Baseline | 64x512 (65k) | 0.94 | 15k |
-| w. both | 64x512 (65k) | 1.45 | 23k |
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/commands.md b/PyTorch/built-in/mm/OpenSora1.1/docs/commands.md
deleted file mode 100644
index 2d7420f5f8ec31ca284c35b83adec38adece1499..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/commands.md
+++ /dev/null
@@ -1,212 +0,0 @@
-# Commands
-
-- [Inference](#inference)
- - [Inference with Open-Sora 1.1](#inference-with-open-sora-11)
- - [Inference with DiT pretrained on ImageNet](#inference-with-dit-pretrained-on-imagenet)
- - [Inference with Latte pretrained on UCF101](#inference-with-latte-pretrained-on-ucf101)
- - [Inference with PixArt-α pretrained weights](#inference-with-pixart-α-pretrained-weights)
- - [Inference with checkpoints saved during training](#inference-with-checkpoints-saved-during-training)
- - [Inference Hyperparameters](#inference-hyperparameters)
-- [Training](#training)
- - [Training Hyperparameters](#training-hyperparameters)
-- [Search batch size for buckets](#search-batch-size-for-buckets)
-
-## Inference
-
-You can modify corresponding config files to change the inference settings. See more details [here](/docs/structure.md#inference-config-demos).
-
-### Inference with Open-Sora 1.1
-
-Since Open-Sora 1.1 supports inference with dynamic input size, you can pass the input size as an argument.
-
-```bash
-# image sampling with prompt path
-python scripts/inference.py configs/opensora-v1-1/inference/sample.py \
- --ckpt-path CKPT_PATH --prompt-path assets/texts/t2i_samples.txt --num-frames 1 --image-size 1024 1024
-
-# image sampling with prompt
-python scripts/inference.py configs/opensora-v1-1/inference/sample.py \
- --ckpt-path CKPT_PATH --prompt "A beautiful sunset over the city" --num-frames 1 --image-size 1024 1024
-
-# video sampling
-python scripts/inference.py configs/opensora-v1-1/inference/sample.py \
- --ckpt-path CKPT_PATH --prompt "A beautiful sunset over the city" --num-frames 16 --image-size 480 854
-```
-
-You can adjust `--num-frames` and `--image-size` to generate different results. We recommend using the same image size as the training resolution, which is defined in [aspect.py](/opensora/datasets/aspect.py). Some examples are shown below.
-
-- 240p
- - 16:9 240x426
- - 3:4 276x368
- - 1:1 320x320
-- 480p
- - 16:9 480x854
- - 3:4 554x738
- - 1:1 640x640
-- 720p
- - 16:9 720x1280
- - 3:4 832x1110
- - 1:1 960x960
-
-`inference-long.py` is compatible with `inference.py` and supports advanced features.
-
-```bash
-# image condition
-python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \
- --num-frames 32 --image-size 240 426 --sample-name image-cond \
- --prompt 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/wave.png","mask_strategy": "0"}'
-
-# video extending
-python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \
- --num-frames 32 --image-size 240 426 --sample-name image-cond \
- --prompt 'A car driving on the ocean.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4","mask_strategy": "0,0,0,-8,8"}'
-
-# long video generation
-python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \
- --num-frames 32 --image-size 240 426 --loop 16 --condition-frame-length 8 --sample-name long \
- --prompt '|0|a white jeep equipped with a roof rack driving on a dirt road in a coniferous forest.|2|a white jeep equipped with a roof rack driving on a dirt road in the desert.|4|a white jeep equipped with a roof rack driving on a dirt road in a mountain.|6|A white jeep equipped with a roof rack driving on a dirt road in a city.|8|a white jeep equipped with a roof rack driving on a dirt road on the surface of a river.|10|a white jeep equipped with a roof rack driving on a dirt road under the lake.|12|a white jeep equipped with a roof rack flying into the sky.|14|a white jeep equipped with a roof rack driving in the universe. Earth is the background.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,16"}'
-
-# video connecting
-python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \
- --num-frames 32 --image-size 240 426 --sample-name connect \
- --prompt 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png;assets/images/condition/sunset2.png","mask_strategy": "0;0,1,0,-1,1"}'
-
-# video editing
-python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \
- --num-frames 32 --image-size 480 853 --sample-name edit \
- --prompt 'A cyberpunk-style city at night.{"reference_path": "https://cdn.pixabay.com/video/2021/10/12/91744-636709154_large.mp4","mask_strategy": "0,0,0,0,32,0.4"}'
-```
-
-### Inference with DiT pretrained on ImageNet
-
-The following command automatically downloads the pretrained weights on ImageNet and runs inference.
-
-```bash
-python scripts/inference.py configs/dit/inference/1x256x256-class.py --ckpt-path DiT-XL-2-256x256.pt
-```
-
-### Inference with Latte pretrained on UCF101
-
-The following command automatically downloads the pretrained weights on UCF101 and runs inference.
-
-```bash
-python scripts/inference.py configs/latte/inference/16x256x256-class.py --ckpt-path Latte-XL-2-256x256-ucf101.pt
-```
-
-### Inference with PixArt-α pretrained weights
-
-Download T5 into `./pretrained_models` and run the following command.
-
-```bash
-# 256x256
-torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x256x256.py --ckpt-path PixArt-XL-2-256x256.pth
-
-# 512x512
-torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x512x512.py --ckpt-path PixArt-XL-2-512x512.pth
-
-# 1024 multi-scale
-torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x1024MS.py --ckpt-path PixArt-XL-2-1024MS.pth
-```
-
-### Inference with checkpoints saved during training
-
-During training, an experiment logging folder is created in the `outputs` directory. Under each checkpoint folder, e.g. `epoch12-global_step2000`, there is an `ema.pt` file and the shared `model` folder. Run the following command to perform inference.
-
-```bash
-# inference with ema model
-torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000/ema.pt
-
-# inference with model
-torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000
-
-# inference with sequence parallelism
-# sequence parallelism is enabled automatically when nproc_per_node is larger than 1
-torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000
-```
-
-The second command will automatically generate a `model_ckpt.pt` file in the checkpoint folder.
-
-### Inference Hyperparameters
-
-1. DPM-solver is good at fast inference for images. However, the video results are not satisfactory. You can use it for fast demo purposes.
-
-```python
-type="dmp-solver"
-num_sampling_steps=20
-```
-
-2. You can use [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)'s finetuned VAE decoder on videos for inference (consumes more memory). However, we do not see significant improvement in the video result. To use it, download [the pretrained weights](https://huggingface.co/maxin-cn/Latte/tree/main/t2v_required_models/vae_temporal_decoder) into `./pretrained_models/vae_temporal_decoder` and modify the config file as follows.
-
-```python
-vae = dict(
- type="VideoAutoencoderKLTemporalDecoder",
- from_pretrained="pretrained_models/vae_temporal_decoder",
-)
-```
-
-## Training
-
-To resume training, run the following command. ``--load`` differs from ``--ckpt-path`` in that it also loads the optimizer and dataloader states.
-
-```bash
-torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --load YOUR_PRETRAINED_CKPT
-```
-
-To enable wandb logging, add `--wandb` to the command.
-
-```bash
-WANDB_API_KEY=YOUR_WANDB_API_KEY torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --wandb True
-```
-
-You can modify corresponding config files to change the training settings. See more details [here](/docs/structure.md#training-config-demos).
-
-### Training Hyperparameters
-
-1. `dtype` is the data type for training. Only `fp16` and `bf16` are supported. ColossalAI automatically enables the mixed precision training for `fp16` and `bf16`. During training, we find `bf16` more stable.
-
-## Search batch size for buckets
-
-To search the batch size for buckets, run the following command.
-
-```bash
-torchrun --standalone --nproc_per_node 1 scripts/search_bs.py configs/opensora-v1-1/train/benchmark.py --data-path YOUR_CSV_PATH -o YOUR_OUTPUT_CONFIG_PATH --base-resolution 240p --base-frames 128 --batch-size-start 2 --batch-size-end 256 --batch-size-step 2
-```
-
-If your dataset is extremely large, you can extract a subset of the dataset for the search.
-
-```bash
-# each bucket contains 1000 samples
-python tools/datasets/split.py YOUR_CSV_PATH -o YOUR_SUBSET_CSV_PATH -c configs/opensora-v1-1/train/video.py -l 1000
-```
-
-If you want to control the batch size search more granularly, you can configure batch size start, end, and step in the config file.
-
-Bucket config format:
-
-1. `{ resolution: {num_frames: (prob, batch_size)} }`, in this case batch_size is ignored when searching
-2. `{ resolution: {num_frames: (prob, (max_batch_size, ))} }`, batch_size is searched in the range `[batch_size_start, max_batch_size)`, batch_size_start is configured via CLI
-3. `{ resolution: {num_frames: (prob, (min_batch_size, max_batch_size))} }`, batch_size is searched in the range `[min_batch_size, max_batch_size)`
-4. `{ resolution: {num_frames: (prob, (min_batch_size, max_batch_size, step_size))} }`, batch_size is searched in the range `[min_batch_size, max_batch_size)` with step_size (grid search)
-5. `{ resolution: {num_frames: (0.0, None)} }`, this bucket will not be used
-
-Here is an example of the bucket config:
-
-```python
-bucket_config = {
-
- "240p": {
- 16: (1.0, (2, 32)),
- 32: (1.0, (2, 16)),
- 64: (1.0, (2, 8)),
- 128: (1.0, (2, 6)),
- },
- "256": {1: (1.0, (128, 300))},
- "512": {1: (0.5, (64, 128))},
- "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
- "720p": {16: (0.1, (2, 16)), 32: (0.0, None)}, # No examples now
- "1024": {1: (0.3, (8, 64))},
- "1080p": {1: (0.3, (2, 32))},
-}
-```
-
-It will print the best batch size (and corresponding step time) for each bucket and save the output config file.
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/config.md b/PyTorch/built-in/mm/OpenSora1.1/docs/config.md
deleted file mode 100644
index 5cecc73ccbc2124730a5f25ee174753a4bf16202..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/config.md
+++ /dev/null
@@ -1,320 +0,0 @@
-# Config Guide
-
-- [Inference Config](#inference-config)
-- [Advanced Inference config](#advanced-inference-config)
-- [Inference Args](#inference-args)
-- [Training Config](#training-config)
-- [Training Args](#training-args)
-- [Training Bucket Configs](#training-bucket-configs)
-
-Our config files follow [MMEngine](https://github.com/open-mmlab/mmengine). MMEngine reads the config file (a `.py` file) and parses it into a dictionary-like object. We expose some fields in the config file as command line arguments (defined in [opensora/utils/config_utils.py](/opensora/utils/config_utils.py)). To change the inference settings, you can directly modify the corresponding config file, or pass arguments to overwrite it.
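-
-For reference, here is a minimal sketch of how such a `.py` config can be loaded and overridden programmatically with MMEngine (the file path and the overridden values are illustrative):
-
-```python
-from mmengine.config import Config
-
-# Parse the .py config into a dictionary-like object.
-cfg = Config.fromfile("configs/opensora-v1-1/inference/sample.py")
-
-# Fields can be overridden before use, mirroring what the CLI arguments do.
-cfg.num_frames = 16
-cfg.image_size = (480, 854)
-print(cfg.model["type"], cfg.num_frames)
-```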
-
-## Inference Config
-
-The explanation of each field is provided below.
-
-```python
-# Define sampling size
-num_frames = 64 # number of frames, 1 means image
-fps = 24 # frames per second (condition for generation)
-frame_interval = 3 # output video will have fps/frame_interval frames per second
-image_size = (240, 426) # image size (height, width)
-
-# Define model
-model = dict(
- type="STDiT2-XL/2", # Select model type (STDiT-XL/2, DiT-XL/2, etc.)
- from_pretrained="PRETRAINED_MODEL", # (Optional) Load from pretrained model
- input_sq_size=512, # Base spatial position embedding size
- qk_norm=True, # Normalize query and key in attention
- enable_flashattn=True, # (Optional) Speed up training and inference with flash attention
- # Turn enable_flashattn to False if you skip flashattn installation
- enable_layernorm_kernel=True, # (Optional) Speed up training and inference with fused kernel
- # Turn enable_layernorm_kernel to False if you skip apex installation
-)
-vae = dict(
- type="VideoAutoencoderKL", # Select VAE type
- from_pretrained="stabilityai/sd-vae-ft-ema", # Load from pretrained VAE
- micro_batch_size=4, # VAE with micro batch size to save memory
-)
-text_encoder = dict(
- type="t5", # Select text encoder type (t5, clip)
- from_pretrained="DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder
- model_max_length=200, # Maximum length of input text
-)
-scheduler = dict(
- type="iddpm", # Select scheduler type (iddpm, dpm-solver)
- num_sampling_steps=100, # Number of sampling steps
- cfg_scale=7.0, # hyper-parameter for classifier-free diffusion
- cfg_channel=3, # how many channels to use for classifier-free diffusion, if None, use all channels
-)
-dtype = "bf16" # Computation type (fp16, fp32, bf16)
-
-# Condition
-prompt_path = "./assets/texts/t2v_samples.txt" # path to prompt file
-prompt = None # prompt has higher priority than prompt_path
-
-# Other settings
-batch_size = 1 # batch size
-seed = 42 # random seed
-save_dir = "./samples" # path to save samples
-```
-
-## Advanced Inference config
-
-The [`inference-long.py`](/scripts/inference-long.py) script is used to generate long videos, and it also provides all functions of the [`inference.py`](/scripts/inference.py) script. The following arguments are specific to the `inference-long.py` script.
-
-```python
-loop = 10
-condition_frame_length = 4
-reference_path = [
- "https://cdn.openai.com/tmp/s/interp/d0.mp4",
- None,
- "assets/images/condition/wave.png",
-]
-mask_strategy = [
- "0,0,0,0,8,0.3",
- None,
- "0,0,0,0,1;0,0,0,-1,1",
-]
-```
-
-The following figure provides an illustration of the `mask_strategy`:
-
-
-
-To generate an arbitrarily long video, our strategy is to generate a fixed-length video first, and then use its last `condition_frame_length` frames as the condition for the next generation. This loops `loop` times. Thus, the total length of the video is `loop * (num_frames - condition_frame_length) + condition_frame_length`.
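-
-A quick worked example of this length formula (the values below match the long-video command in [commands.md](/docs/commands.md) and are only illustrative):
-
-```python
-# Total frames generated by looped conditioning.
-num_frames = 32             # frames generated per loop
-condition_frame_length = 8  # frames reused from the previous loop as condition
-loop = 16                   # number of loops
-
-total = loop * (num_frames - condition_frame_length) + condition_frame_length
-print(total)  # 16 * 24 + 8 = 392 frames
-```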
-
-To condition the generation on images or videos, we introduce the `mask_strategy`. It consists of 6-number tuples separated by `;`. Each tuple describes one insertion of a condition image or video into the target generation. The meaning of each number is (see the sketch after this list):
-
-- **First number**: the loop index of the condition image or video. (0 means the first loop, 1 means the second loop, etc.)
-- **Second number**: the index of the condition image or video in the `reference_path`.
-- **Third number**: the start frame of the condition image or video. (0 means the first frame, and images only have one frame)
-- **Fourth number**: the position in the target video at which to insert. (0 means insert at the beginning; negative values count from the end, e.g. -1 means insert at the end of the video)
-- **Fifth number**: the number of frames to insert. (1 means insert one frame, and images only have one frame)
-- **Sixth number**: the edit rate of the condition image or video. (0 means no edit, 1 means full edit).
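-
-As a reading aid, here is a hedged sketch that unpacks such tuples into named fields (the field names and the defaults for omitted trailing numbers are ours, chosen for illustration):
-
-```python
-from dataclasses import dataclass
-
-@dataclass
-class MaskSpec:
-    loop_index: int   # which generation loop the condition applies to
-    ref_index: int    # which entry of reference_path to use
-    ref_start: int    # start frame within the reference
-    target_pos: int   # insertion position in the target (negative counts from the end)
-    length: int       # number of frames to insert
-    edit_rate: float  # 0 = keep the condition as-is, 1 = fully edit
-
-_DEFAULTS = [0, 0, 0, 0, 1, 0]  # assumed defaults for omitted trailing numbers
-
-def parse_mask_strategy(strategy: str) -> list:
-    specs = []
-    for part in strategy.split(";"):
-        nums = [float(x) for x in part.split(",")]
-        nums += _DEFAULTS[len(nums):]  # pad missing fields with the assumed defaults
-        specs.append(MaskSpec(int(nums[0]), int(nums[1]), int(nums[2]),
-                              int(nums[3]), int(nums[4]), float(nums[5])))
-    return specs
-
-print(parse_mask_strategy("0;0,1,0,-1,1"))  # the "video connecting" example from commands.md
-```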
-
-To facilitate usage, we also accept passing the reference path and mask strategy as JSON appended to the prompt. For example:
-
-```plaintext
-'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff\'s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff\'s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.{"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}'
-```
-
-## Inference Args
-
-You can use `python scripts/inference.py --help` to see the following arguments:
-
-- `--seed`: random seed
-- `--ckpt-path`: path to the checkpoint (`model["from_pretrained"]`)
-- `--batch-size`: batch size
-- `--save-dir`: path to save samples
-- `--sample-name`: if None, the sample will be named `sample_{index}.mp4/png`; otherwise, it will be named `{sample_name}_{index}.mp4/png`
-- `--start-index`: start index of the sample
-- `--end-index`: end index of the sample
-- `--num-sample`: number of samples to generate for each prompt. The sample will be suffixed by `-0`, `-1`, `-2`, etc.
-- `--prompt-as-path`: if True, use the prompt as the name for saving samples
-- `--prompt-path`: path to the prompt file
-- `--prompt`: prompt string list
-- `--num-frames`: number of frames
-- `--fps`: frames per second
-- `--image-size`: image size
-- `--num-sampling-steps`: number of sampling steps (`scheduler["num_sampling_steps"]`)
-- `--cfg-scale`: hyper-parameter for classifier-free diffusion (`scheduler["cfg_scale"]`)
-- `--loop`: loop for long video generation
-- `--condition-frame-length`: condition frame length for long video generation
-- `--reference-path`: reference path for long video generation
-- `--mask-strategy`: mask strategy for long video generation
-
-Example commands for inference can be found in [commands.md](/docs/commands.md).
-
-## Training Config
-
-```python
-# Define dataset
-dataset = dict(
- type="VariableVideoTextDataset", # Select dataset type
- # VideoTextDataset for OpenSora 1.0, VariableVideoTextDataset for OpenSora 1.1
- data_path=None, # Path to the dataset
- num_frames=None, # Number of frames, set None since we support dynamic training
- frame_interval=3, # Frame interval
- image_size=(None, None), # Image size, set None since we support dynamic training
- transform_name="resize_crop", # Transform name
-)
-# bucket config usage see next section
-bucket_config = {
- "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 1)},
- "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)},
- "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)},
- "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 2), 64: (0.2, 1), 128: (0.0, None)},
- "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)},
- "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)},
- "1024": {1: (0.3, 20)},
- "1080p": {1: (0.4, 8)},
-}
-# mask ratio in training
-mask_ratios = {
- "mask_no": 0.75, # 75% no mask
- "mask_quarter_random": 0.025, # 2.5% random mask with 1 frame to 1/4 #frames
- "mask_quarter_head": 0.025, # 2.5% mask at the beginning with 1 frame to 1/4 #frames
- "mask_quarter_tail": 0.025, # 2.5% mask at the end with 1 frame to 1/4 #frames
- "mask_quarter_head_tail": 0.05, # 5% mask at the beginning and end with 1 frame to 1/4 #frames
- "mask_image_random": 0.025, # 2.5% random mask with 1 image to 1/4 #images
- "mask_image_head": 0.025, # 2.5% mask at the beginning with 1 image to 1/4 #images
- "mask_image_tail": 0.025, # 2.5% mask at the end with 1 image to 1/4 #images
- "mask_image_head_tail": 0.05, # 5% mask at the beginning and end with 1 image to 1/4 #images
-}
-
-# Define acceleration
-num_workers = 8 # Number of workers for dataloader
-num_bucket_build_workers = 16 # Number of workers for bucket building
-dtype = "bf16" # Computation type (fp16, fp32, bf16)
-grad_checkpoint = True # Use gradient checkpointing
-plugin = "zero2" # Plugin for training
-sp_size = 1 # Sequence parallel size
-
-# Define model
-model = dict(
- type="STDiT2-XL/2", # Select model type (STDiT-XL/2, DiT-XL/2, etc.)
- from_pretrained=None, # Load from pretrained model
- input_sq_size=512, # Base spatial position embedding size
- qk_norm=True, # Normalize query and key in attention
- enable_flashattn=True, # (Optional) Speed up training and inference with flash attention
- enable_layernorm_kernel=True, # (Optional) Speed up training and inference with fused kernel
-)
-vae = dict(
- type="VideoAutoencoderKL", # Select VAE type
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=4, # VAE with micro batch size to save memory
- local_files_only=True, # Load from local files only (first time should be false)
-)
-text_encoder = dict(
- type="t5", # Select text encoder type (t5, clip)
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=200, # Maximum length of input text
- shardformer=True, # Use shardformer
- local_files_only=True, # Load from local files only (first time should be false)
-)
-scheduler = dict(
- type="iddpm", # Select scheduler type (iddpm, iddpm-speed)
- timestep_respacing="",
-)
-
-# Others
-seed = 42 # random seed
-outputs = "outputs" # path to save outputs
-wandb = False # Use wandb or not
-
-epochs = 1000 # Number of epochs (set a large number and kill the process when you want to stop)
-log_every = 10
-ckpt_every = 500
-load = None
-
-batch_size = None
-lr = 2e-5
-grad_clip = 1.0
-```
-
-## Training Args
-
-- `--seed`: random seed
-- `--ckpt-path`: path to the checkpoint (`model["from_pretrained"]`)
-- `--batch-size`: batch size
-- `--wandb`: use wandb or not
-- `--load`: path to the checkpoint to load
-- `--data-path`: path to the dataset (`dataset["data_path"]`)
-
-See [commands.md](/docs/commands.md) for example commands.
-
-## Training Bucket Configs
-
-We support multi-resolution/aspect-ratio/num_frames training with buckets. To enable dynamic training (for STDiT2), use the `VariableVideoTextDataset` dataset and set `bucket_config` in the config. An example is:
-
-```python
-bucket_config = {
- "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
- "256": {1: (1.0, 256)},
- "512": {1: (1.0, 80)},
- "480p": {1: (1.0, 52), 16: (0.5, 4), 32: (0.0, None)},
- "720p": {16: (1.0, 2), 32: (0.0, None)},
- "1024": {1: (1.0, 20)},
- "1080p": {1: (1.0, 8)},
-}
-```
-
-This may look difficult to understand at first glance, so let's go through the config step by step.
-
-### Three-level bucket
-
-
-
-We design a three-level bucket: `(resolution, num_frames, aspect_ratio)`. The resolutions and aspect ratios are predefined in [aspect.py](/opensora/datasets/aspect.py). Commonly used resolutions (e.g., 240p, 1080p) are supported, and each name represents a pixel budget (e.g., 240p nominally means 240x426, but we define 240p to cover any size with HxW of approximately 240x426=102240 pixels). The aspect ratios are defined for each resolution, so you do not need to define them in the `bucket_config`.
-
-`num_frames` is the number of frames in each sample, with `num_frames=1` reserved for images. If `frame_interval` is not 1, a bucket with `num_frames=k` will contain videos with `k*frame_interval` frames (except for images). Only a video with at least `num_frames` frames and at least `resolution` pixels can be put into the bucket.
-
-The two numbers defined in the bucket config are `(keep_prob, batch_size)`. Since the memory usage and speed of samples from different buckets may differ, we use `batch_size` to balance the processing speed. Since our compute is limited, we cannot process all videos at their original resolution as described in OpenAI's Sora report. Thus, we use `keep_prob`, the probability of keeping a sample in the bucket, to control the number of samples per bucket. Let's take the following config as an example:
-
-```python
-bucket_config = {
- "480p": {16: (1.0, 8),},
- "720p": {16: (0.5, 4),},
- "1080p": {16: (0.2, 2)},
- "4K": {16: (0.1, 1)},
-}
-```
-
-Given a 2K video with more than 16 frames, the program first tries to put it into the "1080p" bucket, since its resolution is larger than 1080p but smaller than 4K. Since the `keep_prob` for 1080p is 20%, a random number is generated, and if it is less than 0.2 the video is put into that bucket. Otherwise, the program tries the "720p" bucket, where the video has a 50% chance of being kept. If it is still not placed, the program puts it into the "480p" bucket directly, as that is the smallest resolution.
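-
-A simplified sketch of this cascading assignment is given below (only the resolution level and the keep-probability check are modeled; real bucketing also matches the frame count and aspect ratio, and the pixel counts here are illustrative):
-
-```python
-import random
-
-def assign_resolution_bucket(video_pixels, bucket_config, resolution_pixels):
-    """Try buckets from the largest resolution the video can fill down to the smallest."""
-    candidates = sorted(bucket_config, key=lambda r: resolution_pixels[r], reverse=True)
-    for name in candidates:
-        if video_pixels < resolution_pixels[name]:
-            continue  # the video is too small for this bucket
-        keep_prob = next(iter(bucket_config[name].values()))[0]  # keep_prob of the first num_frames entry
-        if random.random() < keep_prob:  # keep with probability keep_prob, otherwise fall through
-            return name
-    return None  # the video fits no bucket
-
-resolution_pixels = {"480p": 480 * 854, "720p": 720 * 1280, "1080p": 1920 * 1080, "4K": 3840 * 2160}
-bucket_config = {"480p": {16: (1.0, 8)}, "720p": {16: (0.5, 4)}, "1080p": {16: (0.2, 2)}, "4K": {16: (0.1, 1)}}
-print(assign_resolution_bucket(2560 * 1440, bucket_config, resolution_pixels))  # a 2K video
-```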
-
-### Examples
-
-Let's see some simple examples to understand the bucket config. First, the aspect ratio bucket is compulsory; if you want to modify it, you need to add your own resolution definition in [aspect.py](/opensora/datasets/aspect.py). Then, to keep only 256x256 resolution and 16 frames as in OpenSora 1.0, you can use the following config:
-
-```python
-bucket_config = {
- "256": {16: (1.0, 8)},
-}
-```
-
-If you want to train a model supporting different resolutions of images, you can use the following config (example [image.py](/configs/opensora-v1-1/train/image.py)):
-
-```python
-bucket_config = {
- "256": {1: (1.0, 256)},
- "512": {1: (1.0, 80)},
- "480p": {1: (1.0, 52)},
- "1024": {1: (1.0, 20)},
- "1080p": {1: (1.0, 8)},
-}
-```
-
-Or if you find the number of high-resolution images is too large, you can modify the `keep_prob` to reduce the number of samples in the bucket:
-
-```python
-bucket_config = {
- "256": {1: (1.0, 256)},
- "512": {1: (0.8, 80)},
- "480p": {1: (0.5, 52)},
- "1024": {1: (0.5, 20)},
- "1080p": {1: (0.2, 8)},
-}
-```
-
-And similarly for videos (example [video.py](/configs/opensora-v1-1/train/video.py)):
-
-```python
-bucket_config = {
- "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
- "480p": {16: (1.0, 4)},
- "720p": {16: (0.5, 2)},
-}
-```
-
-Note that in the above case, all videos with 480p resolution and more than 16 frames go into bucket `("480p", 16)`, since they all satisfy its requirement. But training long videos at 480p may be slow, so you can modify the config as follows to force videos with more than 32 frames into the 240p buckets.
-
-```python
-bucket_config = {
- "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
- "480p": {16: (1.0, 4), 32: (0.0, None)},
- "720p": {16: (0.5, 2)},
-}
-```
-
-Combining the above examples, you should now be able to understand the bucket config provided at the beginning of this section and in the config files.
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/data_processing.md b/PyTorch/built-in/mm/OpenSora1.1/docs/data_processing.md
deleted file mode 100644
index cff9bb22a51c4de5e51b3d55885f2d42e94faf68..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/data_processing.md
+++ /dev/null
@@ -1,77 +0,0 @@
-# Data Processing
-We establish a complete pipeline for video/image data processing. The pipeline is shown below.
-
-
-
-First, raw videos,
-either from the Internet or public datasets, are split into shorter clips based on scene detection.
-Then, we evaluate these videos by predicting multiple scores using existing models. We first predict the aesthetic score
-and the optical flow score for a video. We also conduct OCR to detect texts in the video. Only videos with satisfactory
-evaluation results are sent to the next step for captioning. After captioning, the matching score is also calculated as
-an assessment of video-text alignment. Finally, we filter samples based on the matching score and
-conduct camera motion detection for the remaining samples.
-In summary, our pipeline produces video-text pairs which have high aesthetic quality, large video motion and strong
-semantic consistency.
-
-Below is an example workflow to process videos.
-
-```bash
-ROOT_VIDEO="/path/to/video/folder"
-ROOT_CLIPS="/path/to/video/clips/folder"
-ROOT_META="/path/to/meta/folder"
-
-# 1.1 Create a meta file from a video folder. This should output ${ROOT_META}/meta.csv
-python -m tools.datasets.convert video ${ROOT_VIDEO} --output ${ROOT_META}/meta.csv
-
-# 1.2 Get video information and remove broken videos. This should output ${ROOT_META}/meta_info_fmin1.csv
-python -m tools.datasets.datautil ${ROOT_META}/meta.csv --info --fmin 1
-
-# 2.1 Detect scenes. This should output ${ROOT_META}/meta_info_fmin1_timestamp.csv
-python -m tools.scene_cut.scene_detect ${ROOT_META}/meta_info_fmin1.csv
-
-# 2.2 Cut video into clips based on scenes. This should produce video clips under ${ROOT_CLIPS}
-python -m tools.scene_cut.cut ${ROOT_META}/meta_info_fmin1_timestamp.csv --save_dir ${ROOT_CLIPS}
-
-# 2.3 Create a meta file for video clips. This should output ${ROOT_META}/meta_clips.csv
-python -m tools.datasets.convert video ${ROOT_CLIPS} --output ${ROOT_META}/meta_clips.csv
-
-# 2.4 Get clips information and remove broken ones. This should output ${ROOT_META}/meta_clips_info_fmin1.csv
-python -m tools.datasets.datautil ${ROOT_META}/meta_clips.csv --info --fmin 1
-
-# 3.1 Predict aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_part*.csv
-torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference \
- ${ROOT_META}/meta_clips_info_fmin1.csv \
- --bs 1024 \
- --num_workers 16
-
-# 3.2 Merge files; This should output ${ROOT_META}/meta_clips_info_fmin1_aes.csv
-python -m tools.datasets.datautil ${ROOT_META}/meta_clips_info_fmin1_aes_part*.csv --output ${ROOT_META}/meta_clips_info_fmin1_aes.csv
-
-# 3.2 Filter by aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5.csv
-python -m tools.datasets.datautil ${ROOT_META}/meta_clips_info_fmin1_aes.csv --aesmin 5
-
-# 4.1 Generate caption. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5_caption_part*.csv
-torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava \
- ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5.csv \
- --dp-size 8 \
- --tp-size 1 \
- --model-path /path/to/llava-v1.6-mistral-7b \
- --prompt video
-
-# 4.2 Merge caption results. This should output ${ROOT_META}/meta_clips_caption.csv
-python -m tools.datasets.datautil ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5_caption_part*.csv --output ${ROOT_META}/meta_clips_caption.csv
-
-# 4.3 Clean caption. This should output ${ROOT_META}/meta_clips_caption_cleaned.csv
-python -m tools.datasets.datautil \
- ${ROOT_META}/meta_clips_caption.csv \
- --clean-caption \
- --refine-llm-caption \
- --remove-empty-caption \
- --output ${ROOT_META}/meta_clips_caption_cleaned.csv
-```
-
-For more information, please refer to:
-- [Dataset Management](../tools/datasets/README.md)
-- [Scene Detection and Video Splitting](../tools/scene_cut/README.md)
-- [Scoring and Filtering](../tools/scoring/README.md)
-- [Captioning](../tools/caption/README.md)
\ No newline at end of file
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/datasets.md b/PyTorch/built-in/mm/OpenSora1.1/docs/datasets.md
deleted file mode 100644
index 2d5e9955092d8ed656405438365825babf7c3f73..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/datasets.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# Datasets
-
-For Open-Sora 1.1, we conduct mixed training with both images and videos. The main datasets we use are listed below.
-Please refer to [README](/README.md#data-processing) for data processing.
-
-## Panda-70M
-[Panda-70M](https://github.com/snap-research/Panda-70M) is a large-scale dataset with 70M video-caption pairs.
-We use the [training-10M subset](https://github.com/snap-research/Panda-70M/tree/main/dataset_dataloading) for training,
-which contains ~10M videos of better quality.
-
-## Pexels
-[Pexels](https://www.pexels.com/) is a popular online platform that provides high-quality stock photos, videos, and music for free.
-Most videos from this website are of high quality. Thus, we use them for both pre-training and HQ fine-tuning.
-We really appreciate the great platform and the contributors!
-
-## Inter4K
-[Inter4K](https://github.com/alexandrosstergiou/Inter4K) is a dataset containing 1K video clips with 4K resolution.
-The dataset is proposed for super-resolution tasks. We use the dataset for HQ fine-tuning.
-
-
-## HD-VG-130M
-[HD-VG-130M](https://github.com/daooshee/HD-VG-130M?tab=readme-ov-file) comprises 130M text-video pairs.
-The caption is generated by BLIP-2.
-We find the scene and text quality to be relatively poor. For OpenSora 1.0, we only use ~350K samples from this dataset.
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/report_01.md b/PyTorch/built-in/mm/OpenSora1.1/docs/report_01.md
deleted file mode 100644
index 07388bcfafdd33986cd976294b8b9ba747d3e0e6..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/report_01.md
+++ /dev/null
@@ -1,49 +0,0 @@
-# Open-Sora 1.0 Report
-
-OpenAI's Sora is amazing at generating one-minute high-quality videos. However, it reveals almost no details about its method. To make AI more "open", we are dedicated to building an open-source version of Sora. This report describes our first attempt to train a transformer-based video diffusion model.
-
-## Efficiency in choosing the architecture
-
-To lower the computational cost, we want to utilize existing VAE models. Sora uses spatial-temporal VAE to reduce the temporal dimensions. However, we found that there is no open-source high-quality spatial-temporal VAE model. [MAGVIT](https://github.com/google-research/magvit)'s 4x4x4 VAE is not open-sourced, while [VideoGPT](https://wilson1yan.github.io/videogpt/index.html)'s 2x4x4 VAE has a low quality in our experiments. Thus, we decided to use a 2D VAE (from [Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original)) in our first version.
-
-Video training involves a large number of tokens. Considering a 24fps 1-minute video, we have 1440 frames. With 4x VAE downsampling and 2x patch size downsampling, we have 1440x1024≈1.5M tokens. Full attention over 1.5M tokens leads to a huge computational cost. Thus, we use spatial-temporal attention to reduce the cost, following [Latte](https://github.com/Vchitect/Latte).
-
-As shown in the figure, we insert a temporal attention right after each spatial attention in STDiT (ST stands for spatial-temporal). This is similar to variant 3 in Latte's paper. However, we do not control for a similar number of parameters among these variants. While Latte's paper claims their variant is better than variant 3, our experiments on 16x256x256 videos show that, with the same number of iterations, the performance ranks as: DiT (full) > STDiT (Sequential) > STDiT (Parallel) ≈ Latte. Thus, we choose STDiT (Sequential) for efficiency. A speed benchmark is provided [here](/docs/acceleration.md#efficient-stdit).
-
-
-
-To focus on video generation, we hope to train the model based on a powerful image generation model. [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) is an efficiently trained, high-quality image generation model with a T5-conditioned DiT structure. We initialize our model with PixArt-α and initialize the projection layer of the inserted temporal attention with zeros. This initialization preserves the model's image generation ability at the beginning, which Latte's architecture cannot. The inserted attention increases the number of parameters from 580M to 724M.
-
-
-
-Drawing from the success of PixArt-α and Stable Video Diffusion, we also adopt a progressive training strategy: 16x256x256 on a 366K-clip pretraining dataset, and then 16x256x256, 16x512x512, and 64x512x512 on a 20K-clip dataset. With scaled position embeddings, this strategy greatly reduces the computational cost.
-
-We also tried a 3D patch embedder in DiT. However, with 2x downsampling on the temporal dimension, the generated videos have low quality. Thus, we leave the downsampling to a temporal VAE in our next version. For now, we sample every 3 frames for 16-frame training and every 2 frames for 64-frame training.
-
-## Data is the key to high quality
-
-We find that the amount and quality of data have a great impact on the quality of generated videos, even larger than the model architecture and training strategy. At this time, we only prepared the first split (366K video clips) from [HD-VG-130M](https://github.com/daooshee/HD-VG-130M). The quality of these videos varies greatly, and the captions are not that accurate. Thus, we further collected 20k relatively high-quality videos from [Pexels](https://www.pexels.com/), which provides videos under a free license. We label the videos with LLaVA, an image captioning model, using three frames and a designed prompt. With the designed prompt, LLaVA can generate good-quality captions.
-
-
-
-As we lay more emphasis on the quality of data, we prepare to collect more data and build a video preprocessing pipeline in our next version.
-
-## Training Details
-
-With a limited training budget, we made only a few explorations. We find a learning rate of 1e-4 is too large and scale it down to 2e-5. When training with a large batch size, we find `fp16` less stable than `bf16`, sometimes leading to generation failure. Thus, we switch to `bf16` for training on 64x512x512. For other hyper-parameters, we follow previous works.
-
-## Loss curves
-
-16x256x256 Pretraining Loss Curve
-
-
-
-16x256x256 HQ Training Loss Curve
-
-
-
-16x512x512 HQ Training Loss Curve
-
-
-
-> Core Contributor: Zangwei Zheng*, Xiangyu Peng*, Shenggui Li, Hongxing Liu, Yang You
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/report_02.md b/PyTorch/built-in/mm/OpenSora1.1/docs/report_02.md
deleted file mode 100644
index ec54853d08dcb5852e15a2edeafb55e6f81c3381..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/report_02.md
+++ /dev/null
@@ -1,117 +0,0 @@
-# Open-Sora 1.1 Report
-
-- [Model Architecture Modification](#model-architecture-modification)
-- [Support for Multi-time/resolution/aspect ratio/fps Training](#support-for-multi-timeresolutionaspect-ratiofps-training)
-- [Masked DiT as Image/Video-to-Video Model](#masked-dit-as-imagevideo-to-video-model)
-- [Data Collection \& Pipeline](#data-collection--pipeline)
-- [Training Details](#training-details)
-- [Limitation and Future Work](#limitation-and-future-work)
-
-In the Open-Sora 1.1 release, we train a 700M-parameter model on 10M samples (Open-Sora 1.0 was trained on 400K samples) with a better STDiT architecture. We implement the following features mentioned in [Sora's report](https://openai.com/research/video-generation-models-as-world-simulators):
-
-- Variable durations, resolutions, aspect ratios (Sampling flexibility, Improved framing and composition)
-- Prompting with images and videos (Animating images, Extending generated videos, Video-to-video editing, Connecting videos)
-- Image generation capabilities
-
-To achieve this goal, we use multi-task learning in the pretraining stage. For diffusion models, training with different sampled timesteps is already a form of multi-task learning. We further extend this idea to multiple resolutions, aspect ratios, frame lengths, fps values, and different mask strategies for image- and video-conditioned generation. We train the model on videos of **0s~15s, 144p to 720p, and various aspect ratios**. Although the temporal consistency is not that high due to limited training FLOPs, we can still see the potential of the model.
-
-## Model Architecture Modification
-
-We made the following modifications to the original ST-DiT for better training stability and performance (ST-DiT-2):
-
-- **[Rope embedding](https://arxiv.org/abs/2104.09864) for temporal attention**: Following LLM best practices, we change the sinusoidal positional encoding to RoPE embedding for temporal attention, since it is also a sequence prediction task.
-- **AdaIN and Layernorm for temporal attention**: we wrap the temporal attention with AdaIN and layernorm, as is done for the spatial attention, to stabilize training.
-- **[QK-normalization](https://arxiv.org/abs/2302.05442) with [RMSNorm](https://arxiv.org/abs/1910.07467)**: Following [SD3](https://arxiv.org/pdf/2403.03206.pdf), we apply QK-normalization to all attention layers for better training stability in half precision.
-- **Dynamic input size support and video information conditioning**: To support multi-resolution, aspect-ratio, and fps training, we make ST-DiT-2 accept any input size and automatically scale positional embeddings. Extending [PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha)'s idea, we condition on the video's height, width, aspect ratio, frame length, and fps.
-- **Extending T5 tokens from 120 to 200**: our captions are usually shorter than 200 tokens, and we find the model can handle the longer text well.
-
-## Support for Multi-time/resolution/aspect ratio/fps Training
-
-As mentioned in [Sora's report](https://openai.com/research/video-generation-models-as-world-simulators), training with the original video's resolution, aspect ratio, and length increases sampling flexibility and improves framing and composition. We found three ways to achieve this goal:
-
-- [NaViT](https://arxiv.org/abs/2307.06304): support dynamic size within the same batch by masking, with little efficiency loss. However, the system is a bit complex to implement, and may not benefit from optimized kernels such as flash attention.
-- Padding ([FiT](https://arxiv.org/abs/2402.12376), [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)): support dynamic size within the same batch by padding. However, padding different resolutions to the same size is not efficient.
-- Bucket ([SDXL](https://arxiv.org/abs/2307.01952), [PixArt](https://arxiv.org/abs/2310.00426)): support dynamic sizes in different batches by bucketing, but the size must be the same within a batch, and only a fixed number of sizes can be used. With the same size in a batch, we do not need to implement complex masking or padding.
-
-For simplicity of implementation, we choose the bucket method. We pre-define some fixed resolutions and allocate samples to different buckets. The concerns about bucketing are listed below, but we find they are not a big issue in our case.
-
-
-View the concerns
-
-- The bucket size is limited to a fixed number: First, in real-world applications, only a few aspect ratios (9:16, 3:4) and resolutions (240p, 1080p) are commonly used. Second, we find trained models can generalize well to unseen resolutions.
-- The size in each batch is the same, which breaks the i.i.d. assumption: Since we are using multiple GPUs, the local batches on different GPUs have different sizes. We did not see a significant performance drop due to this issue.
-- There may not be enough samples to fill each bucket, and the distribution may be biased: First, our dataset is large enough to fill each bucket when the local batch size is not too large. Second, we should analyze the distribution of the data's sizes and define the buckets accordingly. Third, an unbalanced distribution did not affect the training process significantly.
-- Different resolutions and frame lengths may have different processing speeds: Different from PixArt, which only deals with aspect ratios at similar resolutions (similar token numbers), we need to consider the processing speed of different resolutions and frame lengths. We can use the `bucket_config` to define the batch size for each bucket so that the processing speed is similar.
-
-
-
-
-
-As shown in the figure, a bucket is a triplet of `(resolution, num_frame, aspect_ratio)`. We provide pre-defined aspect ratios for different resolutions that cover most common video aspect ratios. Before each epoch, we shuffle the dataset and allocate samples to buckets as shown in the figure. We put a sample into the bucket with the largest resolution and frame length that do not exceed those of the video.
-
-Since our computational resources are limited, we further introduce two attributes, `keep_prob` and `batch_size`, for each `(resolution, num_frame)` pair to reduce the computational cost and enable multi-stage training. Specifically, a high-resolution video is downsampled to a lower resolution with probability `1-keep_prob`, and each bucket is processed with its own `batch_size`. In this way, we can control the number of samples in different buckets and balance the GPU load by searching for a good batch size for each bucket.
-
-A detailed explanation of the bucket usage in training is available in [docs/config.md](/docs/config.md#training-bucket-configs).
-
-## Masked DiT as Image/Video-to-Video Model
-
-Transformers can be easily extended to support image-to-image and video-to-video tasks. We propose a mask strategy to support image and video conditioning. The mask strategy is shown in the figure below.
-
-
-
-Typically, we unmask the frames to be conditioned on for image/video-to-video generation. During the ST-DiT forward pass, unmasked frames are assigned timestep 0, while the others keep the sampled timestep t. We find that directly applying this strategy to a trained model yields poor results, as the diffusion model never learned to handle different timesteps within one sample during training.
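-
-A hedged sketch of the per-frame timestep assignment described above (tensor shapes and names are illustrative, not the project's actual implementation):
-
-```python
-import torch
-
-def per_frame_timesteps(t: torch.Tensor, cond_mask: torch.Tensor) -> torch.Tensor:
-    """Conditioned (unmasked) frames get timestep 0; the rest keep the sampled timestep.
-
-    t:         (B,)   sampled diffusion timestep per sample
-    cond_mask: (B, T) 1 for conditioned (unmasked) frames, 0 for frames to denoise
-    """
-    t_frames = t[:, None].expand(-1, cond_mask.shape[1]).clone()  # (B, T)
-    t_frames[cond_mask.bool()] = 0
-    return t_frames
-
-t = torch.tensor([700, 300])
-mask = torch.tensor([[1, 1, 0, 0], [0, 0, 0, 1]])
-print(per_frame_timesteps(t, mask))
-# tensor([[  0,   0, 700, 700],
-#         [300, 300, 300,   0]])
-```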
-
-Inspired by [UL2](https://arxiv.org/abs/2205.05131), we introduce a random masking strategy during training. Specifically, we randomly unmask frames during training: the first frame, the first k frames, the last frame, the last k frames, the first and last k frames, random frames, etc. Based on Open-Sora 1.0, with a 50% probability of applying masking, the model learns to handle image conditioning within 10k steps (while 30% yields weaker ability), with a small drop in text-to-video performance. Thus, for Open-Sora 1.1, we pretrain the model from scratch with the masking strategy.
-
-An illustration of the mask strategy config used in inference is given below. A five-number tuple provides great flexibility in defining the mask strategy. By conditioning on generated frames, we can autoregressively generate an arbitrary number of frames (although errors propagate).
-
-
-
-A detailed explanation of the mask strategy usage is available in [docs/config.md](/docs/config.md#advanced-inference-config).
-
-## Data Collection & Pipeline
-
-As we found in Open-Sora 1.0 that data quantity and quality are crucial for training a good model, we worked hard on scaling the dataset. First, we created an automatic pipeline following [SVD](https://arxiv.org/abs/2311.15127), including scene cutting, captioning, various scoring and filtering, and dataset management scripts and conventions. More information can be found in [docs/data_processing.md](/docs/data_processing.md).
-
-
-
-We planned to use [panda-70M](https://snap-research.github.io/Panda-70M/) and other data, approximately 30M+ samples in total, to train the model. However, we found disk IO to be a bottleneck when training and processing data at the same time. Thus, we could only prepare a 10M dataset, which did not go through the full processing pipeline that we built. Finally, we use a dataset of 9.7M videos + 2.6M images for pre-training, and 560k videos + 1.6M images for fine-tuning. The pretraining dataset statistics are shown below. More information about the dataset can be found in [docs/datasets.md](/docs/datasets.md).
-
-Image text tokens (by T5 tokenizer):
-
-
-
-Video text tokens (by T5 tokenizer). We directly use Panda's short captions for training and caption the other datasets ourselves. The generated captions are usually shorter than 200 tokens.
-
-
-
-Video duration:
-
-
-
-## Training Details
-
-With limited computational resources, we have to carefully monitor the training process and change the training strategy whenever we suspect the model is not learning well, since there is no compute budget for ablation studies. Thus, Open-Sora 1.1's training includes multiple changes, and as a result, EMA is not applied.
-
-1. First, we fine-tune for **6k** steps with images of different resolutions, starting from the `Pixart-alpha-1024` checkpoint. We find the model easily adapts to generating images at different resolutions. We use [SpeeDiT](https://github.com/1zeryu/SpeeDiT) (iddpm-speed) to accelerate the diffusion training.
-2. **[Stage 1]** Then, we pretrain the model with gradient checkpointing for **24k** steps, which takes **4 days** on 64 H800 GPUs. Although the number of samples seen by the model is the same, we find the model learns more slowly than with a smaller batch size. We speculate that at an early stage, the number of steps is more important for training. Most videos are in **240p** resolution, and the config is similar to [stage2.py](/configs/opensora-v1-1/train/stage2.py). The videos look good, but the model has not learned much temporal knowledge. We use a mask ratio of 10%.
-3. **[Stage 1]** To increase the number of steps, we switch to a smaller batch size without gradient checkpointing. We also add fps conditioning at this point. We trained **40k** steps over **2 days**. Most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). We use a lower resolution because we found in Open-Sora 1.0 that the model can learn temporal knowledge at relatively low resolution.
-4. **[Stage 1]** We find the model does not learn well on long videos and produces noisy generations, which we speculate to be the half-precision problem found in Open-Sora 1.0 training. Thus, we adopt QK-normalization to stabilize training. Similar to SD3, we find the model quickly adapts to QK-normalization. We also switch from iddpm-speed to iddpm, and increase the mask ratio to 25% as we find the image-conditioning is not learning well. We trained **17k** steps in **14 hours**. Most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). Stage 1 training lasts approximately one week, with **81k** steps in total.
-5. **[Stage 2]** We switch to a higher resolution, where most videos are in **240p and 480p** resolution ([stage2.py](/configs/opensora-v1-1/train/stage2.py)). We trained **22k** steps over **one day** on all pre-training data.
-6. **[Stage 3]** We switch to an even higher resolution, where most videos are in **480p and 720p** resolution ([stage3.py](/configs/opensora-v1-1/train/stage3.py)). We trained **4k** steps over **one day** on high-quality data. We find that loading the previous stage's optimizer state helps the model learn faster.
-
-To summarize, the training of Open-Sora 1.1 requires approximately **9 days** on 64 H800 GPUs.
-
-## Limitation and Future Work
-
-As we get one step closer to replicating Sora, we find many limitations in the current model, and these limitations point to future work.
-
-- **Generation failure**: we find many cases (especially when the total token number is large or the content is complex) where our model fails to generate the scene. There may be a collapse in the temporal attention, and we have identified a potential bug in our code. We are working hard to fix it. Besides, we will increase our model size and training data to improve generation quality in the next version.
-- **Noisy and non-fluent generation**: we find the generated videos are sometimes noisy and not temporally fluent, especially for long videos. We think the problem is due to not using a temporal VAE. As [Pixart-Sigma](https://arxiv.org/abs/2403.04692) finds that adapting to a new VAE is simple, we plan to develop a temporal VAE for the model in the next version.
-- **Lack of temporal consistency**: we find the model cannot generate videos with high temporal consistency. We think the problem is due to insufficient training FLOPs. We plan to collect more data and continue training the model to improve temporal consistency.
-- **Bad human generation**: we find the model cannot generate high-quality human videos. We think the problem is due to the lack of human data. We plan to collect more human data and continue training the model to improve human generation.
-- **Low aesthetic score**: we find the model's aesthetic score is not high. The problem is due to the lack of aesthetic-score filtering, which was not conducted because of the IO bottleneck. We plan to filter the data by aesthetic score and fine-tune the model to improve the aesthetic score.
-- **Worse quality for longer video generation**: we find that with the same prompt, longer videos have worse quality. This means the image quality does not transfer equally to different sequence lengths.
-
-> - **Algorithm & Acceleration**: Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou
-> - **Data Collection & Pipeline**: Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/report_v1.md b/PyTorch/built-in/mm/OpenSora1.1/docs/report_v1.md
deleted file mode 100644
index edb3644711d2e6af2b6945246d0404c8a5d367dd..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/report_v1.md
+++ /dev/null
@@ -1,47 +0,0 @@
-# Open-Sora v1 Report
-
-OpenAI's Sora is amazing at generating one-minute high-quality videos. However, it reveals almost no details about its method. To make AI more "open", we are dedicated to building an open-source version of Sora. This report describes our first attempt to train a transformer-based video diffusion model.
-
-## Efficiency in choosing the architecture
-
-To lower the computational cost, we want to utilize existing VAE models. Sora uses spatial-temporal VAE to reduce the temporal dimensions. However, we found that there is no open-source high-quality spatial-temporal VAE model. [MAGVIT](https://github.com/google-research/magvit)'s 4x4x4 VAE is not open-sourced, while [VideoGPT](https://wilson1yan.github.io/videogpt/index.html)'s 2x4x4 VAE has a low quality in our experiments. Thus, we decided to use a 2D VAE (from [Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original)) in our first version.
-
-Video training involves a large number of tokens. Considering a 24fps 1-minute video, we have 1440 frames. With 4x VAE downsampling and 2x patch size downsampling, we have 1440x1024≈1.5M tokens. Full attention over 1.5M tokens leads to a huge computational cost. Thus, we use spatial-temporal attention to reduce the cost, following [Latte](https://github.com/Vchitect/Latte).
-
-As shown in the figure, we insert a temporal attention right after each spatial attention in STDiT (ST stands for spatial-temporal). This is similar to variant 3 in Latte's paper. However, we do not control for a similar number of parameters among these variants. While Latte's paper claims their variant is better than variant 3, our experiments on 16x256x256 videos show that, with the same number of iterations, the performance ranks as: DiT (full) > STDiT (Sequential) > STDiT (Parallel) ≈ Latte. Thus, we choose STDiT (Sequential) for efficiency. A speed benchmark is provided [here](/docs/acceleration.md#efficient-stdit).
-
-
-
-To focus on video generation, we hope to train the model based on a powerful image generation model. [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) is an efficiently trained, high-quality image generation model with a T5-conditioned DiT structure. We initialize our model with PixArt-α and initialize the projection layer of the inserted temporal attention with zeros. This initialization preserves the model's image generation ability at the beginning, which Latte's architecture cannot. The inserted attention increases the number of parameters from 580M to 724M.
-
-
-
-Drawing from the success of PixArt-α and Stable Video Diffusion, we also adopt a progressive training strategy: 16x256x256 on a 366K-clip pretraining dataset, and then 16x256x256, 16x512x512, and 64x512x512 on a 20K-clip dataset. With scaled position embeddings, this strategy greatly reduces the computational cost.
-
-We also tried a 3D patch embedder in DiT. However, with 2x downsampling on the temporal dimension, the generated videos have low quality. Thus, we leave the downsampling to a temporal VAE in our next version. For now, we sample every 3 frames for 16-frame training and every 2 frames for 64-frame training.
-
-## Data is the key to high quality
-
-We find that the amount and quality of data have a great impact on the quality of generated videos, even larger than the model architecture and training strategy. At this time, we only prepared the first split (366K video clips) from [HD-VG-130M](https://github.com/daooshee/HD-VG-130M). The quality of these videos varies greatly, and the captions are not that accurate. Thus, we further collected 20k relatively high-quality videos from [Pexels](https://www.pexels.com/), which provides videos under a free license. We label the videos with LLaVA, an image captioning model, using three frames and a designed prompt. With the designed prompt, LLaVA can generate good-quality captions.
-
-
-
-As we place more emphasis on data quality, we plan to collect more data and build a video preprocessing pipeline in our next version.
-
-## Training Details
-
-With a limited training budget, we made only a few explorations. We find that a learning rate of 1e-4 is too large and scale it down to 2e-5. When training with a large batch size, we find `fp16` less stable than `bf16`; it may lead to generation failure. Thus, we switch to `bf16` for training on 64x512x512. For other hyper-parameters, we follow previous works.
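-
-In the project's Python-config style, the hyper-parameters mentioned above look roughly like this (illustrative only; see the training config demos for the full set of fields):
-
-```python
-# Settings discussed in this report.
-lr = 2e-5          # scaled down from 1e-4, which proved too large
-dtype = "bf16"     # more stable than fp16 at large batch size (used for 64x512x512)
-grad_clip = 1.0    # other hyper-parameters follow previous works
-```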
-
-## Loss curves
-
-16x256x256 Pretraining Loss Curve
-
-
-
-16x256x256 HQ Training Loss Curve
-
-
-
-16x512x512 HQ Training Loss Curve
-
-
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/structure.md b/PyTorch/built-in/mm/OpenSora1.1/docs/structure.md
deleted file mode 100644
index 418d0d09b5e9b465d8ac9d633b6d59d8e80f231e..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/structure.md
+++ /dev/null
@@ -1,110 +0,0 @@
-# Repo Structure
-
-```plaintext
-Open-Sora
-├── README.md
-├── assets
-│ ├── images -> images used for image-conditioned generation
-│ ├── texts -> prompts used for text-conditioned generation
-│ └── readme -> images used in README
-├── configs -> Configs for training & inference
-├── docs
-│ ├── acceleration.md -> Report on acceleration & speed benchmark
-│ ├── command.md -> Commands for training & inference
-│ ├── datasets.md -> Datasets used in this project
-│ ├── data_pipeline.md -> Data pipeline documents
-│ ├── structure.md -> This file
-│ ├── config.md -> Configs meaning
-│ ├── report_01.md -> Report for Open-Sora 1.0
-│ ├── report_02.md -> Report for Open-Sora 1.1
-│ └── zh_CN -> Chinese version of the above
-├── eval -> Evaluation scripts
-│ ├── README.md -> Evaluation documentation
-│ ├── sample.sh -> script for quickly launching inference on predefined prompts
-│ ├── launch.sh -> script for launching sampling on 8 GPUs
-│ ├── vbench -> for VBench evaluation
-│ └── vbench_i2v -> for VBench i2v evaluation
-├── gradio -> Gradio demo related code
-├── notebooks -> Jupyter notebooks for generating commands to run
-├── scripts
-│ ├── train.py -> diffusion training script
-│ ├── inference.py -> diffusion inference script
-│ ├── inference-long.py -> inference script supporting more advanced features
-│ └── misc -> misc scripts, including batch size search
-├── opensora
-│ ├── __init__.py
-│ ├── registry.py -> Registry helper
-│ ├── acceleration -> Acceleration related code
-│ ├── dataset -> Dataset related code
-│ ├── models
-│ │ ├── layers -> Common layers
-│ │ ├── vae -> VAE as image encoder
-│ │ ├── text_encoder -> Text encoder
-│ │ │ ├── classes.py -> Class id encoder (inference only)
-│ │ │ ├── clip.py -> CLIP encoder
-│ │ │ └── t5.py -> T5 encoder
-│ │ ├── dit
-│ │ ├── latte
-│ │ ├── pixart
-│ │ └── stdit -> Our STDiT related code
-│ ├── schedulers -> Diffusion schedulers
-│ │ ├── iddpm -> IDDPM for training and inference
-│ │ └── dpms -> DPM-Solver for fast inference
-│ └── utils
-├── tests -> Tests for the project
-└── tools -> Tools for data processing and more
-```
-
-## Configs
-
-Our config files follow [MMEngine](https://github.com/open-mmlab/mmengine). MMEngine reads a config file (a `.py` file) and parses it into a dictionary-like object.
-
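-For example, a config can be loaded and inspected programmatically (a small sketch assuming `mmengine` is installed; the path below is one of the training configs listed under `configs/opensora/train`):
-
-```python
-from mmengine.config import Config
-
-# Parse the .py config file into a dictionary-like Config object.
-cfg = Config.fromfile("configs/opensora/train/16x256x256.py")
-print(cfg.model)  # nested model settings, e.g. type and pretrained weights
-print(cfg)        # the full parsed config
-```
-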
-```plaintext
-Open-Sora
-└── configs -> Configs for training & inference
- ├── opensora-v1-1 -> STDiT2 related configs
- │ ├── inference
- │ │ ├── sample.py -> Sample videos and images
- │ │ └── sample-ref.py -> Sample videos with image/video condition
- │ └── train
- │ ├── stage1.py -> Stage 1 training config
- │ ├── stage2.py -> Stage 2 training config
- │ ├── stage3.py -> Stage 3 training config
- │ ├── image.py -> Illustration of image training config
- │ ├── video.py -> Illustration of video training config
- │ └── benchmark.py -> For batch size searching
- ├── opensora -> STDiT related configs
- │ ├── inference
- │ │ ├── 16x256x256.py -> Sample videos 16 frames 256x256
- │ │ ├── 16x512x512.py -> Sample videos 16 frames 512x512
- │ │ └── 64x512x512.py -> Sample videos 64 frames 512x512
- │ └── train
- │ ├── 16x256x256.py -> Train on videos 16 frames 256x256
- │ ├── 16x512x512.py -> Train on videos 16 frames 512x512
- │ └── 64x512x512.py -> Train on videos 64 frames 512x512
- ├── dit -> DiT related configs
- │ ├── inference
- │ │ ├── 1x256x256-class.py -> Sample images with ckpts from DiT
- │ │ ├── 1x256x256.py -> Sample images with clip condition
- │ │ └── 16x256x256.py -> Sample videos
- │ └── train
- │ ├── 1x256x256.py -> Train on images with clip condition
- │ └── 16x256x256.py -> Train on videos
- ├── latte -> Latte related configs
- └── pixart -> PixArt related configs
-```
-
-## Tools
-
-```plaintext
-Open-Sora
-└── tools
- ├── datasets -> dataset management related code
- ├── scene_cut -> scene cut related code
- ├── caption -> caption related code
- ├── scoring -> scoring related code
- │ ├── aesthetic -> aesthetic scoring related code
- │ ├── matching -> matching scoring related code
- │ ├── ocr -> ocr scoring related code
- │ └── optical_flow -> optical flow scoring related code
- └── frame_interpolation -> frame interpolation related code
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/README.md b/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/README.md
deleted file mode 100644
index 21f8c6d79976c3181c9e621370ffffa2bcb61296..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/README.md
+++ /dev/null
@@ -1,211 +0,0 @@
-
-
-
-
-
-
-## Open-Sora: 完全开源的高效复现类Sora视频生成方案
-**Open-Sora**项目是一项致力于**高效**制作高质量视频,并使所有人都能使用其模型、工具和内容的计划。
-通过采用**开源**原则,Open-Sora 不仅实现了先进视频生成技术的低成本普及,还提供了一个精简且用户友好的方案,简化了视频制作的复杂性。
-通过 Open-Sora,我们希望更多开发者一起探索内容创作领域的创新、创造和包容。
-
-[[English Document]](/README.md)
-
- Open-Sora 项目目前处在早期阶段,并将持续更新。
-
-## 📰 资讯
-
-* **[2024.03.18]** 🔥 我们发布了**Open-Sora 1.0**,这是一个完全开源的视频生成项目。
-* Open-Sora 1.0 支持视频数据预处理、加速训练、推理等全套流程。
-* 我们提供的[模型权重](/#model-weights)只需 3 天的训练就能生成 2 秒的 512x512 视频。
-* **[2024.03.04]** Open-Sora:开源Sora复现方案,成本降低46%,序列扩充至近百万。[[英文博客]](https://hpc-ai.com/blog/open-sora)
-
-## 🎥 最新视频
-
-| **2s 512×512** | **2s 512×512** | **2s 512×512** |
-| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [](https://github.com/hpcaitech/Open-Sora/assets/99191637/de1963d3-b43b-4e68-a670-bb821ebb6f80) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/13f8338f-3d42-4b71-8142-d234fbd746cc) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/fa6a65a6-e32a-4d64-9a9e-eabb0ebb8c16) |
-| A serene night scene in a forested area. [...] The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop. | A soaring drone footage captures the majestic beauty of a coastal cliff, [...] The water gently laps at the rock base and the greenery that clings to the top of the cliff. | The majestic beauty of a waterfall cascading down a cliff into a serene lake. [...] The camera angle provides a bird's eye view of the waterfall. |
-| [](https://github.com/hpcaitech/Open-Sora/assets/99191637/64232f84-1b36-4750-a6c0-3e610fa9aa94) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/983a1965-a374-41a7-a76b-c07941a6c1e9) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/ec10c879-9767-4c31-865f-2e8d6cf11e65) |
-| A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. [...] | The vibrant beauty of a sunflower field. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. [...] | A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell [...] |
-
-视频经过降采样处理为`.gif`格式,以便显示。点击查看原始视频。为便于显示,文字经过修剪,全文请参见 [此处](/assets/texts/t2v_samples.txt)。在我们的[图片库](https://hpcaitech.github.io/Open-Sora/)中查看更多样本。
-
-## 🔆 新功能
-
-* 📍Open-Sora-v1 已发布。[这里](/#model-weights)提供了模型权重。只需 400K 视频片段和 200 个 H800 GPU 天(对比 Stable Video Diffusion 的 152M 样本),我们就能生成 2 秒的 512×512 视频。
-* ✅ 从图像扩散模型到视频扩散模型的三阶段训练。我们提供每个阶段的权重。
-* ✅ 支持训练加速,包括Transformer加速、更快的 T5 和 VAE 以及序列并行。在对 64x512x512 视频进行训练时,Open-Sora 可将训练速度提高**55%**。详细信息请参见[训练加速](/acceleration.md)。
-* ✅ 我们提供用于数据预处理的视频切割和字幕工具。有关说明请点击[此处](tools/data/README.md),我们的数据收集计划请点击 [数据集](docs/datasets.md)。
-* ✅ 我们发现来自[VideoGPT](https://wilson1yan.github.io/videogpt/index.html)的 VQ-VAE 质量较低,因此采用了来自[Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original) 的高质量 VAE。我们还发现使用添加了时间维度的采样会导致生成质量降低。更多讨论,请参阅我们的 **[报告](docs/report_v1.md)**。
-* ✅ 我们研究了不同的架构,包括 DiT、Latte 和我们提出的 **STDiT**。我们的STDiT在质量和速度之间实现了更好的权衡。更多讨论,请参阅我们的 **[报告](docs/report_v1.md)**。
-* ✅ 支持剪辑和 T5 文本调节。
-* ✅ 通过将图像视为单帧视频,我们的项目支持在图像和视频(如 ImageNet 和 UCF101)上训练 DiT。更多说明请参见 [指令解析](command.md)。
-* ✅ 利用[DiT](https://github.com/facebookresearch/DiT)、[Latte](https://github.com/Vchitect/Latte) 和 [PixArt](https://pixart-alpha.github.io/) 的官方权重支持推理。
-
-
-查看更多
-
-* ✅ 重构代码库。请参阅[结构](structure.md),了解项目结构以及如何使用配置文件。
-
-
-
-### 下一步计划【按优先级排序】
-
-* [ ] 完成数据处理流程(包括密集光流、美学评分、文本图像相似性、重复数据删除等)。更多信息请参见[数据集](/docs/datasets.md)。**[项目进行中]**
-* [ ] 训练视频-VAE。 **[项目进行中]**
-
-
-查看更多
-
-* [ ] 支持图像和视频调节。
-* [ ] 评估流程。
-* [ ] 加入更好的调度程序,如 SD3 中的rectified flow程序。
-* [ ] 支持可变长宽比、分辨率和持续时间。
-* [ ] 发布后支持 SD3。
-
-
-
-## 目录
-
-* [安装](#installation)
-* [模型权重](/#model-weights)
-* [推理](/#inference)
-* [数据处理](/#data-processing)
-* [训练](/#training)
-* [贡献](/#contribution)
-* [声明](/#acknowledgement)
-* [引用](/#citation)
-
-## 安装
-
-```bash
-# create a virtual env
-conda create -n opensora python=3.10
-
-# install torch
-# the command below is for CUDA 12.1, choose install commands from
-# https://pytorch.org/get-started/locally/ based on your own CUDA version
-pip3 install torch torchvision
-
-# install flash attention (optional)
-pip install packaging ninja
-pip install flash-attn --no-build-isolation
-
-# install apex (optional)
-pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
-
-# install xformers
-pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121
-
-# install this project
-git clone https://github.com/hpcaitech/Open-Sora
-cd Open-Sora
-pip install -v .
-```
-
-安装完成后,建议阅读[结构](structure.md),了解项目结构以及如何使用配置文件。
-
-## 模型权重
-
-| 分辨率 | 数据 | 迭代次数 | 批量大小 | GPU 天数 (H800) | 网址 |
-| ---------- | ------ | ----------- | ---------- | --------------- | ---------- |
-| 16×256×256 | 366K | 80k | 8×64 | 117 | [:link:]() |
-| 16×256×256 | 20K HQ | 24k | 8×64 | 45 | [:link:]() |
-| 16×512×512 | 20K HQ | 20k | 2×64 | 35 | [:link:]() |
-
-我们模型的权重部分由[PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) 初始化。参数数量为 724M。有关训练的更多信息,请参阅我们的 **[报告](/docs/report_v1.md)**。有关数据集的更多信息,请参阅[数据](datasets.md)。HQ 表示高质量。
-:warning: **局限性**:我们的模型是在有限的预算内训练出来的。质量和文本对齐度相对较差。特别是在生成人类时,模型表现很差,无法遵循详细的指令。我们正在努力改进质量和文本对齐。
-
-## 推理
-
-要使用我们提供的权重进行推理,首先要将[T5](https://huggingface.co/DeepFloyd/t5-v1_1-xxl/tree/main)权重下载到pretrained_models/t5_ckpts/t5-v1_1-xxl 中。然后下载模型权重。运行以下命令生成样本。请参阅[此处](docs/structure.md#inference-config-demos)自定义配置。
-
-```bash
-# Sample 16x512x512 (20s/sample, 100 time steps, 24 GB memory)
-torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x512x512.py --ckpt-path OpenSora-v1-HQ-16x512x512.pth --prompt-path ./assets/texts/t2v_samples.txt
-
-# Sample 16x256x256 (5s/sample, 100 time steps, 22 GB memory)
-torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path OpenSora-v1-HQ-16x256x256.pth --prompt-path ./assets/texts/t2v_samples.txt
-
-# Sample 64x512x512 (40s/sample, 100 time steps)
-torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt
-
-# Sample 64x512x512 with sequence parallelism (30s/sample, 100 time steps)
-# sequence parallelism is enabled automatically when nproc_per_node is larger than 1
-torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth --prompt-path ./assets/texts/t2v_samples.txt
-
-```
-
-我们在 H800 GPU 上进行了速度测试。如需使用其他模型进行推理,请参阅[此处](commands_zh.md)获取更多说明。减小`vae.micro_batch_size`来降低显存使用(但取样速度会略微减慢)。
-
-## 数据处理
-
-高质量数据是高质量模型的关键。[这里](datasets.md)有我们使用过的数据集和数据收集计划。我们提供处理视频数据的工具。目前,我们的数据处理流程包括以下步骤:
-
-1. 下载数据集。[[文件](/tools/datasets/README.md)]
-2. 将视频分割成片段。 [[文件](/tools/scenedetect/README.md)]
-3. 生成视频字幕。 [[文件](/tools/caption/README.md)]
-
-## 训练
-
-要启动训练,首先要将[T5](https://huggingface.co/DeepFloyd/t5-v1_1-xxl/tree/main)权重下载到pretrained_models/t5_ckpts/t5-v1_1-xxl 中。然后运行以下命令在单个节点上启动训练。
-
-```bash
-# 1 GPU, 16x256x256
-torchrun --nnodes=1 --nproc_per_node=1 scripts/train.py configs/opensora/train/16x256x512.py --data-path YOUR_CSV_PATH
-# 8 GPUs, 64x512x512
-torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
-```
-
-要在多个节点上启动训练,请根据[ColossalAI](https://colossalai.org/docs/basics/launch_colossalai/#launch-with-colossal-ai-cli) 准备一个主机文件,并运行以下命令。
-
-```bash
-colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
-```
-
-有关其他模型的训练和高级使用方法,请参阅[此处](commands_zh.md)获取更多说明。
-
-## 贡献
-
-如果您希望为该项目做出贡献,可以参考 [贡献指南](/CONTRIBUTING.md).
-
-## 声明
-
-* [ColossalAI](https://github.com/hpcaitech/ColossalAI): A powerful large model parallel acceleration and optimization
-* [DiT](https://github.com/facebookresearch/DiT): Scalable Diffusion Models with Transformers.
-* [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT): An acceleration for DiT training. We adopt valuable acceleration strategies for training progress from OpenDiT.
-* [PixArt](https://github.com/PixArt-alpha/PixArt-alpha): An open-source DiT-based text-to-image model.
-* [Latte](https://github.com/Vchitect/Latte): An attempt to efficiently train DiT for video.
-* [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model.
-* [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model.
-* [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder.
-* [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Yi-34B](https://huggingface.co/01-ai/Yi-34B).
-
-我们对他们的出色工作和对开源的慷慨贡献表示感谢。
-
-## 引用
-
-```bibtex
-@software{opensora,
- author = {Zangwei Zheng and Xiangyu Peng and Yang You},
- title = {Open-Sora: Democratizing Efficient Video Production for All},
- month = {March},
- year = {2024},
- url = {https://github.com/hpcaitech/Open-Sora}
-}
-```
-
-[Zangwei Zheng](https://github.com/zhengzangw) and [Xiangyu Peng](https://github.com/xyupeng) equally contributed to this work during their internship at [HPC-AI Tech](https://hpc-ai.com/).
-
-## Star 走势
-
-[](https://star-history.com/#hpcaitech/Open-Sora&Date)
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/acceleration.md b/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/acceleration.md
deleted file mode 100644
index 22220728ea341096932ed06b6acaae352d8cca9f..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/acceleration.md
+++ /dev/null
@@ -1,65 +0,0 @@
-# 加速
-
-Open-Sora 旨在为扩散模型提供一个高速训练框架。在 64 帧 512x512 视频上训练时,我们可以实现 **55%** 的训练速度加速。我们的框架支持训练
-**1分钟1080p视频**。
-
-## 加速的 Transformer
-
-Open-Sora 通过以下方式提高训练速度:
-
-- 内核优化,包括 [flash attention](https://github.com/Dao-AILab/flash-attention), 融合 layernorm 内核以及由 colossalAI
- 编译的内核。
-- 混合并行性,包括 ZeRO。
-- 用于更大批量的梯度检查点。
-
-我们在图像上的训练速度可与 [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT) 相媲美,这是一个加速 DiT
-训练的项目。训练速度是在批处理大小为 128、图像大小为 256x256 的 8 个 H800 GPU 上测量的。
-
-| 模型 | 吞吐量 (img/s/GPU) | 吞吐量 (tokens/s/GPU) |
-|----------|-----------------|--------------------|
-| DiT | 100 | 26k |
-| OpenDiT | 175 | 45k |
-| OpenSora | 175 | 45k |
-
-## 高效的 STDiT
-
-我们的 STDiT 采用时空注意力对视频数据进行建模。与直接在 DiT 上使用全注意力相比,我们的 STDiT 随着帧数的增加而更有效率。我们当前的框架仅支持对超长序列使用序列并行。
-
-训练速度是在 8 个 H800 GPU 上测量的,应用了加速技术,GC 表示梯度检查点。
-两者都具有像 PixArt 一样的 T5 调节。
-
-| 模型 | 设置 | 吞吐量 (sample/s/GPU) | 吞吐量 (tokens/s/GPU) |
-|------------------|----------------|--------------------|--------------------|
-| DiT | 16x256 (4k) | 7.20 | 29k |
-| STDiT | 16x256 (4k) | 7.00 | 28k |
-| DiT | 16x512 (16k) | 0.85 | 14k |
-| STDiT | 16x512 (16k) | 1.45 | 23k |
-| DiT (GC) | 64x512 (65k) | 0.08 | 5k |
-| STDiT (GC) | 64x512 (65k) | 0.40 | 25k |
-| STDiT (GC, sp=2) | 360x512 (370k) | 0.10 | 18k |
-
-使用 Video-VAE 在时间维度上进行 4 倍下采样时,24fps 视频有 450 帧。STDiT(28k tokens/s)与图像上的 DiT(高达 45k tokens/s)之间的速度差距,主要来自 T5 与 VAE 编码以及时间注意力。
-
-## 加速的编码器 (T5, VAE)
-
-在训练过程中,文本由 T5 编码,视频由 VAE 编码。通常有两种方法可以加速训练:
-
-1. 提前预处理文本和视频数据并保存到磁盘。
-2. 在训练过程中对文本和视频数据进行编码,并加快编码过程。
-
-对于选项 1,一个样本的 120 个文本 token 约需要 1MB 磁盘空间,而 64x64x64 的潜变量约需要 4MB。对于包含 10M 视频剪辑的训练数据集,所需的总磁盘空间约为 50TB。我们的存储系统目前还无法支撑这种数据规模。
-
-对于选项 2,我们提升了 T5 的速度并改善其显存占用。参考 [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT),我们发现 VAE 消耗了大量 GPU 显存,因此在进行 VAE 编码时将批大小拆分成更小的微批。借助这两种技术,我们可以大大加快训练速度。
-
-训练速度是在 8 个带有 STDiT 的 H800 GPU 上测量的。
-
-| 加速模式 | 设置 | 吞吐量 (img/s/GPU) | 吞吐量 (tokens/s/GPU) |
-|--------------|---------------|-----------------|--------------------|
-| Baseline | 16x256 (4k) | 6.16 | 25k |
-| w. faster T5 | 16x256 (4k) | 7.00 | 29k |
-| Baseline | 64x512 (65k) | 0.94 | 15k |
-| w. both | 64x512 (65k) | 1.45 | 23k |
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/commands.md b/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/commands.md
deleted file mode 100644
index 6564293c4b7cb0528f665eabfe7ac4ed0611ec36..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/commands.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# 命令
-
-## 推理
-
-您可以修改相应的配置文件来更改推理设置。在 [此处](/docs/structure.md#inference-config-demos) 查看更多详细信息。
-
-### 在 ImageNet 上使用 DiT 预训练进行推理
-
-以下命令会自动在 ImageNet 上下载预训练权重并运行推理。
-
-```bash
-python scripts/inference.py configs/dit/inference/1x256x256-class.py --ckpt-path DiT-XL-2-256x256.pt
-```
-
-### 在 UCF101 上使用 Latte 预训练进行推理
-
-以下命令会自动下载 UCF101 上的预训练权重并运行推理。
-
-```bash
-python scripts/inference.py configs/latte/inference/16x256x256-class.py --ckpt-path Latte-XL-2-256x256-ucf101.pt
-```
-
-### 使用 PixArt-α 预训练权重进行推理
-
-将 T5 下载到 `./pretrained_models` 并运行以下命令。
-
-```bash
-# 256x256
-torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x256x256.py --ckpt-path PixArt-XL-2-256x256.pth
-
-# 512x512
-torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x512x512.py --ckpt-path PixArt-XL-2-512x512.pth
-
-# 1024 multi-scale
-torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x1024MS.py --ckpt-path PixArt-XL-2-1024MS.pth
-```
-
-### 使用训练期间保存的 checkpoints 进行推理
-
-在训练期间,会在 `outputs` 目录中创建一个实验日志记录文件夹。在每个 checkpoint 文件夹下(例如 `epoch12-global_step2000`),有一个 `ema.pt` 文件和共享的 `model` 文件夹。执行以下命令进行推理。
-
-```bash
-# 使用 ema 模型进行推理
-torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000/ema.pt
-
-# 使用模型进行推理
-torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000
-
-# 使用序列并行进行推理
-# 当 nproc_per_node 大于 1 时,将自动启用序列并行
-torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000
-```
-
-第二个命令将在 checkpoint 文件夹中自动生成一个 `model_ckpt.pt` 文件。
-
-### 推理超参数
-
-1. DPM 求解器擅长对图像进行快速推理。但是,它的视频推理的效果并不令人满意。若出于快速演示目的您可以使用这个求解器。
-
-```python
-type="dmp-solver"
-num_sampling_steps=20
-```
-
-2. 您可以在视频推理上使用 [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) 微调的 VAE 解码器(消耗更多内存)。但是,我们没有看到视频推理效果有明显改善。要使用它,请将 [预训练权重](https://huggingface.co/maxin-cn/Latte/tree/main/t2v_required_models/vae_temporal_decoder) 下载到 `./pretrained_models/vae_temporal_decoder` 中,并修改配置文件,如下所示。
-
-```python
-vae = dict(
- type="VideoAutoencoderKLTemporalDecoder",
- from_pretrained="pretrained_models/vae_temporal_decoder",
-)
-```
-
-## 训练
-
-如果您要继续训练,请运行以下命令。`--load` 与 `--ckpt-path` 的不同之处在于,前者还会加载优化器和数据加载器的状态。
-
-```bash
-torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --load YOUR_PRETRAINED_CKPT
-```
-
-如果要启用 wandb 日志,请在命令中添加 `--wandb` 参数。
-
-```bash
-WANDB_API_KEY=YOUR_WANDB_API_KEY torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --wandb True
-```
-
-您可以修改相应的配置文件来更改训练设置。在 [此处](/docs/structure.md#training-config-demos) 查看更多详细信息。
-
-### 训练超参数
-
-1. `dtype` 是用于训练的数据类型。仅支持 `fp16` 和 `bf16`。ColossalAI 自动启用 `fp16` 和 `bf16` 的混合精度训练。在训练过程中,我们发现 `bf16` 更稳定。
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/datasets.md b/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/datasets.md
deleted file mode 100644
index 90c3fb41bcdb87ce4c52f03ad9c894fa5a3c1195..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/datasets.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# 数据集
-
-## 正在使用的数据集
-
-### HD-VG-130M
-
-[HD-VG-130M](https://github.com/daooshee/HD-VG-130M?tab=readme-ov-file) 包括 130M 个文本视频对,字幕由 BLIP-2 生成。我们发现其场景切分和文本质量相对较差。它包含 20 个拆分;对于 OpenSora 1.0,我们仅使用第一个拆分。我们计划使用整个数据集并对其进行重新处理。
-
-### Inter4k
-
-[Inter4k](https://github.com/alexandrosstergiou/Inter4K) 是一个包含 1K 个 4K 分辨率视频剪辑的数据集。该数据集最初被提出用于超分辨率任务。我们使用它进行高质量(HQ)训练。处理过的视频可在[这里](README.md#数据处理)找到。
-
-### Pexels.com
-
-[Pexels.com](https://www.pexels.com/) 是一个提供免费库存照片和视频的网站。我们从该网站收集了 19K 个视频剪辑,用于高质量训练。处理过的视频可在[这里](README.md#数据处理)找到。
-
-## 数据集监视列表
-
-我们也在关注以下数据集,并考虑在未来使用它们,这取决于我们的存储空间以及数据集的质量。
-
-| 名称 | 大小 | 描述 |
-|-------------------|--------------|-------------------------------|
-| Panda-70M | 70M videos | High quality video-text pairs |
-| WebVid-10M | 10M videos | Low quality |
-| InternVid-10M-FLT | 10M videos | |
-| EGO4D | 3670 hours | |
-| OpenDV-YouTube | 1700 hours | |
-| VidProM | 6.69M videos | |
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/report_v1.md b/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/report_v1.md
deleted file mode 100644
index bf12131a458c262540ce79597bb825739305ffd2..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/report_v1.md
+++ /dev/null
@@ -1,49 +0,0 @@
-# Open-Sora v1 技术报告
-
-OpenAI的Sora在生成一分钟高质量视频方面非常出色。然而,它几乎没有透露任何关于其细节的信息。为了使人工智能更加“开放”,我们致力于构建一个开源版本的Sora。这份报告描述了我们第一次尝试训练一个基于Transformer的视频扩散模型。
-
-## 选择高效的架构
-
-为了降低计算成本,我们希望利用现有的VAE模型。Sora使用时空VAE来减少时间维度。然而,我们发现没有开源的高质量时空VAE模型。[MAGVIT](https://github.com/google-research/magvit)的4x4x4 VAE并未开源,而[VideoGPT](https://wilson1yan.github.io/videogpt/index.html)的2x4x4 VAE在我们的实验中质量较低。因此,我们决定在我们第一个版本中使用2D VAE(来自[Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original))。
-
-视频训练涉及大量的token。考虑到24fps的1分钟视频,我们有1440帧。通过VAE下采样4倍和patch大小下采样2倍,我们得到了1440x1024≈150万个token。在150万个token上进行全注意力计算将带来巨大的计算成本。因此,我们使用时空注意力来降低成本,这是遵循[Latte](https://github.com/Vchitect/Latte)的方法。
-
-如图中所示,在STDiT(ST代表时空)中,我们在每个空间注意力之后立即插入一个时间注意力。这类似于Latte论文中的变种3。然而,我们并没有控制这些变体的相似数量的参数。虽然Latte的论文声称他们的变体比变种3更好,但我们在16x256x256视频上的实验表明,相同数量的迭代次数下,性能排名为:DiT(完整)> STDiT(顺序)> STDiT(并行)≈ Latte。因此,我们出于效率考虑选择了STDiT(顺序)。[这里](/docs/acceleration.md#efficient-stdit)提供了速度基准测试。
-
-
-
-
-为了专注于视频生成,我们希望基于一个强大的图像生成模型来训练我们的模型。PixArt-α是一个经过高效训练的高质量图像生成模型,具有T5条件化的DiT结构。我们使用[PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha)初始化我们的模型,并将插入的时间注意力的投影层初始化为零。这种初始化在开始时保留了模型的图像生成能力,而Latte的架构则不能。插入的注意力将参数数量从5.8亿增加到7.24亿。
-
-
-
-借鉴PixArt-α和Stable Video Diffusion的成功,我们还采用了渐进式训练策略:在366K预训练数据集上进行16x256x256的训练,然后在20K数据集上进行16x256x256、16x512x512和64x512x512的训练。通过扩展位置嵌入,这一策略极大地降低了计算成本。
-
-我们还尝试在DiT中使用3D patch嵌入器。然而,在时间维度上2倍下采样后,生成的视频质量较低。因此,我们将在下一版本中将下采样留给时间VAE。目前,我们在每3帧采样一次进行16帧训练,以及在每2帧采样一次进行64帧训练。
-
-
-## 数据是训练高质量模型的核心
-
-我们发现数据的数量和质量对生成视频的质量有很大的影响,甚至比模型架构和训练策略的影响还要大。目前,我们只从[HD-VG-130M](https://github.com/daooshee/HD-VG-130M)准备了第一批分割(366K个视频片段)。这些视频的质量参差不齐,而且字幕也不够准确。因此,我们进一步从提供免费许可视频的[Pexels](https://www.pexels.com/)收集了20k相对高质量的视频。我们使用LLaVA,一个图像字幕模型,通过三个帧和一个设计好的提示来标记视频。有了设计好的提示,LLaVA能够生成高质量的字幕。
-
-
-
-由于我们更加注重数据质量,我们准备收集更多数据,并在下一版本中构建一个视频预处理流程。
-
-## 训练细节
-
-在有限的训练预算下,我们只进行了一些探索。我们发现学习率1e-4过大,因此将其降低到2e-5。在进行大批量训练时,我们发现`fp16`比`bf16`不太稳定,可能会导致生成失败。因此,我们在64x512x512的训练中切换到`bf16`。对于其他超参数,我们遵循了之前的研究工作。
-
-## 损失曲线
-
-16x256x256 预训练损失曲线
-
-
-
-16x256x256 高质量训练损失曲线
-
-
-
-16x512x512 高质量训练损失曲线
-
-
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/report_v2.md b/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/report_v2.md
deleted file mode 100644
index 0b59c0d41a9842c6b3b76d768f02eeed7f1a4549..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/report_v2.md
+++ /dev/null
@@ -1,114 +0,0 @@
-# Open-Sora 1.1 技术报告
-
-- [模型架构修改](#模型架构修改)
-- [支持不同视频长度/分辨率/宽高比/帧率(fps)训练](#支持不同视频长度分辨率宽高比帧率fps训练)
-- [使用Masked DiT作为图生视频/视频生视频模型](#使用masked-dit作为图生视频视频生视频模型)
-- [数据收集和流程](#数据收集和流程)
-- [训练详情](#训练详情)
-- [结果和评价](#结果和评价)
-- [不足和下一步计划](#不足和下一步计划)
-
-在Open-Sora1.1版本中,我们使用了10M数据来训练经过结构调优后的STDiT的700M模型(Open-Sora1.0版本仅用400K数据)。我们实现了[Sora报告](https://openai.com/research/video-generation-models-as-world-simulators)中提到的以下功能:
-
-- 可变的视频时长、分辨率、宽高比(包括采样灵活性、改进的取景范围和构图)
-- 提示词增加图片和视频选项(使图像动起来、生成式增长视频、视频到视频编辑、连接不同视频)
-- 图像生成功能
-
-为了实现这一目标,我们在预训练阶段使用了多任务学习。对于扩散模型来说,用不同的采样时间步长进行训练已经是一种多任务学习。我们将这一思想在图像和视频的条件生成模型上,进一步扩展到多分辨率、宽高比、帧长、fps以及不同的掩码策略。我们在**0~15s、144p到720p、各种宽高比的视频**上训练模型。虽然由于训练FLOPs不足的限制,生成的视频在时间一致性上的表现没有那么高,但我们仍然可以看到这个模型的巨大潜力。
-
-## 模型架构修改
-
-我们对原始ST-DiT模型进行了以下修改,以获得更好的训练稳定性和模型性能(ST-DiT-2):
-
-- **在时间注意力模块中添加[旋转位置编码](https://arxiv.org/abs/2104.09864)**:遵循目前LLM的最佳实践,我们将时间注意力模块中的正弦位置编码更改为旋转位置编码,因为它也算一项序列预测任务。
-- **在时间注意力模块中添加 AdaIN 和 LayerNorm**:我们像空间注意力一样,用 AdaIN 和 LayerNorm 包裹时间注意力,以稳定训练。
-- **[QK归一化](https://arxiv.org/abs/2302.05442)与[RMSNorm](https://arxiv.org/abs/1910.07467)**:和[SD3](https://arxiv.org/pdf/2403.03206.pdf)类似地,我们应用QK归一化来提高半精度训练的稳定性。
-- **支持动态输入大小和视频条件限定**:为了支持多分辨率、宽高比和fps训练,我们修改 ST-DiT-2 使其接受任意输入大小。延伸 [PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha) 的想法,我们支持以视频的高度、宽度、宽高比、帧长和fps作为条件。
-- **将 T5 token 数量从 120 扩展到 200**:我们使用的视频描述通常少于 200 个 token,我们发现模型也能很好地处理更长的文本。
-
-## 支持不同视频长度/分辨率/宽高比/帧率(fps)训练
-
-正如[Sora报告](https://openai.com/research/video-generation-models-as-world-simulators)中提到的,使用原始无损视频的分辨率、宽高比和视频长度进行训练可以增加采样灵活性,改善取景和构图。我们找到了三种实现这一目标的方法:
-- [NaViT](https://arxiv.org/abs/2307.06304):通过不同掩码策略支持在同一训练批次内使用不同大小的数据,并且训练效率下降很少。然而,该方案实现起来有点复杂,并且可能无法兼容 kernel 优化技术(如 flash attention)。
-- 填充([FiT](https://arxiv.org/abs/2402.12376),[Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)):通过填充支持同一批次内的不同大小的数据。然而,将不同的分辨率填充到相同的大小会导致效率降低。
-- 分桶训练([SDXL](https://arxiv.org/abs/2307.01952)、[PixArt](https://arxiv.org/abs/2310.00426)):支持通过分桶的方式在不同批次中动态调整大小,但在同一批次内数据大小必须相同,只能应用固定数量的数据大小。在一个批次中,我们不需要实现复杂的掩码或填充。
-
-为了更便捷的实现,我们选择分桶训练的方式。我们预先定义了一些固定的分辨率,并将不同的样本分配到不同的桶中。下面列出了分桶方案中值得注意的点。但我们可以看到,这些在我们的实验中并不是一个大问题。
-
-
-查看注意事项
-
-- 桶大小被限制为固定数量:首先,在实际应用中,通常只使用少数宽高比(9:16、3:4)和分辨率(240p、1080p)。其次,我们发现经过训练的模型可以很好地推广到未见过的分辨率。
-- 每批的大小相同,打破了独立同分布(i.i.d.)假设:由于我们使用多个 GPU,因此不同 GPU 上的本地批次具有不同的大小。我们没有发现此问题导致性能显着下降。
-- 可能没有足够的样本来填充每个桶,并且分布可能有偏差:首先,当本地批量大小不太大时,我们的数据集足够大以填充每个桶。其次,我们应该分析数据大小的分布并相应地定义桶大小。第三,分配不平衡并没有显着影响训练过程。
-- 不同的分辨率和帧长可能有不同的处理速度:与PixArt只处理相似分辨率(相似token数)的宽高比不同,我们需要考虑不同分辨率和帧长的处理速度。我们可以使用“bucket_config”来定义每个桶的批量大小,以确保处理速度相似。
-
-
-
-
-
-如图所示,桶是(分辨率,帧数量,宽高比)的三元组。我们为不同的分辨率提供预定义的宽高比,涵盖了大多数常见的视频宽高比。在每个epoch之前,我们打乱数据集并将样本分配到不同的桶中。我们会将每个样本放入分辨率和帧长度不超过该视频本身的最大的桶中。
-
-考虑到我们的计算资源有限,我们进一步为每个(分辨率,num_frame)二元组引入keep_prob和batch_size两个属性,以降低计算成本并实现多阶段训练。具体来说,高清视频将以概率1-keep_prob下采样到较低分辨率的桶中,并且每个桶的样本数量是由batch_size属性决定的。这样,我们可以控制不同桶中的样本数量,并通过为每个桶搜索合适的数据量来平衡GPU负载。
-
-有关训练中桶使用的详细说明,请参阅[配置文件](/docs/config.md#training-bucket-configs).
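-
-下面给出一个示意性的 `bucket_config` 片段,用于说明上述结构(数值与字段格式仅为示例,具体请以配置文件文档为准):
-
-```python
-# 桶配置示意:分辨率 -> { 帧数: (keep_prob, batch_size) }
-# keep_prob 决定视频以多大概率保留在该分辨率桶中(否则降到更低分辨率的桶),
-# batch_size 决定该桶的本地批量大小,用于平衡不同桶之间的处理速度。
-bucket_config = {
-    "144p": {16: (1.0, 48), 64: (1.0, 24)},
-    "240p": {16: (0.5, 24), 64: (0.5, 12)},
-    "480p": {16: (0.25, 8)},
-}
-```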
-
-## 使用Masked DiT作为图生视频/视频生视频模型
-
-Transformer可以很容易地扩展到支持图生图和视频生视频的任务。我们提出了一种蒙版策略来支持图像和视频的调节。蒙版策略如下图所示。
-
-
-
-在将图像或视频转换成另一个视频的过程中,我们通常会选择出需要作为条件的帧并取消其掩码(unmask)。在使用ST-DiT模型进行前向传播时,被选择取消掩码(unmask)的帧将被赋予时间步长0,而其他帧则保持它们原有的时间步长t。我们发现,如果直接将这种策略应用到训练好的模型上,会得到较差的结果,因为扩散模型在训练过程中并未学会如何处理一个样本中具有不同时间步长的帧。
-
-受[UL2](https://arxiv.org/abs/2205.05131)的启发,我们在训练期间引入了随机掩码策略。具体来说,我们在训练期间随机取消掩码(unmask)一些帧,包括第一帧、前k帧、最后一帧、最后k帧、第一和最后k帧、随机帧等。基于Open-Sora 1.0模型,以50%的概率应用掩码策略时,我们发现模型能够在10,000步的训练中学会处理图像条件(而30%的概率会导致处理能力变差),同时文本到视频的性能略有下降。因此,在Open-Sora 1.1版本中,我们从头开始预训练模型,并采用了掩码策略。
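-
-上述将条件帧时间步设为 0 的做法可以用如下最小示例说明(仅为示意,并非仓库中的实际实现;函数与变量名均为假设):
-
-```python
-import torch
-
-def build_per_frame_timesteps(t: torch.Tensor, cond_mask: torch.Tensor) -> torch.Tensor:
-    """为条件帧(unmask)赋时间步 0,其余帧保持当前扩散时间步 t。"""
-    # t: (B,) 当前扩散时间步;cond_mask: (B, T),True 表示该帧作为条件帧
-    timesteps = t[:, None].expand(-1, cond_mask.shape[1]).clone()
-    timesteps[cond_mask] = 0
-    return timesteps
-
-t = torch.tensor([500, 500])
-cond_mask = torch.tensor([[True, False, False, False],
-                          [True, True, False, False]])
-print(build_per_frame_timesteps(t, cond_mask))
-# tensor([[  0, 500, 500, 500],
-#         [  0,   0, 500, 500]])
-```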
-
-下图给出了用于推理的掩码策略配置的说明。五数字元组在定义掩码策略方面提供了极大的灵活性。
-
-
-
-掩码策略用法的详细说明可在[配置文件](/docs/config.md#advanced-inference-config)中查看.
-
-
-## 数据收集和流程
-
-正如我们在Sora1.0版本中看见的那样,数据数量和质量对于训练一个好的模型至关重要,因此,我们努力扩展数据集。首先,我们创建了一个遵循[SVD](https://arxiv.org/abs/2311.15127)的自动流水线,包括场景切割、字幕、各种评分和过滤以及数据集管理脚本和通用惯例。
-
-
-
-我们计划使用[panda-70M](https://snap-research.github.io/Panda-70M/)和其他数据来训练模型,大约包含3000万条数据。然而,我们发现磁盘输入输出(disk IO)在同时进行训练和数据处理时成为了一个瓶颈。因此,我们只能准备一个包含1000万条数据的数据集,并且没有完成我们构建的所有处理流程。最终,我们使用了包含970万视频和260万图像的数据集进行预训练,以及560,000视频和160万图像的数据集进行微调。预训练数据集的统计信息如下所示。
-
-图像文本标记 (使用T5分词器):
-
-
-视频文本标记 (使用T5分词器)。我们直接使用Panda的短视频描述进行训练,并自己给其他数据集加视频描述。生成的字幕通常少于200个token。
-
-
-视频时长:
-
-
-## 训练详情
-
-由于计算资源有限,我们无法进行消融实验,只能仔细监控训练过程,并在推测模型学习不佳时调整训练策略。因此,Open-Sora 1.1 的训练过程包含多次调整,且未使用指数移动平均(EMA)。
-
-1. 首先,我们从`Pixart-alpha-1024`的模型checkpoint开始,使用不同分辨率的图像进行了6000步的微调。我们发现模型能够很容易地适应并生成不同分辨率的图像。为了加快扩散过程的训练,我们使用了[SpeeDiT](https://github.com/1zeryu/SpeeDiT)(iddpm-speed)技术。
-2. **[阶段一]** 然后,我们使用梯度检查点(gradient-checkpointing)技术对模型进行了**24,000**步的预训练,这个过程在64个H800 GPU上运行了**4天**。尽管模型看到的数据样本数量相同,我们发现与使用较小批量大小相比,模型的学习速度较慢。我们推测,在训练的早期阶段,步数的数量对于训练更为重要。大多数视频的分辨率是**240p**,预训练时使用的配置与[stage2.py](/configs/opensora-v1-1/train/stage2.py)相似。
-3. **[阶段一]** 为了增加训练步数,我们改用了更小的批量大小,并且没有使用梯度检查点技术。在这个阶段,我们还引入了帧率(fps)条件。模型训练了**40,000**步,持续了**2天**。训练中使用的视频大多数是**144p**分辨率,使用的配置文件是[stage1.py](/configs/opensora-v1-1/train/stage1.py)。我们使用较低的分辨率,因为我们在Open-Sora 1.0版本中发现模型可以以相对较低的分辨率学习时间知识。
-4. **[阶段一]** 我们发现模型不能很好地学习长视频,并在Open-Sora1.0训练中发现了一个噪声生成结果,推测是半精度问题。因此,我们采用QK-归一化来稳定训练。我们还将iddpm-speed切换成iddpm。我们训练了**17k**步**14小时**。大多数视频的分辨率是144p,预训练时使用的配置是[stage1.py](/configs/opensora-v1-1/train/stage1.py)。阶段1训练持续约一周,总步长**81k**。
-5. **[阶段二]** 我们切换到更高的分辨率,其中大多数视频是**240p和480p**分辨率([stage2.py](/configs/opensora-v1-1/train/stage2.py))。我们在所有预训练数据上训练了**22000**步,持续**一天**。
-6. **[阶段三]** 我们切换到更高的分辨率,大多数视频的分辨率是**480p和720p**([stage3.py](/configs/opensora-v1-1/train/stage3.py))。我们在高质量数据上训了**4000**步,用时**一天**。
-
-## 结果和评价
-
-## 不足和下一步计划
-
-随着我们离Sora的复现又近了一步,我们发现当前模型存在许多不足,这些不足将在我们下阶段工作中得到改善。
-
-- **生成结果带噪**:我们发现生成的视频(尤其是长视频)有时噪点较多、不够流畅。我们认为问题在于没有使用时间VAE。由于[Pixart-Sigma](https://arxiv.org/abs/2403.04692)发现适应新VAE很容易,我们计划在下一个版本中为模型开发时间VAE。
-- **缺乏时间一致性**:我们发现模型无法生成具有高时间一致性的视频,我们认为问题是由于缺乏训练FLOPs,我们计划收集更多数据并继续训练模型以提高时间一致性。
-- **人像生成质量低**:我们发现模型无法生成高质量的人类视频,我们认为问题是由于缺乏人类数据,我们计划收集更多的人类数据,并继续训练模型以提高人类生成。
-- **美学得分低**:我们发现模型的美学得分不高。问题在于缺少美学得分过滤:由于 IO 瓶颈,我们没有进行这一步骤。我们计划按美学得分过滤数据并微调模型,以提高美学得分。
-- **长视频生成质量低**:我们发现,使用同样的提示词,视频越长,质量越差。这意味着在图像上获得的质量并不能同等地迁移到不同长度的视频序列上。
-
-> - **算法与加速实现**:Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou
-> - **数据收集与处理**:Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu
diff --git a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/structure.md b/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/structure.md
deleted file mode 100644
index 6e25d84c2b8a696d9cd77baf4883a1655b3a362d..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/docs/zh_CN/structure.md
+++ /dev/null
@@ -1,179 +0,0 @@
-# 代码仓库和配置文件结构
-
-## 代码仓库结构
-
-```plaintext
-Open-Sora
-├── README.md
-├── docs
-│ ├── acceleration.md -> Acceleration & Speed benchmark
-│ ├── command.md -> Commands for training & inference
-│ ├── datasets.md -> Datasets used in this project
-│ ├── structure.md -> This file
-│ └── report_v1.md -> Report for Open-Sora v1
-├── scripts
-│ ├── train.py -> diffusion training script
-│ └── inference.py -> diffusion inference script
-├── configs -> Configs for training & inference
-├── opensora
-│ ├── __init__.py
-│ ├── registry.py -> Registry helper
-│ ├── acceleration -> Acceleration related code
-│ ├── dataset -> Dataset related code
-│ ├── models
-│ │ ├── layers -> Common layers
-│ │ ├── vae -> VAE as image encoder
-│ │ ├── text_encoder -> Text encoder
-│ │ │ ├── classes.py -> Class id encoder (inference only)
-│ │ │ ├── clip.py -> CLIP encoder
-│ │ │ └── t5.py -> T5 encoder
-│ │ ├── dit
-│ │ ├── latte
-│ │ ├── pixart
-│ │ └── stdit -> Our STDiT related code
-│ ├── schedulers -> Diffusion schedulers
-│ │ ├── iddpm -> IDDPM for training and inference
-│ │ └── dpms -> DPM-Solver for fast inference
-│ └── utils
-└── tools -> Tools for data processing and more
-```
-
-## 配置文件结构
-
-
-我们的配置文件遵循 [MMEngine](https://github.com/open-mmlab/mmengine)。MMEngine 会读取配置文件(`.py` 文件)并将其解析为类似字典的对象。
-
-```plaintext
-Open-Sora
-└── configs -> Configs for training & inference
- ├── opensora -> STDiT related configs
- │ ├── inference
- │ │ ├── 16x256x256.py -> Sample videos 16 frames 256x256
- │ │ ├── 16x512x512.py -> Sample videos 16 frames 512x512
- │ │ └── 64x512x512.py -> Sample videos 64 frames 512x512
- │ └── train
- │ ├── 16x256x256.py -> Train on videos 16 frames 256x256
- │ ├── 16x512x512.py -> Train on videos 16 frames 512x512
- │ └── 64x512x512.py -> Train on videos 64 frames 512x512
- ├── dit -> DiT related configs
- │ ├── inference
- │ │ ├── 1x256x256-class.py -> Sample images with ckpts from DiT
- │ │ ├── 1x256x256.py -> Sample images with clip condition
- │ │ └── 16x256x256.py -> Sample videos
- │ └── train
- │ ├── 1x256x256.py -> Train on images with clip condition
- │ └── 16x256x256.py -> Train on videos
- ├── latte -> Latte related configs
- └── pixart -> PixArt related configs
-```
-
-## 推理配置演示
-
-要更改推理设置,可以直接修改相应的配置文件。或者您可以传递参数来覆盖配置文件([config_utils.py](/opensora/utils/config_utils.py))。要更改采样提示,您应该修改传递给“--prompt_path”参数的“.txt”文件。
-
-```plaintext
---prompt_path ./assets/texts/t2v_samples.txt -> prompt_path
---ckpt-path ./path/to/your/ckpt.pth -> model["from_pretrained"]
-```
-
-下面提供了每个字段的解释。
-
-```python
-# Define sampling size
-num_frames = 64 # number of frames
-fps = 24 // 2 # frames per second (divided by 2 for frame_interval=2)
-image_size = (512, 512) # image size (height, width)
-
-# Define model
-model = dict(
- type="STDiT-XL/2", # Select model type (STDiT-XL/2, DiT-XL/2, etc.)
- space_scale=1.0, # (Optional) Space positional encoding scale (new height / old height)
- time_scale=2 / 3, # (Optional) Time positional encoding scale (new frame_interval / old frame_interval)
- enable_flashattn=True, # (Optional) Speed up training and inference with flash attention
- enable_layernorm_kernel=True, # (Optional) Speed up training and inference with fused kernel
- from_pretrained="PRETRAINED_MODEL", # (Optional) Load from pretrained model
- no_temporal_pos_emb=True, # (Optional) Disable temporal positional encoding (for image)
-)
-vae = dict(
- type="VideoAutoencoderKL", # Select VAE type
- from_pretrained="stabilityai/sd-vae-ft-ema", # Load from pretrained VAE
- micro_batch_size=128, # VAE with micro batch size to save memory
-)
-text_encoder = dict(
- type="t5", # Select text encoder type (t5, clip)
- from_pretrained="DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder
- model_max_length=120, # Maximum length of input text
-)
-scheduler = dict(
- type="iddpm", # Select scheduler type (iddpm, dpm-solver)
- num_sampling_steps=100, # Number of sampling steps
- cfg_scale=7.0, # hyper-parameter for classifier-free diffusion
-)
-dtype = "fp16" # Computation type (fp16, fp32, bf16)
-
-# Other settings
-batch_size = 1 # batch size
-seed = 42 # random seed
-prompt_path = "./assets/texts/t2v_samples.txt" # path to prompt file
-save_dir = "./samples" # path to save samples
-```
-
-## 训练配置演示
-
-```python
-# Define sampling size
-num_frames = 64
-frame_interval = 2 # sample every 2 frames
-image_size = (512, 512)
-
-# Define dataset
-root = None # root path to the dataset
-data_path = "CSV_PATH" # path to the csv file
-use_image_transform = False # True if training on images
-num_workers = 4 # number of workers for dataloader
-
-# Define acceleration
-dtype = "bf16" # Computation type (fp16, bf16)
-grad_checkpoint = True # Use gradient checkpointing
-plugin = "zero2" # Plugin for distributed training (zero2, zero2-seq)
-sp_size = 1 # Sequence parallelism size (1 for no sequence parallelism)
-
-# Define model
-model = dict(
- type="STDiT-XL/2",
- space_scale=1.0,
- time_scale=2 / 3,
- from_pretrained="YOUR_PRETRAINED_MODEL",
- enable_flashattn=True, # Enable flash attention
- enable_layernorm_kernel=True, # Enable layernorm kernel
-)
-vae = dict(
- type="VideoAutoencoderKL",
- from_pretrained="stabilityai/sd-vae-ft-ema",
- micro_batch_size=128,
-)
-text_encoder = dict(
- type="t5",
- from_pretrained="DeepFloyd/t5-v1_1-xxl",
- model_max_length=120,
- shardformer=True, # Enable shardformer for T5 acceleration
-)
-scheduler = dict(
- type="iddpm",
- timestep_respacing="", # Default 1000 timesteps
-)
-
-# Others
-seed = 42
-outputs = "outputs" # path to save checkpoints
-wandb = False # Use wandb for logging
-
-epochs = 1000 # number of epochs (just large enough, kill when satisfied)
-log_every = 10
-ckpt_every = 250
-load = None # path to resume training
-
-batch_size = 4
-lr = 2e-5
-grad_clip = 1.0 # gradient clipping
-```
diff --git a/PyTorch/built-in/mm/OpenSora1.1/eval/README.md b/PyTorch/built-in/mm/OpenSora1.1/eval/README.md
deleted file mode 100644
index 3ec8c7e0b219df6c87e36d3599987b7f27e73ed3..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/eval/README.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# Evaluation
-
-## Human evaluation
-
-To conduct human evaluation, we need to generate various samples. We provide many prompts in `assets/texts`, and define some test settings covering different resolutions, durations, and aspect ratios in `eval/sample.sh`. To make use of multiple GPUs, we split the sampling tasks into several parts.
-
-```bash
-# image (1)
-bash eval/sample.sh /path/to/ckpt -1
-# video (2a 2b 2c ...)
-bash eval/sample.sh /path/to/ckpt -2a
-# launch 8 jobs at once (you must read the script to understand the details)
-bash eval/launch.sh /path/to/ckpt
-```
-
-## VBench
-
-[VBench](https://github.com/Vchitect/VBench) is a benchmark for short text-to-video generation. We provide a script to easily generate the samples required by VBench.
-
-```bash
-# vbench tasks (4a 4b 4c ...)
-bash eval/sample.sh /path/to/ckpt -4a
-# launch 8 jobs at once (you must read the script to understand the details)
-bash eval/launch.sh /path/to/ckpt
-```
-
-After generation, install the VBench package according to their [instructions](https://github.com/Vchitect/VBench?tab=readme-ov-file#hammer-installation). Then, run the following commands to evaluate the generated samples.
-
-```bash
-bash eval/vbench/vbench.sh /path/to/video_folder
-```
-
-## VBench-i2v
-
-[VBench-i2v](https://github.com/Vchitect/VBench/tree/master/vbench2_beta_i2v) is a benchmark for short image-to-video generation (beta version).
-
-TBD
diff --git a/PyTorch/built-in/mm/OpenSora1.1/eval/launch.sh b/PyTorch/built-in/mm/OpenSora1.1/eval/launch.sh
deleted file mode 100644
index 1ceffab7c0dacd46e886cb34238003c3d8c085e5..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/eval/launch.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/bin/bash
-
-set -x
-set -e
-
-CKPT=$1
-if [[ $CKPT == *"ema"* ]]; then
- parentdir=$(dirname $CKPT)
- CKPT_BASE=$(basename $parentdir)_ema
-else
- CKPT_BASE=$(basename $CKPT)
-fi
-LOG_BASE=logs/sample/$CKPT_BASE
-echo "Logging to $LOG_BASE"
-
-# == sample & human evaluation ==
-# CUDA_VISIBLE_DEVICES=0 bash eval/sample.sh $CKPT -1 >${LOG_BASE}_1.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=1 bash eval/sample.sh $CKPT -2a >${LOG_BASE}_2a.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=2 bash eval/sample.sh $CKPT -2b >${LOG_BASE}_2b.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=3 bash eval/sample.sh $CKPT -2c >${LOG_BASE}_2c.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=4 bash eval/sample.sh $CKPT -2d >${LOG_BASE}_2d.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=5 bash eval/sample.sh $CKPT -2e >${LOG_BASE}_2e.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=6 bash eval/sample.sh $CKPT -2f >${LOG_BASE}_2f.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=7 bash eval/sample.sh $CKPT -2g >${LOG_BASE}_2g.log 2>&1 &
-
-# CUDA_VISIBLE_DEVICES=0 bash eval/sample.sh $CKPT -2h >${LOG_BASE}_2h.log 2>&1 &
-
-# == vbench ==
-# CUDA_VISIBLE_DEVICES=0 bash eval/sample.sh $CKPT -4a >${LOG_BASE}_4a.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=1 bash eval/sample.sh $CKPT -4b >${LOG_BASE}_4b.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=2 bash eval/sample.sh $CKPT -4c >${LOG_BASE}_4c.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=3 bash eval/sample.sh $CKPT -4d >${LOG_BASE}_4d.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=4 bash eval/sample.sh $CKPT -4e >${LOG_BASE}_4e.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=5 bash eval/sample.sh $CKPT -4f >${LOG_BASE}_4f.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=6 bash eval/sample.sh $CKPT -4g >${LOG_BASE}_4g.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=7 bash eval/sample.sh $CKPT -4h >${LOG_BASE}_4h.log 2>&1 &
-
-# == vbench i2v ==
-# CUDA_VISIBLE_DEVICES=0 bash eval/sample.sh $CKPT -5a >${LOG_BASE}_5a.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=1 bash eval/sample.sh $CKPT -5b >${LOG_BASE}_5b.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=2 bash eval/sample.sh $CKPT -5c >${LOG_BASE}_5c.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=3 bash eval/sample.sh $CKPT -5d >${LOG_BASE}_5d.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=4 bash eval/sample.sh $CKPT -5e >${LOG_BASE}_5e.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=5 bash eval/sample.sh $CKPT -5f >${LOG_BASE}_5f.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=6 bash eval/sample.sh $CKPT -5g >${LOG_BASE}_5g.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=7 bash eval/sample.sh $CKPT -5h >${LOG_BASE}_5h.log 2>&1 &
-
-# kill all by: pkill -f "inference"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/eval/multiple.sh b/PyTorch/built-in/mm/OpenSora1.1/eval/multiple.sh
deleted file mode 100644
index 5e1dfe0288ef58cd3ce7b5ba673d104064a62d9e..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/eval/multiple.sh
+++ /dev/null
@@ -1,281 +0,0 @@
-#!/bin/bash
-
-set -x
-set -e
-
-CKPT=$1
-PROMPT=$2
-NUM_SAMPLE=3
-NAME=$(date +%Y%m%d%H%M%S)
-CMD="python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py"
-if [[ $CKPT == *"ema"* ]]; then
- parentdir=$(dirname $CKPT)
- CKPT_BASE=$(basename $parentdir)_ema
-else
- CKPT_BASE=$(basename $CKPT)
-fi
-OUTPUT="./samples/samples_${CKPT_BASE}_${NAME}"
-start=$(date +%s)
-
-# Generate samples
-
-# == 16x240p ==
-# 1:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x240p_1_1 \
- --num-frames 16 --image-size 320 320 --num-sample $NUM_SAMPLE
-# 16:9
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x240p_16_9 \
- --num-frames 16 --image-size 240 426 --num-sample $NUM_SAMPLE
-# 9:16
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x240p_9_16 \
- --num-frames 16 --image-size 426 240 --num-sample $NUM_SAMPLE
-# 4:3
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x240p_4_3 \
- --num-frames 16 --image-size 276 368 --num-sample $NUM_SAMPLE
-# 3:4
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x240p_3_4 \
- --num-frames 16 --image-size 368 276 --num-sample $NUM_SAMPLE
-# 1:2
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x240p_1_2 \
- --num-frames 16 --image-size 226 452 --num-sample $NUM_SAMPLE
-# 2:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x240p_2_1 \
- --num-frames 16 --image-size 452 226 --num-sample $NUM_SAMPLE
-
-# == 64x240p ==
-# 1:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x240p_1_1 \
- --num-frames 64 --image-size 320 320 --num-sample $NUM_SAMPLE
-# 16:9
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x240p_16_9 \
- --num-frames 64 --image-size 240 426 --num-sample $NUM_SAMPLE
-# 9:16
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x240p_9_16 \
- --num-frames 64 --image-size 426 240 --num-sample $NUM_SAMPLE
-# 4:3
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x240p_4_3 \
- --num-frames 64 --image-size 276 368 --num-sample $NUM_SAMPLE
-# 3:4
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x240p_3_4 \
- --num-frames 64 --image-size 368 276 --num-sample $NUM_SAMPLE
-# 1:2
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x240p_1_2 \
- --num-frames 64 --image-size 226 452 --num-sample $NUM_SAMPLE
-# 2:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x240p_2_1 \
- --num-frames 64 --image-size 452 226 --num-sample $NUM_SAMPLE
-
-# == 128x240p ==
-# 1:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x240p_1_1 \
- --num-frames 128 --image-size 320 320 --num-sample $NUM_SAMPLE
-# 16:9
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x240p_16_9 \
- --num-frames 128 --image-size 240 426 --num-sample $NUM_SAMPLE
-# 9:16
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x240p_9_16 \
- --num-frames 128 --image-size 426 240 --num-sample $NUM_SAMPLE
-# 4:3
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x240p_4_3 \
- --num-frames 128 --image-size 276 368 --num-sample $NUM_SAMPLE
-# 3:4
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x240p_3_4 \
- --num-frames 128 --image-size 368 276 --num-sample $NUM_SAMPLE
-# 1:2
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x240p_1_2 \
- --num-frames 128 --image-size 226 452 --num-sample $NUM_SAMPLE
-# 2:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x240p_2_1 \
- --num-frames 128 --image-size 452 226 --num-sample $NUM_SAMPLE
-
-# == 16x360p ==
-# 1:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x360p_1_1 \
- --num-frames 16 --image-size 480 480 --num-sample $NUM_SAMPLE
-# 16:9
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x360p_16_9 \
- --num-frames 16 --image-size 360 640 --num-sample $NUM_SAMPLE
-# 9:16
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x360p_9_16 \
- --num-frames 16 --image-size 640 360 --num-sample $NUM_SAMPLE
-# 4:3
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x360p_4_3 \
- --num-frames 16 --image-size 416 554 --num-sample $NUM_SAMPLE
-# 3:4
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x360p_3_4 \
- --num-frames 16 --image-size 554 416 --num-sample $NUM_SAMPLE
-# 1:2
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x360p_1_2 \
- --num-frames 16 --image-size 360 640 --num-sample $NUM_SAMPLE
-# 2:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x360p_2_1 \
- --num-frames 16 --image-size 640 360 --num-sample $NUM_SAMPLE
-
-# == 64x360p ==
-# 1:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x360p_1_1 \
- --num-frames 64 --image-size 480 480 --num-sample $NUM_SAMPLE
-# 16:9
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x360p_16_9 \
- --num-frames 64 --image-size 360 640 --num-sample $NUM_SAMPLE
-# 9:16
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x360p_9_16 \
- --num-frames 64 --image-size 640 360 --num-sample $NUM_SAMPLE
-# 4:3
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x360p_4_3 \
- --num-frames 64 --image-size 416 554 --num-sample $NUM_SAMPLE
-# 3:4
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x360p_3_4 \
- --num-frames 64 --image-size 554 416 --num-sample $NUM_SAMPLE
-# 1:2
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x360p_1_2 \
- --num-frames 64 --image-size 360 640 --num-sample $NUM_SAMPLE
-# 2:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x360p_2_1 \
- --num-frames 64 --image-size 640 360 --num-sample $NUM_SAMPLE
-
-# == 128x360p ==
-# 1:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x360p_1_1 \
- --num-frames 128 --image-size 480 480 --num-sample $NUM_SAMPLE
-# 16:9
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x360p_16_9 \
- --num-frames 128 --image-size 360 640 --num-sample $NUM_SAMPLE
-# 9:16
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x360p_9_16 \
- --num-frames 128 --image-size 640 360 --num-sample $NUM_SAMPLE
-# 4:3
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x360p_4_3 \
- --num-frames 128 --image-size 416 554 --num-sample $NUM_SAMPLE
-# 3:4
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x360p_3_4 \
- --num-frames 128 --image-size 554 416 --num-sample $NUM_SAMPLE
-# 1:2
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x360p_1_2 \
- --num-frames 128 --image-size 360 640 --num-sample $NUM_SAMPLE
-# 2:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 128x360p_2_1 \
- --num-frames 128 --image-size 640 360 --num-sample $NUM_SAMPLE
-
-# == 16x480p ==
-# 1:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x480p_1_1 \
- --num-frames 16 --image-size 640 640 --num-sample $NUM_SAMPLE
-# 16:9
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x480p_16_9 \
- --num-frames 16 --image-size 480 854 --num-sample $NUM_SAMPLE
-# 9:16
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x480p_9_16 \
- --num-frames 16 --image-size 854 480 --num-sample $NUM_SAMPLE
-# 4:3
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x480p_4_3 \
- --num-frames 16 --image-size 554 738 --num-sample $NUM_SAMPLE
-# 3:4
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x480p_3_4 \
- --num-frames 16 --image-size 738 554 --num-sample $NUM_SAMPLE
-# 1:2
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x480p_1_2 \
- --num-frames 16 --image-size 452 904 --num-sample $NUM_SAMPLE
-# 2:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x480p_2_1 \
- --num-frames 16 --image-size 904 452 --num-sample $NUM_SAMPLE
-
-# == 32x480p ==
-# 1:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x480p_1_1 \
- --num-frames 32 --image-size 640 640 --num-sample $NUM_SAMPLE
-# 16:9
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x480p_16_9 \
- --num-frames 32 --image-size 480 854 --num-sample $NUM_SAMPLE
-# 9:16
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x480p_9_16 \
- --num-frames 32 --image-size 854 480 --num-sample $NUM_SAMPLE
-# 4:3
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x480p_4_3 \
- --num-frames 32 --image-size 554 738 --num-sample $NUM_SAMPLE
-# 3:4
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x480p_3_4 \
- --num-frames 32 --image-size 738 554 --num-sample $NUM_SAMPLE
-# 1:2
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x480p_1_2 \
- --num-frames 32 --image-size 452 904 --num-sample $NUM_SAMPLE
-# 2:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x480p_2_1 \
- --num-frames 32 --image-size 904 452 --num-sample $NUM_SAMPLE
-
-# == 64x480p ==
-# 1:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x480p_1_1 \
- --num-frames 64 --image-size 640 640 --num-sample $NUM_SAMPLE
-# 16:9
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x480p_16_9 \
- --num-frames 64 --image-size 480 854 --num-sample $NUM_SAMPLE
-# 9:16
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x480p_9_16 \
- --num-frames 64 --image-size 854 480 --num-sample $NUM_SAMPLE
-# 4:3
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x480p_4_3 \
- --num-frames 64 --image-size 554 738 --num-sample $NUM_SAMPLE
-# 3:4
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x480p_3_4 \
- --num-frames 64 --image-size 738 554 --num-sample $NUM_SAMPLE
-# 1:2
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x480p_1_2 \
- --num-frames 64 --image-size 452 904 --num-sample $NUM_SAMPLE
-# 2:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 64x480p_2_1 \
- --num-frames 64 --image-size 904 452 --num-sample $NUM_SAMPLE
-
-# == 16x720p ==
-# 1:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x720p_1_1 \
- --num-frames 16 --image-size 960 960 --num-sample $NUM_SAMPLE
-# 16:9
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x720p_16_9 \
- --num-frames 16 --image-size 720 1280 --num-sample $NUM_SAMPLE
-# 9:16
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x720p_9_16 \
- --num-frames 16 --image-size 1280 720 --num-sample $NUM_SAMPLE
-# 4:3
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x720p_4_3 \
- --num-frames 16 --image-size 832 1108 --num-sample $NUM_SAMPLE
-# 3:4
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x720p_3_4 \
- --num-frames 16 --image-size 1108 832 --num-sample $NUM_SAMPLE
-# 1:2
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x720p_1_2 \
- --num-frames 16 --image-size 1358 600 --num-sample $NUM_SAMPLE
-# 2:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 16x720p_2_1 \
-    --num-frames 16 --image-size 600 1358 --num-sample $NUM_SAMPLE
-
-# == 32x720p ==
-# 1:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x720p_1_1 \
- --num-frames 32 --image-size 960 960 --num-sample $NUM_SAMPLE
-# 16:9
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x720p_16_9 \
- --num-frames 32 --image-size 720 1280 --num-sample $NUM_SAMPLE
-# 9:16
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x720p_9_16 \
- --num-frames 32 --image-size 1280 720 --num-sample $NUM_SAMPLE
-# 4:3
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x720p_4_3 \
- --num-frames 32 --image-size 832 1108 --num-sample $NUM_SAMPLE
-# 3:4
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x720p_3_4 \
- --num-frames 32 --image-size 1108 832 --num-sample $NUM_SAMPLE
-# 1:2
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x720p_1_2 \
- --num-frames 32 --image-size 1358 600 --num-sample $NUM_SAMPLE
-# 2:1
-eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --sample-name 32x720p_2_1 \
-    --num-frames 32 --image-size 600 1358 --num-sample $NUM_SAMPLE
-
-### End
-
-end=$(date +%s)
-
-runtime=$((end - start))
-
-echo "Runtime: $runtime seconds"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/eval/sample.sh b/PyTorch/built-in/mm/OpenSora1.1/eval/sample.sh
deleted file mode 100644
index 7c4386d2e60c317ba9b76cf9353b93d6e6f8f413..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/eval/sample.sh
+++ /dev/null
@@ -1,372 +0,0 @@
-#!/bin/bash
-
-# set -x
-set -e
-
-CKPT=$1
-
-CMD="python scripts/inference.py configs/opensora-v1-1/inference/sample.py"
-CMD_REF="python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py"
-if [[ $CKPT == *"ema"* ]]; then
- parentdir=$(dirname $CKPT)
- CKPT_BASE=$(basename $parentdir)_ema
-else
- CKPT_BASE=$(basename $CKPT)
-fi
-OUTPUT="./samples/samples_${CKPT_BASE}"
-start=$(date +%s)
-DEFAULT_BS=8
-
-### Functions
-
-function run_image() { # 10min
- # 1.1 1024x1024
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --image-size 1024 1024 --sample-name 1024x1024 --batch-size $DEFAULT_BS
-
- # 1.2 240x426
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --image-size 240 426 --sample-name 240x426 --end-index 3 --batch-size $DEFAULT_BS
-
- # 1.3 512x512
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --image-size 512 512 --sample-name 512x512 --end-index 3 --batch-size $DEFAULT_BS
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 1 --image-size 512 512 --sample-name 512x512 --end-index 3 --batch-size $DEFAULT_BS
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 1 --image-size 512 512 --sample-name 512x512 --end-index 3 --batch-size $DEFAULT_BS
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 1 --image-size 512 512 --sample-name 512x512 --end-index 3 --batch-size $DEFAULT_BS
-
- # 1.4 720p multi-resolution
- # 1:1
-    PROMPT="Bright scene, aerial view, ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens."
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --image-size 960 960 --sample-name 720p_1_1
- # 16:9
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --image-size 720 1280 --sample-name 720p_16_9
- # 9:16
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --image-size 1280 720 --sample-name 720p_9_16
- # 4:3
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --image-size 832 1108 --sample-name 720p_4_3
- # 3:4
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --image-size 1108 832 --sample-name 720p_3_4
- # 1:2
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --image-size 1358 600 --sample-name 720p_1_2
- # 2:1
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --image-size 600 1358 --sample-name 720p_2_1
-}
-
-function run_video_a() { # 30min, sample & multi-resolution
- # sample
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16 --image-size 144 256 --sample-name sample_16x144x256 --batch-size $DEFAULT_BS
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16 --image-size 240 426 --sample-name sample_16x240x426 --batch-size $DEFAULT_BS
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 32 --image-size 240 426 --sample-name sample_32x240x426 --batch-size $DEFAULT_BS
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 64 --image-size 240 426 --sample-name sample_64x240x426 --batch-size $DEFAULT_BS
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16 --image-size 480 854 --sample-name sample_16x480x854 --batch-size $DEFAULT_BS
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 32 --image-size 480 854 --sample-name sample_32x480x854 --batch-size $DEFAULT_BS
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16 --image-size 720 1280 --sample-name sample_16x720x1280 --batch-size $DEFAULT_BS
-}
-
-function run_video_b() { # 30min, short 32x240p & 64x240p
- # 32x240p, short
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 32 --image-size 240 426 --sample-name short_32x240x426 --batch-size $DEFAULT_BS
-
- # 64x240p, short
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 64 --image-size 240 426 --sample-name short_64x240x426 --batch-size $DEFAULT_BS
-}
-
-function run_video_c() { # 30min, sora 16x240p & sora 128x240p
- # 16x240p, sora
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16 --image-size 426 240 --sample-name sora_16x426x240 --batch-size $DEFAULT_BS
-
- # 16x240p, sora
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16 --image-size 240 426 --sample-name sora_16x240x426 --batch-size $DEFAULT_BS
-
- # 128x240p, sora
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 128 --image-size 240 426 --sample-name sora_128x240x426 --batch-size $DEFAULT_BS
-}
-
-function run_video_d() { # 30min, short 32x480p
- # 32x480p, short
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 32 --image-size 480 854 --sample-name short_32x480x854 --batch-size $DEFAULT_BS
-}
-
-function run_video_e() { # 30min
- # 64x480p, sora
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 64 --image-size 480 854 --sample-name sora_64x480x854 --batch-size 4
-}
-
-function run_video_f() { # 30min
- # 16x720p, sora
- eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16 --image-size 720 1280 --sample-name sora_16x720x1280 --batch-size $DEFAULT_BS
-}
-
-function run_video_g() {
- # 16x720p multi-resolution
- # 1:1
- PROMPT="A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures."
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 960 960 --sample-name 720p_1_1
- # 16:9
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 720 1280 --sample-name 720p_16_9
- # 9:16
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 1280 720 --sample-name 720p_9_16
- # 4:3
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 832 1108 --sample-name 720p_4_3
- # 3:4
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 1108 832 --sample-name 720p_3_4
- # 1:2
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 1358 600 --sample-name 720p_1_2
- # 2:1
- eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 600 1358 --sample-name 720p_2_1
-}
-
-function run_video_h() { # 23min
- # 3.1 image-conditioned long video generation
- eval $CMD_REF --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L10C4_16x240x426 \
- --prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \
- --num-frames 16 --image-size 240 426 \
- --loop 5 --condition-frame-length 4 \
- --reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \
- --mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS
-
- eval $CMD_REF --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L10C4_64x240x426 \
- --prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \
- --num-frames 64 --image-size 240 426 \
- --loop 5 --condition-frame-length 16 \
- --reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \
- --mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS
-
- # 3.2
- eval $CMD_REF --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L1_128x240x426 \
- --prompt-path assets/texts/t2v_ref.txt --start-index 3 --end-index 6 \
- --num-frames 128 --image-size 240 426 \
- --loop 1 \
- --reference-path assets/images/condition/cliff.png "assets/images/condition/cactus-sad.png\;assets/images/condition/cactus-happy.png" https://cdn.openai.com/tmp/s/interp/d0.mp4 \
- --mask-strategy "0\;0,0,0,-1,1" "0\;0,1,0,-1,1" "0,0,0,0,64,0.5" --batch-size $DEFAULT_BS
-}
-
-# vbench has 950 samples
-
-VBENCH_BS=32 # 80GB
-VBENCH_FRAMES=16
-VBENCH_H=240
-VBENCH_W=426
-
-function run_vbenck_a() { # 2h
- eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_dimension.txt \
- --batch-size $VBENCH_BS --num-frames $VBENCH_FRAMES --image-size $VBENCH_H $VBENCH_W --start-index 0 --end-index 120
-}
-
-function run_vbenck_b() { # 2h
- eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_dimension.txt --batch-size $VBENCH_BS --num-frames $VBENCH_FRAMES --image-size $VBENCH_H $VBENCH_W --start-index 120 --end-index 240
-}
-
-function run_vbenck_c() { # 2h
- eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_dimension.txt \
- --batch-size $VBENCH_BS --num-frames $VBENCH_FRAMES --image-size $VBENCH_H $VBENCH_W --start-index 240 --end-index 360
-}
-
-function run_vbenck_d() { # 2h
- eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_dimension.txt \
- --batch-size $VBENCH_BS --num-frames $VBENCH_FRAMES --image-size $VBENCH_H $VBENCH_W --start-index 360 --end-index 480
-}
-
-function run_vbenck_e() { # 2h
- eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_dimension.txt \
- --batch-size $VBENCH_BS --num-frames $VBENCH_FRAMES --image-size $VBENCH_H $VBENCH_W --start-index 480 --end-index 600
-}
-
-function run_vbenck_f() { # 2h
- eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_dimension.txt \
- --batch-size $VBENCH_BS --num-frames $VBENCH_FRAMES --image-size $VBENCH_H $VBENCH_W --start-index 600 --end-index 720
-}
-
-function run_vbenck_g() { # 2h
- eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_dimension.txt \
- --batch-size $VBENCH_BS --num-frames $VBENCH_FRAMES --image-size $VBENCH_H $VBENCH_W --start-index 720 --end-index 840
-}
-
-function run_vbenck_h() { # 2h
- eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_dimension.txt \
- --batch-size $VBENCH_BS --num-frames $VBENCH_FRAMES --image-size $VBENCH_H $VBENCH_W --start-index 840
-}
-
-# vbench-i2v has 1120 samples
-
-VBENCH_I2V_FRAMES=16
-VBENCH_I2V_H=256
-VBENCH_I2V_W=256
-
-function run_vbenck_i2v_a() {
- eval $CMD_REF --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_i2v.txt \
- --start-index 0 --end-index 140 \
- --num-frames $VBENCH_I2V_FRAMES --image-size $VBENCH_I2V_H $VBENCH_I2V_W --batch-size $VBENCH_BS
-}
-
-function run_vbenck_i2v_b() {
- eval $CMD_REF --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_i2v.txt \
- --start-index 140 --end-index 280 \
- --num-frames $VBENCH_I2V_FRAMES --image-size $VBENCH_I2V_H $VBENCH_I2V_W --batch-size $VBENCH_BS
-}
-
-function run_vbenck_i2v_c() {
- eval $CMD_REF --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_i2v.txt \
- --start-index 280 --end-index 420 \
- --num-frames $VBENCH_I2V_FRAMES --image-size $VBENCH_I2V_H $VBENCH_I2V_W --batch-size $VBENCH_BS
-}
-
-function run_vbenck_i2v_d() {
- eval $CMD_REF --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_i2v.txt \
- --start-index 420 --end-index 560 \
- --num-frames $VBENCH_I2V_FRAMES --image-size $VBENCH_I2V_H $VBENCH_I2V_W --batch-size $VBENCH_BS
-}
-
-function run_vbenck_i2v_e() {
- eval $CMD_REF --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_i2v.txt \
- --start-index 560 --end-index 700 \
- --num-frames $VBENCH_I2V_FRAMES --image-size $VBENCH_I2V_H $VBENCH_I2V_W --batch-size $VBENCH_BS
-}
-
-function run_vbenck_i2v_f() {
- eval $CMD_REF --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_i2v.txt \
- --start-index 700 --end-index 840 \
- --num-frames $VBENCH_I2V_FRAMES --image-size $VBENCH_I2V_H $VBENCH_I2V_W --batch-size $VBENCH_BS
-}
-
-function run_vbenck_i2v_g() {
- eval $CMD_REF --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_i2v.txt \
- --start-index 840 --end-index 980 \
- --num-frames $VBENCH_I2V_FRAMES --image-size $VBENCH_I2V_H $VBENCH_I2V_W --batch-size $VBENCH_BS
-}
-
-function run_vbenck_i2v_h() {
- eval $CMD_REF --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
- --prompt-path assets/texts/VBench/all_i2v.txt \
- --start-index 980 \
- --num-frames $VBENCH_I2V_FRAMES --image-size $VBENCH_I2V_H $VBENCH_I2V_W --batch-size $VBENCH_BS
-}
-
-### Main
-
-for arg in "$@"; do
- # image
- if [[ "$arg" = -1 ]] || [[ "$arg" = --image ]]; then
- echo "Running image samples..."
- run_image
- fi
- if [[ "$arg" = -2a ]] || [[ "$arg" = --video ]]; then
- echo "Running video samples a..."
- run_video_a
- fi
- if [[ "$arg" = -2b ]] || [[ "$arg" = --video ]]; then
- echo "Running video samples b..."
- run_video_b
- fi
- if [[ "$arg" = -2c ]] || [[ "$arg" = --video ]]; then
- echo "Running video samples c..."
- run_video_c
- fi
- if [[ "$arg" = -2d ]] || [[ "$arg" = --video ]]; then
- echo "Running video samples d..."
- run_video_d
- fi
- if [[ "$arg" = -2e ]] || [[ "$arg" = --video ]]; then
- echo "Running video samples e..."
- run_video_e
- fi
- if [[ "$arg" = -2f ]] || [[ "$arg" = --video ]]; then
- echo "Running video samples f..."
- run_video_f
- fi
- if [[ "$arg" = -2g ]] || [[ "$arg" = --video ]]; then
- echo "Running video samples g..."
- run_video_g
- fi
- if [[ "$arg" = -2h ]] || [[ "$arg" = --video ]]; then
- echo "Running video samples h..."
- run_video_h
- fi
- # vbench
- if [[ "$arg" = -4a ]] || [[ "$arg" = --vbench ]]; then
- echo "Running vbench samples a..."
- run_vbenck_a
- fi
- if [[ "$arg" = -4b ]] || [[ "$arg" = --vbench ]]; then
- echo "Running vbench samples b..."
- run_vbenck_b
- fi
- if [[ "$arg" = -4c ]] || [[ "$arg" = --vbench ]]; then
- echo "Running vbench samples c..."
- run_vbenck_c
- fi
- if [[ "$arg" = -4d ]] || [[ "$arg" = --vbench ]]; then
- echo "Running vbench samples d..."
- run_vbenck_d
- fi
- if [[ "$arg" = -4e ]] || [[ "$arg" = --vbench ]]; then
- echo "Running vbench samples e..."
- run_vbenck_e
- fi
- if [[ "$arg" = -4f ]] || [[ "$arg" = --vbench ]]; then
- echo "Running vbench samples f..."
- run_vbenck_f
- fi
- if [[ "$arg" = -4g ]] || [[ "$arg" = --vbench ]]; then
- echo "Running vbench samples g..."
- run_vbenck_g
- fi
- if [[ "$arg" = -4h ]] || [[ "$arg" = --vbench ]]; then
- echo "Running vbench samples h..."
- run_vbenck_h
- fi
- # vbench-i2v
- if [[ "$arg" = -5a ]] || [[ "$arg" = --vbench-i2v ]]; then
- echo "Running vbench-i2v samples a..."
- run_vbenck_i2v_a
- fi
- if [[ "$arg" = -5b ]] || [[ "$arg" = --vbench-i2v ]]; then
- echo "Running vbench-i2v samples b..."
- run_vbenck_i2v_b
- fi
- if [[ "$arg" = -5c ]] || [[ "$arg" = --vbench-i2v ]]; then
- echo "Running vbench-i2v samples c..."
- run_vbenck_i2v_c
- fi
- if [[ "$arg" = -5d ]] || [[ "$arg" = --vbench-i2v ]]; then
- echo "Running vbench-i2v samples d..."
- run_vbenck_i2v_d
- fi
- if [[ "$arg" = -5e ]] || [[ "$arg" = --vbench-i2v ]]; then
- echo "Running vbench-i2v samples e..."
- run_vbenck_i2v_e
- fi
- if [[ "$arg" = -5f ]] || [[ "$arg" = --vbench-i2v ]]; then
- echo "Running vbench-i2v samples f..."
- run_vbenck_i2v_f
- fi
- if [[ "$arg" = -5g ]] || [[ "$arg" = --vbench-i2v ]]; then
- echo "Running vbench-i2v samples g..."
- run_vbenck_i2v_g
- fi
- if [[ "$arg" = -5h ]] || [[ "$arg" = --vbench-i2v ]]; then
- echo "Running vbench-i2v samples h..."
- run_vbenck_i2v_h
- fi
-done
-
-### End
-
-end=$(date +%s)
-
-runtime=$((end - start))
-
-echo "Runtime: $runtime seconds"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/eval/vbench/VBench_full_info.json b/PyTorch/built-in/mm/OpenSora1.1/eval/vbench/VBench_full_info.json
deleted file mode 100644
index e60c40eb0050a5304791490972be3b32de309e4a..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/eval/vbench/VBench_full_info.json
+++ /dev/null
@@ -1,9132 +0,0 @@
-[
- {
- "prompt_en": "In a still frame, a stop sign",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "a toilet, frozen in time",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "a laptop, frozen in time",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of alley",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of bar",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of barn",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of bathroom",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of bedroom",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of cliff",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, courtyard",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, gas station",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of house",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "indoor gymnasium, frozen in time",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of indoor library",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of kitchen",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of palace",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, parking lot",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, phone booth",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of restaurant",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of tower",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a bowl",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of an apple",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a bench",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a bed",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a chair",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a cup",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a dining table",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, a pear",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a bunch of grapes",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a bowl on the kitchen counter",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a beautiful, handcrafted ceramic bowl",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of an antique bowl",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of an exquisite mahogany dining table",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a wooden bench in the park",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, a park bench with a view of the lake",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a vintage rocking chair was placed on the porch",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of the phone booth was tucked away in a quiet alley",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved fa\u00e7ades",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a country estate's library featured elegant wooden shelves",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of a peaceful orchid garden showcased a variety of delicate blooms",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time",
- "dimension": [
- "temporal_flickering"
- ]
- },
- {
- "prompt_en": "a bird and a cat",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "bird and cat"
- }
- }
- },
- {
- "prompt_en": "a cat and a dog",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "cat and dog"
- }
- }
- },
- {
- "prompt_en": "a dog and a horse",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "dog and horse"
- }
- }
- },
- {
- "prompt_en": "a horse and a sheep",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "horse and sheep"
- }
- }
- },
- {
- "prompt_en": "a sheep and a cow",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "sheep and cow"
- }
- }
- },
- {
- "prompt_en": "a cow and an elephant",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "cow and elephant"
- }
- }
- },
- {
- "prompt_en": "an elephant and a bear",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "elephant and bear"
- }
- }
- },
- {
- "prompt_en": "a bear and a zebra",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "bear and zebra"
- }
- }
- },
- {
- "prompt_en": "a zebra and a giraffe",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "zebra and giraffe"
- }
- }
- },
- {
- "prompt_en": "a giraffe and a bird",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "giraffe and bird"
- }
- }
- },
- {
- "prompt_en": "a chair and a couch",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "chair and couch"
- }
- }
- },
- {
- "prompt_en": "a couch and a potted plant",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "couch and potted plant"
- }
- }
- },
- {
- "prompt_en": "a potted plant and a tv",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "potted plant and tv"
- }
- }
- },
- {
- "prompt_en": "a tv and a laptop",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "tv and laptop"
- }
- }
- },
- {
- "prompt_en": "a laptop and a remote",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "laptop and remote"
- }
- }
- },
- {
- "prompt_en": "a remote and a keyboard",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "remote and keyboard"
- }
- }
- },
- {
- "prompt_en": "a keyboard and a cell phone",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "keyboard and cell phone"
- }
- }
- },
- {
- "prompt_en": "a cell phone and a book",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "cell phone and book"
- }
- }
- },
- {
- "prompt_en": "a book and a clock",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "book and clock"
- }
- }
- },
- {
- "prompt_en": "a clock and a backpack",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "clock and backpack"
- }
- }
- },
- {
- "prompt_en": "a backpack and an umbrella",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "backpack and umbrella"
- }
- }
- },
- {
- "prompt_en": "an umbrella and a handbag",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "umbrella and handbag"
- }
- }
- },
- {
- "prompt_en": "a handbag and a tie",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "handbag and tie"
- }
- }
- },
- {
- "prompt_en": "a tie and a suitcase",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "tie and suitcase"
- }
- }
- },
- {
- "prompt_en": "a suitcase and a vase",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "suitcase and vase"
- }
- }
- },
- {
- "prompt_en": "a vase and scissors",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "vase and scissors"
- }
- }
- },
- {
- "prompt_en": "scissors and a teddy bear",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "scissors and teddy bear"
- }
- }
- },
- {
- "prompt_en": "a teddy bear and a frisbee",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "teddy bear and frisbee"
- }
- }
- },
- {
- "prompt_en": "a frisbee and skis",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "frisbee and skis"
- }
- }
- },
- {
- "prompt_en": "skis and a snowboard",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "skis and snowboard"
- }
- }
- },
- {
- "prompt_en": "a snowboard and a sports ball",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "snowboard and sports ball"
- }
- }
- },
- {
- "prompt_en": "a sports ball and a kite",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "sports ball and kite"
- }
- }
- },
- {
- "prompt_en": "a kite and a baseball bat",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "kite and baseball bat"
- }
- }
- },
- {
- "prompt_en": "a baseball bat and a baseball glove",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "baseball bat and baseball glove"
- }
- }
- },
- {
- "prompt_en": "a baseball glove and a skateboard",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "baseball glove and skateboard"
- }
- }
- },
- {
- "prompt_en": "a skateboard and a surfboard",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "skateboard and surfboard"
- }
- }
- },
- {
- "prompt_en": "a surfboard and a tennis racket",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "surfboard and tennis racket"
- }
- }
- },
- {
- "prompt_en": "a tennis racket and a bottle",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "tennis racket and bottle"
- }
- }
- },
- {
- "prompt_en": "a bottle and a chair",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "bottle and chair"
- }
- }
- },
- {
- "prompt_en": "an airplane and a train",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "airplane and train"
- }
- }
- },
- {
- "prompt_en": "a train and a boat",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "train and boat"
- }
- }
- },
- {
- "prompt_en": "a boat and an airplane",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "boat and airplane"
- }
- }
- },
- {
- "prompt_en": "a bicycle and a car",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "bicycle and car"
- }
- }
- },
- {
- "prompt_en": "a car and a motorcycle",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "car and motorcycle"
- }
- }
- },
- {
- "prompt_en": "a motorcycle and a bus",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "motorcycle and bus"
- }
- }
- },
- {
- "prompt_en": "a bus and a traffic light",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "bus and traffic light"
- }
- }
- },
- {
- "prompt_en": "a traffic light and a fire hydrant",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "traffic light and fire hydrant"
- }
- }
- },
- {
- "prompt_en": "a fire hydrant and a stop sign",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "fire hydrant and stop sign"
- }
- }
- },
- {
- "prompt_en": "a stop sign and a parking meter",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "stop sign and parking meter"
- }
- }
- },
- {
- "prompt_en": "a parking meter and a truck",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "parking meter and truck"
- }
- }
- },
- {
- "prompt_en": "a truck and a bicycle",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "truck and bicycle"
- }
- }
- },
- {
- "prompt_en": "a toilet and a hair drier",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "toilet and hair drier"
- }
- }
- },
- {
- "prompt_en": "a hair drier and a toothbrush",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "hair drier and toothbrush"
- }
- }
- },
- {
- "prompt_en": "a toothbrush and a sink",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "toothbrush and sink"
- }
- }
- },
- {
- "prompt_en": "a sink and a toilet",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "sink and toilet"
- }
- }
- },
- {
- "prompt_en": "a wine glass and a chair",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "wine glass and chair"
- }
- }
- },
- {
- "prompt_en": "a cup and a couch",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "cup and couch"
- }
- }
- },
- {
- "prompt_en": "a fork and a potted plant",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "fork and potted plant"
- }
- }
- },
- {
- "prompt_en": "a knife and a tv",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "knife and tv"
- }
- }
- },
- {
- "prompt_en": "a spoon and a laptop",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "spoon and laptop"
- }
- }
- },
- {
- "prompt_en": "a bowl and a remote",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "bowl and remote"
- }
- }
- },
- {
- "prompt_en": "a banana and a keyboard",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "banana and keyboard"
- }
- }
- },
- {
- "prompt_en": "an apple and a cell phone",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "apple and cell phone"
- }
- }
- },
- {
- "prompt_en": "a sandwich and a book",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "sandwich and book"
- }
- }
- },
- {
- "prompt_en": "an orange and a clock",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "orange and clock"
- }
- }
- },
- {
- "prompt_en": "broccoli and a backpack",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "broccoli and backpack"
- }
- }
- },
- {
- "prompt_en": "a carrot and an umbrella",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "carrot and umbrella"
- }
- }
- },
- {
- "prompt_en": "a hot dog and a handbag",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "hot dog and handbag"
- }
- }
- },
- {
- "prompt_en": "a pizza and a tie",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "pizza and tie"
- }
- }
- },
- {
- "prompt_en": "a donut and a suitcase",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "donut and suitcase"
- }
- }
- },
- {
- "prompt_en": "a cake and a vase",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "cake and vase"
- }
- }
- },
- {
- "prompt_en": "an oven and scissors",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "oven and scissors"
- }
- }
- },
- {
- "prompt_en": "a toaster and a teddy bear",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "toaster and teddy bear"
- }
- }
- },
- {
- "prompt_en": "a microwave and a frisbee",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "microwave and frisbee"
- }
- }
- },
- {
- "prompt_en": "a refrigerator and skis",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "refrigerator and skis"
- }
- }
- },
- {
- "prompt_en": "a bicycle and an airplane",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "bicycle and airplane"
- }
- }
- },
- {
- "prompt_en": "a car and a train",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "car and train"
- }
- }
- },
- {
- "prompt_en": "a motorcycle and a boat",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "motorcycle and boat"
- }
- }
- },
- {
- "prompt_en": "a person and a toilet",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "person and toilet"
- }
- }
- },
- {
- "prompt_en": "a person and a hair drier",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "person and hair drier"
- }
- }
- },
- {
- "prompt_en": "a person and a toothbrush",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "person and toothbrush"
- }
- }
- },
- {
- "prompt_en": "a person and a sink",
- "dimension": [
- "multiple_objects"
- ],
- "auxiliary_info": {
- "multiple_objects": {
- "object": "person and sink"
- }
- }
- },
- {
- "prompt_en": "A person is riding a bike",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is marching",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is roller skating",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is tasting beer",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is clapping",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is drawing",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is petting animal (not cat)",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is eating watermelon",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is playing harp",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is wrestling",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is riding scooter",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is sweeping floor",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is skateboarding",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is dunking basketball",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is playing flute",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is stretching leg",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is tying tie",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is skydiving",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is shooting goal (soccer)",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is playing piano",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is finger snapping",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is canoeing or kayaking",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is laughing",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is digging",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is clay pottery making",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is shooting basketball",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is bending back",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is shaking hands",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is bandaging",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is push up",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is catching or throwing frisbee",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is playing trumpet",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is flying kite",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is filling eyebrows",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is shuffling cards",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is folding clothes",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is smoking",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is tai chi",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is squat",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is playing controller",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is throwing axe",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is giving or receiving award",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is air drumming",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is taking a shower",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is planting trees",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is sharpening knives",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is robot dancing",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is rock climbing",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is hula hooping",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is writing",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is bungee jumping",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is pushing cart",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is cleaning windows",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is cutting watermelon",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is cheerleading",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is washing hands",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is ironing",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is cutting nails",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is hugging",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is trimming or shaving beard",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is jogging",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is making bed",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is washing dishes",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is grooming dog",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is doing laundry",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is knitting",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is reading book",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is baby waking up",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is massaging legs",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is brushing teeth",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is crawling baby",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is motorcycling",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is driving car",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is sticking tongue out",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is shaking head",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is sword fighting",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is doing aerobics",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is strumming guitar",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is riding or walking with horse",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is archery",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is catching or throwing baseball",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is playing chess",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is rock scissors paper",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is using computer",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is arranging flowers",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is bending metal",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is ice skating",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is climbing a rope",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is crying",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is dancing ballet",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is getting a haircut",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is running on treadmill",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is kissing",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is counting money",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is barbequing",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is peeling apples",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is milking cow",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is shining shoes",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is making snowman",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "A person is sailing",
- "dimension": [
- "human_action"
- ]
- },
- {
- "prompt_en": "a person swimming in ocean",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a person giving a presentation to a room full of colleagues",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a person washing the dishes",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a person eating a burger",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a person walking in the snowstorm",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a person drinking coffee in a cafe",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a person playing guitar",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bicycle leaning against a tree",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bicycle gliding through a snowy field",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bicycle slowing down to stop",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bicycle accelerating to gain speed",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a car stuck in traffic during rush hour",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a car turning a corner",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a car slowing down to stop",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a car accelerating to gain speed",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a motorcycle cruising along a coastal highway",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a motorcycle turning a corner",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a motorcycle slowing down to stop",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a motorcycle gliding through a snowy field",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a motorcycle accelerating to gain speed",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "an airplane soaring through a clear blue sky",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "an airplane taking off",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "an airplane landing smoothly on a runway",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "an airplane accelerating to gain speed",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bus turning a corner",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bus stuck in traffic during rush hour",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bus accelerating to gain speed",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a train speeding down the tracks",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a train crossing over a tall bridge",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a train accelerating to gain speed",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a truck turning a corner",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a truck anchored in a tranquil bay",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a truck stuck in traffic during rush hour",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a truck slowing down to stop",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a truck accelerating to gain speed",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a boat sailing smoothly on a calm lake",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a boat slowing down to stop",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a boat accelerating to gain speed",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bird soaring gracefully in the sky",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bird building a nest from twigs and leaves",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bird flying over a snowy forest",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a cat grooming itself meticulously with its tongue",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a cat playing in park",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a cat drinking water",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a cat running happily",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a dog enjoying a peaceful walk",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a dog playing in park",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a dog drinking water",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a dog running happily",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a horse bending down to drink water from a river",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a horse galloping across an open field",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a horse taking a peaceful walk",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a horse running to join a herd of its kind",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a sheep bending down to drink water from a river",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a sheep taking a peaceful walk",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a sheep running to join a herd of its kind",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a cow bending down to drink water from a river",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a cow chewing cud while resting in a tranquil barn",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a cow running to join a herd of its kind",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "an elephant spraying itself with water using its trunk to cool down",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "an elephant taking a peaceful walk",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "an elephant running to join a herd of its kind",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bear catching a salmon in its powerful jaws",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bear sniffing the air for scents of food",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bear climbing a tree",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a bear hunting for prey",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a zebra bending down to drink water from a river",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a zebra running to join a herd of its kind",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a zebra taking a peaceful walk",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a giraffe bending down to drink water from a river",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a giraffe taking a peaceful walk",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a giraffe running to join a herd of its kind",
- "dimension": [
- "subject_consistency",
- "dynamic_degree",
- "motion_smoothness"
- ]
- },
- {
- "prompt_en": "a person",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "person"
- }
- }
- },
- {
- "prompt_en": "a bicycle",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "bicycle"
- }
- }
- },
- {
- "prompt_en": "a car",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "car"
- }
- }
- },
- {
- "prompt_en": "a motorcycle",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "motorcycle"
- }
- }
- },
- {
- "prompt_en": "an airplane",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "airplane"
- }
- }
- },
- {
- "prompt_en": "a bus",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "bus"
- }
- }
- },
- {
- "prompt_en": "a train",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "train"
- }
- }
- },
- {
- "prompt_en": "a truck",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "truck"
- }
- }
- },
- {
- "prompt_en": "a boat",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "boat"
- }
- }
- },
- {
- "prompt_en": "a traffic light",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "traffic light"
- }
- }
- },
- {
- "prompt_en": "a fire hydrant",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "fire hydrant"
- }
- }
- },
- {
- "prompt_en": "a stop sign",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "stop sign"
- }
- }
- },
- {
- "prompt_en": "a parking meter",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "parking meter"
- }
- }
- },
- {
- "prompt_en": "a bench",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "bench"
- }
- }
- },
- {
- "prompt_en": "a bird",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "bird"
- }
- }
- },
- {
- "prompt_en": "a cat",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "cat"
- }
- }
- },
- {
- "prompt_en": "a dog",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "dog"
- }
- }
- },
- {
- "prompt_en": "a horse",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "horse"
- }
- }
- },
- {
- "prompt_en": "a sheep",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "sheep"
- }
- }
- },
- {
- "prompt_en": "a cow",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "cow"
- }
- }
- },
- {
- "prompt_en": "an elephant",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "elephant"
- }
- }
- },
- {
- "prompt_en": "a bear",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "bear"
- }
- }
- },
- {
- "prompt_en": "a zebra",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "zebra"
- }
- }
- },
- {
- "prompt_en": "a giraffe",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "giraffe"
- }
- }
- },
- {
- "prompt_en": "a backpack",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "backpack"
- }
- }
- },
- {
- "prompt_en": "an umbrella",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "umbrella"
- }
- }
- },
- {
- "prompt_en": "a handbag",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "handbag"
- }
- }
- },
- {
- "prompt_en": "a tie",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "tie"
- }
- }
- },
- {
- "prompt_en": "a suitcase",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "suitcase"
- }
- }
- },
- {
- "prompt_en": "a frisbee",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "frisbee"
- }
- }
- },
- {
- "prompt_en": "skis",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "skis"
- }
- }
- },
- {
- "prompt_en": "a snowboard",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "snowboard"
- }
- }
- },
- {
- "prompt_en": "a sports ball",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "sports ball"
- }
- }
- },
- {
- "prompt_en": "a kite",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "kite"
- }
- }
- },
- {
- "prompt_en": "a baseball bat",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "baseball bat"
- }
- }
- },
- {
- "prompt_en": "a baseball glove",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "baseball glove"
- }
- }
- },
- {
- "prompt_en": "a skateboard",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "skateboard"
- }
- }
- },
- {
- "prompt_en": "a surfboard",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "surfboard"
- }
- }
- },
- {
- "prompt_en": "a tennis racket",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "tennis racket"
- }
- }
- },
- {
- "prompt_en": "a bottle",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "bottle"
- }
- }
- },
- {
- "prompt_en": "a wine glass",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "wine glass"
- }
- }
- },
- {
- "prompt_en": "a cup",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "cup"
- }
- }
- },
- {
- "prompt_en": "a fork",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "fork"
- }
- }
- },
- {
- "prompt_en": "a knife",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "knife"
- }
- }
- },
- {
- "prompt_en": "a spoon",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "spoon"
- }
- }
- },
- {
- "prompt_en": "a bowl",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "bowl"
- }
- }
- },
- {
- "prompt_en": "a banana",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "banana"
- }
- }
- },
- {
- "prompt_en": "an apple",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "apple"
- }
- }
- },
- {
- "prompt_en": "a sandwich",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "sandwich"
- }
- }
- },
- {
- "prompt_en": "an orange",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "orange"
- }
- }
- },
- {
- "prompt_en": "broccoli",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "broccoli"
- }
- }
- },
- {
- "prompt_en": "a carrot",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "carrot"
- }
- }
- },
- {
- "prompt_en": "a hot dog",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "hot dog"
- }
- }
- },
- {
- "prompt_en": "a pizza",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "pizza"
- }
- }
- },
- {
- "prompt_en": "a donut",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "donut"
- }
- }
- },
- {
- "prompt_en": "a cake",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "cake"
- }
- }
- },
- {
- "prompt_en": "a chair",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "chair"
- }
- }
- },
- {
- "prompt_en": "a couch",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "couch"
- }
- }
- },
- {
- "prompt_en": "a potted plant",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "potted plant"
- }
- }
- },
- {
- "prompt_en": "a bed",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "bed"
- }
- }
- },
- {
- "prompt_en": "a dining table",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "dining table"
- }
- }
- },
- {
- "prompt_en": "a toilet",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "toilet"
- }
- }
- },
- {
- "prompt_en": "a tv",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "tv"
- }
- }
- },
- {
- "prompt_en": "a laptop",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "laptop"
- }
- }
- },
- {
- "prompt_en": "a remote",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "remote"
- }
- }
- },
- {
- "prompt_en": "a keyboard",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "keyboard"
- }
- }
- },
- {
- "prompt_en": "a cell phone",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "cell phone"
- }
- }
- },
- {
- "prompt_en": "a microwave",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "microwave"
- }
- }
- },
- {
- "prompt_en": "an oven",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "oven"
- }
- }
- },
- {
- "prompt_en": "a toaster",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "toaster"
- }
- }
- },
- {
- "prompt_en": "a sink",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "sink"
- }
- }
- },
- {
- "prompt_en": "a refrigerator",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "refrigerator"
- }
- }
- },
- {
- "prompt_en": "a book",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "book"
- }
- }
- },
- {
- "prompt_en": "a clock",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "clock"
- }
- }
- },
- {
- "prompt_en": "a vase",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "vase"
- }
- }
- },
- {
- "prompt_en": "scissors",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "scissors"
- }
- }
- },
- {
- "prompt_en": "a teddy bear",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "teddy bear"
- }
- }
- },
- {
- "prompt_en": "a hair drier",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "hair drier"
- }
- }
- },
- {
- "prompt_en": "a toothbrush",
- "dimension": [
- "object_class"
- ],
- "auxiliary_info": {
- "object_class": {
- "object": "toothbrush"
- }
- }
- },
- {
- "prompt_en": "a red bicycle",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "red"
- }
- }
- },
- {
- "prompt_en": "a green bicycle",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "green"
- }
- }
- },
- {
- "prompt_en": "a blue bicycle",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "blue"
- }
- }
- },
- {
- "prompt_en": "a yellow bicycle",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "yellow"
- }
- }
- },
- {
- "prompt_en": "an orange bicycle",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "orange"
- }
- }
- },
- {
- "prompt_en": "a purple bicycle",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "purple"
- }
- }
- },
- {
- "prompt_en": "a pink bicycle",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "pink"
- }
- }
- },
- {
- "prompt_en": "a black bicycle",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "black"
- }
- }
- },
- {
- "prompt_en": "a white bicycle",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "white"
- }
- }
- },
- {
- "prompt_en": "a red car",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "red"
- }
- }
- },
- {
- "prompt_en": "a green car",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "green"
- }
- }
- },
- {
- "prompt_en": "a blue car",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "blue"
- }
- }
- },
- {
- "prompt_en": "a yellow car",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "yellow"
- }
- }
- },
- {
- "prompt_en": "an orange car",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "orange"
- }
- }
- },
- {
- "prompt_en": "a purple car",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "purple"
- }
- }
- },
- {
- "prompt_en": "a pink car",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "pink"
- }
- }
- },
- {
- "prompt_en": "a black car",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "black"
- }
- }
- },
- {
- "prompt_en": "a white car",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "white"
- }
- }
- },
- {
- "prompt_en": "a red bird",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "red"
- }
- }
- },
- {
- "prompt_en": "a green bird",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "green"
- }
- }
- },
- {
- "prompt_en": "a blue bird",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "blue"
- }
- }
- },
- {
- "prompt_en": "a yellow bird",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "yellow"
- }
- }
- },
- {
- "prompt_en": "an orange bird",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "orange"
- }
- }
- },
- {
- "prompt_en": "a purple bird",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "purple"
- }
- }
- },
- {
- "prompt_en": "a pink bird",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "pink"
- }
- }
- },
- {
- "prompt_en": "a black bird",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "black"
- }
- }
- },
- {
- "prompt_en": "a white bird",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "white"
- }
- }
- },
- {
- "prompt_en": "a black cat",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "black"
- }
- }
- },
- {
- "prompt_en": "a white cat",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "white"
- }
- }
- },
- {
- "prompt_en": "an orange cat",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "orange"
- }
- }
- },
- {
- "prompt_en": "a yellow cat",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "yellow"
- }
- }
- },
- {
- "prompt_en": "a red umbrella",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "red"
- }
- }
- },
- {
- "prompt_en": "a green umbrella",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "green"
- }
- }
- },
- {
- "prompt_en": "a blue umbrella",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "blue"
- }
- }
- },
- {
- "prompt_en": "a yellow umbrella",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "yellow"
- }
- }
- },
- {
- "prompt_en": "an orange umbrella",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "orange"
- }
- }
- },
- {
- "prompt_en": "a purple umbrella",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "purple"
- }
- }
- },
- {
- "prompt_en": "a pink umbrella",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "pink"
- }
- }
- },
- {
- "prompt_en": "a black umbrella",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "black"
- }
- }
- },
- {
- "prompt_en": "a white umbrella",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "white"
- }
- }
- },
- {
- "prompt_en": "a red suitcase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "red"
- }
- }
- },
- {
- "prompt_en": "a green suitcase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "green"
- }
- }
- },
- {
- "prompt_en": "a blue suitcase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "blue"
- }
- }
- },
- {
- "prompt_en": "a yellow suitcase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "yellow"
- }
- }
- },
- {
- "prompt_en": "an orange suitcase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "orange"
- }
- }
- },
- {
- "prompt_en": "a purple suitcase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "purple"
- }
- }
- },
- {
- "prompt_en": "a pink suitcase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "pink"
- }
- }
- },
- {
- "prompt_en": "a black suitcase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "black"
- }
- }
- },
- {
- "prompt_en": "a white suitcase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "white"
- }
- }
- },
- {
- "prompt_en": "a red bowl",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "red"
- }
- }
- },
- {
- "prompt_en": "a green bowl",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "green"
- }
- }
- },
- {
- "prompt_en": "a blue bowl",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "blue"
- }
- }
- },
- {
- "prompt_en": "a yellow bowl",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "yellow"
- }
- }
- },
- {
- "prompt_en": "an orange bowl",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "orange"
- }
- }
- },
- {
- "prompt_en": "a purple bowl",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "purple"
- }
- }
- },
- {
- "prompt_en": "a pink bowl",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "pink"
- }
- }
- },
- {
- "prompt_en": "a black bowl",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "black"
- }
- }
- },
- {
- "prompt_en": "a white bowl",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "white"
- }
- }
- },
- {
- "prompt_en": "a red chair",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "red"
- }
- }
- },
- {
- "prompt_en": "a green chair",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "green"
- }
- }
- },
- {
- "prompt_en": "a blue chair",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "blue"
- }
- }
- },
- {
- "prompt_en": "a yellow chair",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "yellow"
- }
- }
- },
- {
- "prompt_en": "an orange chair",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "orange"
- }
- }
- },
- {
- "prompt_en": "a purple chair",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "purple"
- }
- }
- },
- {
- "prompt_en": "a pink chair",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "pink"
- }
- }
- },
- {
- "prompt_en": "a black chair",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "black"
- }
- }
- },
- {
- "prompt_en": "a white chair",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "white"
- }
- }
- },
- {
- "prompt_en": "a red clock",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "red"
- }
- }
- },
- {
- "prompt_en": "a green clock",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "green"
- }
- }
- },
- {
- "prompt_en": "a blue clock",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "blue"
- }
- }
- },
- {
- "prompt_en": "a yellow clock",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "yellow"
- }
- }
- },
- {
- "prompt_en": "an orange clock",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "orange"
- }
- }
- },
- {
- "prompt_en": "a purple clock",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "purple"
- }
- }
- },
- {
- "prompt_en": "a pink clock",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "pink"
- }
- }
- },
- {
- "prompt_en": "a black clock",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "black"
- }
- }
- },
- {
- "prompt_en": "a white clock",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "white"
- }
- }
- },
- {
- "prompt_en": "a red vase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "red"
- }
- }
- },
- {
- "prompt_en": "a green vase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "green"
- }
- }
- },
- {
- "prompt_en": "a blue vase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "blue"
- }
- }
- },
- {
- "prompt_en": "a yellow vase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "yellow"
- }
- }
- },
- {
- "prompt_en": "an orange vase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "orange"
- }
- }
- },
- {
- "prompt_en": "a purple vase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "purple"
- }
- }
- },
- {
- "prompt_en": "a pink vase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "pink"
- }
- }
- },
- {
- "prompt_en": "a black vase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "black"
- }
- }
- },
- {
- "prompt_en": "a white vase",
- "dimension": [
- "color"
- ],
- "auxiliary_info": {
- "color": {
- "color": "white"
- }
- }
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, Van Gogh style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "Van Gogh style"
- }
- }
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, oil painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "oil painting"
- }
- }
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "by Hokusai, in the style of Ukiyo"
- }
- }
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, black and white",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "black and white"
- }
- }
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pixel art",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "pixel art"
- }
- }
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, in cyberpunk style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "in cyberpunk style"
- }
- }
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, animated style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "animated style"
- }
- }
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, watercolor painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "watercolor painting"
- }
- }
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, surrealism style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "surrealism style"
- }
- }
- },
- {
- "prompt_en": "The bund Shanghai, Van Gogh style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "Van Gogh style"
- }
- }
- },
- {
- "prompt_en": "The bund Shanghai, oil painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "oil painting"
- }
- }
- },
- {
- "prompt_en": "The bund Shanghai by Hokusai, in the style of Ukiyo",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "by Hokusai, in the style of Ukiyo"
- }
- }
- },
- {
- "prompt_en": "The bund Shanghai, black and white",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "black and white"
- }
- }
- },
- {
- "prompt_en": "The bund Shanghai, pixel art",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "pixel art"
- }
- }
- },
- {
- "prompt_en": "The bund Shanghai, in cyberpunk style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "in cyberpunk style"
- }
- }
- },
- {
- "prompt_en": "The bund Shanghai, animated style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "animated style"
- }
- }
- },
- {
- "prompt_en": "The bund Shanghai, watercolor painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "watercolor painting"
- }
- }
- },
- {
- "prompt_en": "The bund Shanghai, surrealism style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "surrealism style"
- }
- }
- },
- {
- "prompt_en": "a shark is swimming in the ocean, Van Gogh style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "Van Gogh style"
- }
- }
- },
- {
- "prompt_en": "a shark is swimming in the ocean, oil painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "oil painting"
- }
- }
- },
- {
- "prompt_en": "a shark is swimming in the ocean by Hokusai, in the style of Ukiyo",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "by Hokusai, in the style of Ukiyo"
- }
- }
- },
- {
- "prompt_en": "a shark is swimming in the ocean, black and white",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "black and white"
- }
- }
- },
- {
- "prompt_en": "a shark is swimming in the ocean, pixel art",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "pixel art"
- }
- }
- },
- {
- "prompt_en": "a shark is swimming in the ocean, in cyberpunk style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "in cyberpunk style"
- }
- }
- },
- {
- "prompt_en": "a shark is swimming in the ocean, animated style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "animated style"
- }
- }
- },
- {
- "prompt_en": "a shark is swimming in the ocean, watercolor painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "watercolor painting"
- }
- }
- },
- {
- "prompt_en": "a shark is swimming in the ocean, surrealism style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "surrealism style"
- }
- }
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, Van Gogh style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "Van Gogh style"
- }
- }
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, oil painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "oil painting"
- }
- }
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris by Hokusai, in the style of Ukiyo",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "by Hokusai, in the style of Ukiyo"
- }
- }
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, black and white",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "black and white"
- }
- }
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, pixel art",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "pixel art"
- }
- }
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, in cyberpunk style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "in cyberpunk style"
- }
- }
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, animated style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "animated style"
- }
- }
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, watercolor painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "watercolor painting"
- }
- }
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, surrealism style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "surrealism style"
- }
- }
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, Van Gogh style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "Van Gogh style"
- }
- }
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, oil painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "oil painting"
- }
- }
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset by Hokusai, in the style of Ukiyo",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "by Hokusai, in the style of Ukiyo"
- }
- }
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, black and white",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "black and white"
- }
- }
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, pixel art",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "pixel art"
- }
- }
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, in cyberpunk style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "in cyberpunk style"
- }
- }
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, animated style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "animated style"
- }
- }
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, watercolor painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "watercolor painting"
- }
- }
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, surrealism style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "surrealism style"
- }
- }
- },
- {
- "prompt_en": "Gwen Stacy reading a book, Van Gogh style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "Van Gogh style"
- }
- }
- },
- {
- "prompt_en": "Gwen Stacy reading a book, oil painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "oil painting"
- }
- }
- },
- {
- "prompt_en": "Gwen Stacy reading a book by Hokusai, in the style of Ukiyo",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "by Hokusai, in the style of Ukiyo"
- }
- }
- },
- {
- "prompt_en": "Gwen Stacy reading a book, black and white",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "black and white"
- }
- }
- },
- {
- "prompt_en": "Gwen Stacy reading a book, pixel art",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "pixel art"
- }
- }
- },
- {
- "prompt_en": "Gwen Stacy reading a book, in cyberpunk style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "in cyberpunk style"
- }
- }
- },
- {
- "prompt_en": "Gwen Stacy reading a book, animated style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "animated style"
- }
- }
- },
- {
- "prompt_en": "Gwen Stacy reading a book, watercolor painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "watercolor painting"
- }
- }
- },
- {
- "prompt_en": "Gwen Stacy reading a book, surrealism style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "surrealism style"
- }
- }
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, Van Gogh style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "Van Gogh style"
- }
- }
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, oil painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "oil painting"
- }
- }
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Hokusai, in the style of Ukiyo",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "by Hokusai, in the style of Ukiyo"
- }
- }
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, black and white",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "black and white"
- }
- }
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pixel art",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "pixel art"
- }
- }
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in cyberpunk style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "in cyberpunk style"
- }
- }
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, animated style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "animated style"
- }
- }
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, watercolor painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "watercolor painting"
- }
- }
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, surrealism style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "surrealism style"
- }
- }
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, Van Gogh style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "Van Gogh style"
- }
- }
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, oil painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "oil painting"
- }
- }
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas by Hokusai, in the style of Ukiyo",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "by Hokusai, in the style of Ukiyo"
- }
- }
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, black and white",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "black and white"
- }
- }
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pixel art",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "pixel art"
- }
- }
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in cyberpunk style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "in cyberpunk style"
- }
- }
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, animated style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "animated style"
- }
- }
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, watercolor painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "watercolor painting"
- }
- }
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, surrealism style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "surrealism style"
- }
- }
- },
- {
- "prompt_en": "An astronaut flying in space, Van Gogh style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "Van Gogh style"
- }
- }
- },
- {
- "prompt_en": "An astronaut flying in space, oil painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "oil painting"
- }
- }
- },
- {
- "prompt_en": "An astronaut flying in space by Hokusai, in the style of Ukiyo",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "by Hokusai, in the style of Ukiyo"
- }
- }
- },
- {
- "prompt_en": "An astronaut flying in space, black and white",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "black and white"
- }
- }
- },
- {
- "prompt_en": "An astronaut flying in space, pixel art",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "pixel art"
- }
- }
- },
- {
- "prompt_en": "An astronaut flying in space, in cyberpunk style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "in cyberpunk style"
- }
- }
- },
- {
- "prompt_en": "An astronaut flying in space, animated style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "animated style"
- }
- }
- },
- {
- "prompt_en": "An astronaut flying in space, watercolor painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "watercolor painting"
- }
- }
- },
- {
- "prompt_en": "An astronaut flying in space, surrealism style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "surrealism style"
- }
- }
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, Van Gogh style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "Van Gogh style"
- }
- }
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, oil painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "oil painting"
- }
- }
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks by Hokusai, in the style of Ukiyo",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "by Hokusai, in the style of Ukiyo"
- }
- }
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, black and white",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "black and white"
- }
- }
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pixel art",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "pixel art"
- }
- }
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in cyberpunk style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "in cyberpunk style"
- }
- }
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, animated style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "animated style"
- }
- }
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, watercolor painting",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "watercolor painting"
- }
- }
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, surrealism style",
- "dimension": [
- "appearance_style"
- ],
- "auxiliary_info": {
- "appearance_style": {
- "appearance_style": "surrealism style"
- }
- }
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, in super slow motion",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, zoom in",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, zoom out",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pan left",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pan right",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, tilt up",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, tilt down",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, with an intense shaking effect",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, featuring a steady and smooth perspective",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, racking focus",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "The bund Shanghai, in super slow motion",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "The bund Shanghai, zoom in",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "The bund Shanghai, zoom out",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "The bund Shanghai, pan left",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "The bund Shanghai, pan right",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "The bund Shanghai, tilt up",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "The bund Shanghai, tilt down",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "The bund Shanghai, with an intense shaking effect",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "The bund Shanghai, featuring a steady and smooth perspective",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "The bund Shanghai, racking focus",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "a shark is swimming in the ocean, in super slow motion",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "a shark is swimming in the ocean, zoom in",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "a shark is swimming in the ocean, zoom out",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "a shark is swimming in the ocean, pan left",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "a shark is swimming in the ocean, pan right",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "a shark is swimming in the ocean, tilt up",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "a shark is swimming in the ocean, tilt down",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "a shark is swimming in the ocean, with an intense shaking effect",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "a shark is swimming in the ocean, featuring a steady and smooth perspective",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "a shark is swimming in the ocean, racking focus",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, in super slow motion",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, zoom in",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, zoom out",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, pan left",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, pan right",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, tilt up",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, tilt down",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, with an intense shaking effect",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, featuring a steady and smooth perspective",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris, racking focus",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, in super slow motion",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, zoom in",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, zoom out",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, pan left",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, pan right",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, tilt up",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, tilt down",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, with an intense shaking effect",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, featuring a steady and smooth perspective",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset, racking focus",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Gwen Stacy reading a book, in super slow motion",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Gwen Stacy reading a book, zoom in",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Gwen Stacy reading a book, zoom out",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Gwen Stacy reading a book, pan left",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Gwen Stacy reading a book, pan right",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Gwen Stacy reading a book, tilt up",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Gwen Stacy reading a book, tilt down",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Gwen Stacy reading a book, with an intense shaking effect",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Gwen Stacy reading a book, featuring a steady and smooth perspective",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Gwen Stacy reading a book, racking focus",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in super slow motion",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom in",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom out",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan left",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan right",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt up",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt down",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, with an intense shaking effect",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, featuring a steady and smooth perspective",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, racking focus",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in super slow motion",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom in",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom out",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan left",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan right",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt up",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt down",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, with an intense shaking effect",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, featuring a steady and smooth perspective",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, racking focus",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "An astronaut flying in space, in super slow motion",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "An astronaut flying in space, zoom in",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "An astronaut flying in space, zoom out",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "An astronaut flying in space, pan left",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "An astronaut flying in space, pan right",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "An astronaut flying in space, tilt up",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "An astronaut flying in space, tilt down",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "An astronaut flying in space, with an intense shaking effect",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "An astronaut flying in space, featuring a steady and smooth perspective",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "An astronaut flying in space, racking focus",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in super slow motion",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom in",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom out",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan left",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan right",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt up",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt down",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, with an intense shaking effect",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, featuring a steady and smooth perspective",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, racking focus",
- "dimension": [
- "temporal_style"
- ]
- },
- {
- "prompt_en": "Close up of grapes on a rotating table.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Turtle swimming in ocean.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A storm trooper vacuuming the beach.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A panda standing on a surfboard in the ocean in sunset.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "An astronaut feeding ducks on a sunny afternoon, reflection from the water.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Two pandas discussing an academic paper.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Sunset time lapse at the beach with moving clouds and colors in the sky.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A fat rabbit wearing a purple robe walking through a fantasy landscape.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A koala bear playing piano in the forest.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "An astronaut flying in space.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Fireworks.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "An animated painting of fluffy white clouds moving in sky.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Flying through fantasy landscapes.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A bigfoot walking in the snowstorm.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A squirrel eating a burger.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A cat wearing sunglasses and working as a lifeguard at a pool.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Splash of turquoise water in extreme slow motion, alpha channel included.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "an ice cream is melting on the table.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "a drone flying over a snowy forest.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "a shark is swimming in the ocean.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Aerial panoramic video from a drone of a fantasy land.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "a teddy bear is swimming in the ocean.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "time lapse of sunrise on mars.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "golden fish swimming in the ocean.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "An artist brush painting on a canvas close up.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A drone view of celebration with Christmas tree and fireworks, starry sky - background.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "happy dog wearing a yellow turtleneck, studio, portrait, facing camera, dark background",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Origami dancers in white paper, 3D render, on white background, studio shot, dancing modern dance.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Campfire at night in a snowy forest with starry sky in the background.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "a fantasy landscape",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A 3D model of a 1800s victorian house.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "this is how I do makeup in the morning.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A raccoon that looks like a turtle, digital art.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Robot dancing in Times Square.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Busy freeway at night.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Balloon full of water exploding in extreme slow motion.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "An astronaut is riding a horse in the space in a photorealistic style.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Macro slo-mo. Slow motion cropped closeup of roasted coffee beans falling into an empty bowl.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Sewing machine, old sewing machine working.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Motion colour drop in water, ink swirling in water, colourful ink in water, abstraction fancy dream cloud of ink.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Few big purple plums rotating on the turntable. water drops appear on the skin during rotation. isolated on the white background. close-up. macro.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Vampire makeup face of beautiful girl, red contact lenses.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Ashtray full of butts on table, smoke flowing on black background, close-up",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Pacific coast, carmel by the sea ocean and waves.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A teddy bear is playing drum kit in NYC Times Square.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A corgi is playing drum kit.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "An Iron man is playing the electronic guitar, high electronic guitar.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A raccoon is playing the electronic guitar.",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Vincent van Gogh",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A corgi's head depicted as an explosion of a nebula",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A fantasy landscape",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A future where humans have achieved teleportation technology",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A jellyfish floating through the ocean, with bioluminescent tentacles",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A Mars rover moving on Mars",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A panda drinking coffee in a cafe in Paris",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A space shuttle launching into orbit, with flames and smoke billowing out from the engines",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A steam train moving on a mountainside",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A super cool giant robot in Cyberpunk Beijing",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A tropical beach at sunrise, with palm trees and crystal-clear water in the foreground",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Cinematic shot of Van Gogh's selfie, Van Gogh style",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Gwen Stacy reading a book",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Iron Man flying in the sky",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "The bund Shanghai, oil painting",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Yoda playing guitar on the stage",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Vincent van Gogh",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A car moving slowly on an empty street, rainy evening",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A cat eating food out of a bowl",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A cat wearing sunglasses at a pool",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A confused panda in calculus class",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A cute fluffy panda eating Chinese food in a restaurant",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A cute happy Corgi playing in park, sunset",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A cute raccoon playing guitar in a boat on the ocean",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A happy fuzzy panda playing guitar nearby a campfire, snow mountain in the background",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A lightning striking atop of eiffel tower, dark clouds in the sky",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A modern art museum, with colorful paintings",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A panda cooking in the kitchen",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A panda playing on a swing set",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A polar bear is playing guitar",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A raccoon dressed in suit playing the trumpet, stage background",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A robot DJ is playing the turntable, in heavy raining futuristic tokyo rooftop cyberpunk night, sci-fi, fantasy",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A shark swimming in clear Caribbean ocean",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A super robot protecting city",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "A teddy bear washing the dishes",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "An epic tornado attacking above a glowing city at night, the tornado is made of smoke",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with umbrellas",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Clown fish swimming through the coral reef",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Hyper-realistic spaceship landing on Mars",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "The bund Shanghai, vibrant color",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Vincent van Gogh is painting in the room",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "Yellow flowers swing in the wind",
- "dimension": [
- "overall_consistency",
- "aesthetic_quality",
- "imaging_quality"
- ]
- },
- {
- "prompt_en": "alley",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "alley"
- }
- }
- }
- },
- {
- "prompt_en": "amusement park",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "amusement park"
- }
- }
- }
- },
- {
- "prompt_en": "aquarium",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "aquarium"
- }
- }
- }
- },
- {
- "prompt_en": "arch",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "arch"
- }
- }
- }
- },
- {
- "prompt_en": "art gallery",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "art gallery"
- }
- }
- }
- },
- {
- "prompt_en": "bathroom",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "bathroom"
- }
- }
- }
- },
- {
- "prompt_en": "bakery shop",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "bakery shop"
- }
- }
- }
- },
- {
- "prompt_en": "ballroom",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "ballroom"
- }
- }
- }
- },
- {
- "prompt_en": "bar",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "bar"
- }
- }
- }
- },
- {
- "prompt_en": "barn",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "barn"
- }
- }
- }
- },
- {
- "prompt_en": "basement",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "basement"
- }
- }
- }
- },
- {
- "prompt_en": "beach",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "beach"
- }
- }
- }
- },
- {
- "prompt_en": "bedroom",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "bedroom"
- }
- }
- }
- },
- {
- "prompt_en": "bridge",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "bridge"
- }
- }
- }
- },
- {
- "prompt_en": "botanical garden",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "botanical garden"
- }
- }
- }
- },
- {
- "prompt_en": "cafeteria",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "cafeteria"
- }
- }
- }
- },
- {
- "prompt_en": "campsite",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "campsite"
- }
- }
- }
- },
- {
- "prompt_en": "campus",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "campus"
- }
- }
- }
- },
- {
- "prompt_en": "carrousel",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "carrousel"
- }
- }
- }
- },
- {
- "prompt_en": "castle",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "castle"
- }
- }
- }
- },
- {
- "prompt_en": "cemetery",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "cemetery"
- }
- }
- }
- },
- {
- "prompt_en": "classroom",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "classroom"
- }
- }
- }
- },
- {
- "prompt_en": "cliff",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "cliff"
- }
- }
- }
- },
- {
- "prompt_en": "crosswalk",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "crosswalk"
- }
- }
- }
- },
- {
- "prompt_en": "construction site",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "construction site"
- }
- }
- }
- },
- {
- "prompt_en": "corridor",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "corridor"
- }
- }
- }
- },
- {
- "prompt_en": "courtyard",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "courtyard"
- }
- }
- }
- },
- {
- "prompt_en": "desert",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "desert"
- }
- }
- }
- },
- {
- "prompt_en": "downtown",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "downtown"
- }
- }
- }
- },
- {
- "prompt_en": "driveway",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "driveway"
- }
- }
- }
- },
- {
- "prompt_en": "farm",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "farm"
- }
- }
- }
- },
- {
- "prompt_en": "food court",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "food court"
- }
- }
- }
- },
- {
- "prompt_en": "football field",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "football field"
- }
- }
- }
- },
- {
- "prompt_en": "forest road",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "forest road"
- }
- }
- }
- },
- {
- "prompt_en": "fountain",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "fountain"
- }
- }
- }
- },
- {
- "prompt_en": "gas station",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "gas station"
- }
- }
- }
- },
- {
- "prompt_en": "glacier",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "glacier"
- }
- }
- }
- },
- {
- "prompt_en": "golf course",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "golf course"
- }
- }
- }
- },
- {
- "prompt_en": "indoor gymnasium",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "indoor gymnasium"
- }
- }
- }
- },
- {
- "prompt_en": "harbor",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "harbor"
- }
- }
- }
- },
- {
- "prompt_en": "highway",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "highway"
- }
- }
- }
- },
- {
- "prompt_en": "hospital",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "hospital"
- }
- }
- }
- },
- {
- "prompt_en": "house",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "house"
- }
- }
- }
- },
- {
- "prompt_en": "iceberg",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "iceberg"
- }
- }
- }
- },
- {
- "prompt_en": "industrial area",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "industrial area"
- }
- }
- }
- },
- {
- "prompt_en": "jail cell",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "jail cell"
- }
- }
- }
- },
- {
- "prompt_en": "junkyard",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "junkyard"
- }
- }
- }
- },
- {
- "prompt_en": "kitchen",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "kitchen"
- }
- }
- }
- },
- {
- "prompt_en": "indoor library",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "indoor library"
- }
- }
- }
- },
- {
- "prompt_en": "lighthouse",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "lighthouse"
- }
- }
- }
- },
- {
- "prompt_en": "laboratory",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "laboratory"
- }
- }
- }
- },
- {
- "prompt_en": "mansion",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "mansion"
- }
- }
- }
- },
- {
- "prompt_en": "marsh",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "marsh"
- }
- }
- }
- },
- {
- "prompt_en": "mountain",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "mountain"
- }
- }
- }
- },
- {
- "prompt_en": "indoor movie theater",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "indoor movie theater"
- }
- }
- }
- },
- {
- "prompt_en": "indoor museum",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "indoor museum"
- }
- }
- }
- },
- {
- "prompt_en": "music studio",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "music studio"
- }
- }
- }
- },
- {
- "prompt_en": "nursery",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "nursery"
- }
- }
- }
- },
- {
- "prompt_en": "ocean",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "ocean"
- }
- }
- }
- },
- {
- "prompt_en": "office",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "office"
- }
- }
- }
- },
- {
- "prompt_en": "palace",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "palace"
- }
- }
- }
- },
- {
- "prompt_en": "parking lot",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "parking lot"
- }
- }
- }
- },
- {
- "prompt_en": "pharmacy",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "pharmacy"
- }
- }
- }
- },
- {
- "prompt_en": "phone booth",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "phone booth"
- }
- }
- }
- },
- {
- "prompt_en": "raceway",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "raceway"
- }
- }
- }
- },
- {
- "prompt_en": "restaurant",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "restaurant"
- }
- }
- }
- },
- {
- "prompt_en": "river",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "river"
- }
- }
- }
- },
- {
- "prompt_en": "science museum",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "science museum"
- }
- }
- }
- },
- {
- "prompt_en": "shower",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "shower"
- }
- }
- }
- },
- {
- "prompt_en": "ski slope",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "ski slope"
- }
- }
- }
- },
- {
- "prompt_en": "sky",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "sky"
- }
- }
- }
- },
- {
- "prompt_en": "skyscraper",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "skyscraper"
- }
- }
- }
- },
- {
- "prompt_en": "baseball stadium",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "baseball stadium"
- }
- }
- }
- },
- {
- "prompt_en": "staircase",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "staircase"
- }
- }
- }
- },
- {
- "prompt_en": "street",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "street"
- }
- }
- }
- },
- {
- "prompt_en": "supermarket",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "supermarket"
- }
- }
- }
- },
- {
- "prompt_en": "indoor swimming pool",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "indoor swimming pool"
- }
- }
- }
- },
- {
- "prompt_en": "tower",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "tower"
- }
- }
- }
- },
- {
- "prompt_en": "outdoor track",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "outdoor track"
- }
- }
- }
- },
- {
- "prompt_en": "train railway",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "train railway"
- }
- }
- }
- },
- {
- "prompt_en": "train station platform",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "train station platform"
- }
- }
- }
- },
- {
- "prompt_en": "underwater coral reef",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "underwater coral reef"
- }
- }
- }
- },
- {
- "prompt_en": "valley",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "valley"
- }
- }
- }
- },
- {
- "prompt_en": "volcano",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "volcano"
- }
- }
- }
- },
- {
- "prompt_en": "waterfall",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "waterfall"
- }
- }
- }
- },
- {
- "prompt_en": "windmill",
- "dimension": [
- "scene",
- "background_consistency"
- ],
- "auxiliary_info": {
- "scene": {
- "scene": {
- "scene": "windmill"
- }
- }
- }
- },
- {
- "prompt_en": "a bicycle on the left of a car, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "bicycle",
- "object_b": "car",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a car on the right of a motorcycle, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "car",
- "object_b": "motorcycle",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a motorcycle on the left of a bus, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "motorcycle",
- "object_b": "bus",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a bus on the right of a traffic light, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "bus",
- "object_b": "traffic light",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a traffic light on the left of a fire hydrant, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "traffic light",
- "object_b": "fire hydrant",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a fire hydrant on the right of a stop sign, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "fire hydrant",
- "object_b": "stop sign",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a stop sign on the left of a parking meter, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "stop sign",
- "object_b": "parking meter",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a parking meter on the right of a bench, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "parking meter",
- "object_b": "bench",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a bench on the left of a truck, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "bench",
- "object_b": "truck",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a truck on the right of a bicycle, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "truck",
- "object_b": "bicycle",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a bird on the left of a cat, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "bird",
- "object_b": "cat",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a cat on the right of a dog, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "cat",
- "object_b": "dog",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a dog on the left of a horse, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "dog",
- "object_b": "horse",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a horse on the right of a sheep, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "horse",
- "object_b": "sheep",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a sheep on the left of a cow, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "sheep",
- "object_b": "cow",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a cow on the right of an elephant, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "cow",
- "object_b": "elephant",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "an elephant on the left of a bear, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "elephant",
- "object_b": "bear",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a bear on the right of a zebra, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "bear",
- "object_b": "zebra",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a zebra on the left of a giraffe, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "zebra",
- "object_b": "giraffe",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a giraffe on the right of a bird, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "giraffe",
- "object_b": "bird",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a bottle on the left of a wine glass, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "bottle",
- "object_b": "wine glass",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a wine glass on the right of a cup, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "wine glass",
- "object_b": "cup",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a cup on the left of a fork, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "cup",
- "object_b": "fork",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a fork on the right of a knife, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "fork",
- "object_b": "knife",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a knife on the left of a spoon, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "knife",
- "object_b": "spoon",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a spoon on the right of a bowl, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "spoon",
- "object_b": "bowl",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a bowl on the left of a bottle, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "bowl",
- "object_b": "bottle",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a potted plant on the left of a remote, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "potted plant",
- "object_b": "remote",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a remote on the right of a clock, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "remote",
- "object_b": "clock",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a clock on the left of a vase, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "clock",
- "object_b": "vase",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a vase on the right of scissors, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "vase",
- "object_b": "scissors",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "scissors on the left of a teddy bear, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "scissors",
- "object_b": "teddy bear",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a teddy bear on the right of a potted plant, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "teddy bear",
- "object_b": "potted plant",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a frisbee on the left of a sports ball, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "frisbee",
- "object_b": "sports ball",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a sports ball on the right of a baseball bat, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "sports ball",
- "object_b": "baseball bat",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a baseball bat on the left of a baseball glove, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "baseball bat",
- "object_b": "baseball glove",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a baseball glove on the right of a tennis racket, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "baseball glove",
- "object_b": "tennis racket",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a tennis racket on the left of a frisbee, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "tennis racket",
- "object_b": "frisbee",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a toilet on the left of a hair drier, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "toilet",
- "object_b": "hair drier",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a hair drier on the right of a toothbrush, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "hair drier",
- "object_b": "toothbrush",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a toothbrush on the left of a sink, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "toothbrush",
- "object_b": "sink",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a sink on the right of a toilet, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "sink",
- "object_b": "toilet",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a chair on the left of a couch, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "chair",
- "object_b": "couch",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a couch on the right of a bed, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "couch",
- "object_b": "bed",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a bed on the left of a tv, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "bed",
- "object_b": "tv",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a tv on the right of a dining table, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "tv",
- "object_b": "dining table",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a dining table on the left of a chair, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "dining table",
- "object_b": "chair",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "an airplane on the left of a train, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "airplane",
- "object_b": "train",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "a train on the right of a boat, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "train",
- "object_b": "boat",
- "relationship": "on the right of"
- }
- }
- }
- },
- {
- "prompt_en": "a boat on the left of an airplane, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "boat",
- "object_b": "airplane",
- "relationship": "on the left of"
- }
- }
- }
- },
- {
- "prompt_en": "an oven on the top of a toaster, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "oven",
- "object_b": "toaster",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "an oven on the bottom of a toaster, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "oven",
- "object_b": "toaster",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "a toaster on the top of a microwave, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "toaster",
- "object_b": "microwave",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "a toaster on the bottom of a microwave, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "toaster",
- "object_b": "microwave",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "a microwave on the top of an oven, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "microwave",
- "object_b": "oven",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "a microwave on the bottom of an oven, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "microwave",
- "object_b": "oven",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "a banana on the top of an apple, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "banana",
- "object_b": "apple",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "a banana on the bottom of an apple, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "banana",
- "object_b": "apple",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "an apple on the top of a sandwich, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "apple",
- "object_b": "sandwich",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "an apple on the bottom of a sandwich, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "apple",
- "object_b": "sandwich",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "a sandwich on the top of an orange, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "sandwich",
- "object_b": "orange",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "a sandwich on the bottom of an orange, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "sandwich",
- "object_b": "orange",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "an orange on the top of a carrot, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "orange",
- "object_b": "carrot",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "an orange on the bottom of a carrot, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "orange",
- "object_b": "carrot",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "a carrot on the top of a hot dog, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "carrot",
- "object_b": "hot dog",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "a carrot on the bottom of a hot dog, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "carrot",
- "object_b": "hot dog",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "a hot dog on the top of a pizza, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "hot dog",
- "object_b": "pizza",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "a hot dog on the bottom of a pizza, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "hot dog",
- "object_b": "pizza",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "a pizza on the top of a donut, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "pizza",
- "object_b": "donut",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "a pizza on the bottom of a donut, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "pizza",
- "object_b": "donut",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "a donut on the top of broccoli, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "donut",
- "object_b": "broccoli",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "a donut on the bottom of broccoli, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "donut",
- "object_b": "broccoli",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "broccoli on the top of a banana, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "broccoli",
- "object_b": "banana",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "broccoli on the bottom of a banana, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "broccoli",
- "object_b": "banana",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "skis on the top of a snowboard, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "skis",
- "object_b": "snowboard",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "skis on the bottom of a snowboard, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "skis",
- "object_b": "snowboard",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "a snowboard on the top of a kite, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "snowboard",
- "object_b": "kite",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "a snowboard on the bottom of a kite, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "snowboard",
- "object_b": "kite",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "a kite on the top of a skateboard, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "kite",
- "object_b": "skateboard",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "a kite on the bottom of a skateboard, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "kite",
- "object_b": "skateboard",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "a skateboard on the top of a surfboard, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "skateboard",
- "object_b": "surfboard",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "a skateboard on the bottom of a surfboard, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "skateboard",
- "object_b": "surfboard",
- "relationship": "on the bottom of"
- }
- }
- }
- },
- {
- "prompt_en": "a surfboard on the top of skis, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "surfboard",
- "object_b": "skis",
- "relationship": "on the top of"
- }
- }
- }
- },
- {
- "prompt_en": "a surfboard on the bottom of skis, front view",
- "dimension": [
- "spatial_relationship"
- ],
- "auxiliary_info": {
- "spatial_relationship": {
- "spatial_relationship": {
- "object_a": "surfboard",
- "object_b": "skis",
- "relationship": "on the bottom of"
- }
- }
- }
- }
-]
diff --git a/PyTorch/built-in/mm/OpenSora1.1/eval/vbench/vbench.sh b/PyTorch/built-in/mm/OpenSora1.1/eval/vbench/vbench.sh
deleted file mode 100644
index caffd7d982f341ac0f7ca947bf5d27a01de82193..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/eval/vbench/vbench.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# Base path for videos
-videos_path=$1
-videos_base=$(basename $videos_path)
-json_path=./eval/vbench/VBench_full_info.json
-output_path=./evaluation_results/$videos_base
-
-# Define the dimension list
-dimensions=(
- # Quality Score
- "subject_consistency"
- "background_consistency"
- "motion_smoothness"
- "dynamic_degree"
- "aesthetic_quality"
- "imaging_quality"
- "temporal_flickering"
- # Semantic Score
- "object_class"
- "multiple_objects"
- "color"
- "spatial_relationship"
- "scene"
- "temporal_style"
- "overall_consistency"
- "human_action"
- "appearance_style"
-)
-
-# Loop over each dimension
-for i in "${!dimensions[@]}"; do
- # Get the dimension
- dimension=${dimensions[i]}
-
- # Construct the video path
- echo "$dimension $videos_path"
-
- # Run the evaluation script
- vbench evaluate --videos_path $videos_path --dimension $dimension --full_json_dir $json_path --output_path $output_path
-done
diff --git a/PyTorch/built-in/mm/OpenSora1.1/eval/vbench_i2v/json_to_txt.py b/PyTorch/built-in/mm/OpenSora1.1/eval/vbench_i2v/json_to_txt.py
deleted file mode 100644
index 9d9184d61ef50d82ac1badc5480e5df3cabec212..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/eval/vbench_i2v/json_to_txt.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import json
-import os
-
-RESOLUTIONS = ["1-1", "16-9", "7-4", "8-5"]
-
-cache_root = "cache/crop"
-resolution = RESOLUTIONS[0]
-json_file = "vbench2_beta_i2v/vbench2_i2v_full_info.json"
-save_path = "all_i2v.txt"
-
-data = json.load(open(json_file))
-txt = [
- f'{x["prompt_en"]}{{"reference_path": "{os.path.join(cache_root, resolution, x["image_name"])}", "mask_strategy": "0,0,0,1,0"}}'
- for x in data
-]
-with open(save_path, "w") as f:
- f.write("\n".join(txt))
diff --git a/PyTorch/built-in/mm/OpenSora1.1/eval/vbench_i2v/vbench2_i2v_full_info.json b/PyTorch/built-in/mm/OpenSora1.1/eval/vbench_i2v/vbench2_i2v_full_info.json
deleted file mode 100644
index f6d17ff26ed4a0bf53c4faec4d60c28a66b436d0..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/eval/vbench_i2v/vbench2_i2v_full_info.json
+++ /dev/null
@@ -1,8946 +0,0 @@
-[
- {
- "prompt_en": "a close up of a blue and orange liquid",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "abstract",
- "image_name": "a close up of a blue and orange liquid.jpg"
- },
- {
- "prompt_en": "a close up of a blue and orange liquid, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close up of a blue and orange liquid.jpg"
- },
- {
- "prompt_en": "a close up of a blue and orange liquid, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close up of a blue and orange liquid.jpg"
- },
- {
- "prompt_en": "a close up of a blue and orange liquid, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close up of a blue and orange liquid.jpg"
- },
- {
- "prompt_en": "a close up of a blue and orange liquid, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close up of a blue and orange liquid.jpg"
- },
- {
- "prompt_en": "a close up of a blue and orange liquid, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close up of a blue and orange liquid.jpg"
- },
- {
- "prompt_en": "a close up of a blue and orange liquid, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close up of a blue and orange liquid.jpg"
- },
- {
- "prompt_en": "a close up of a blue and orange liquid, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close up of a blue and orange liquid.jpg"
- },
- {
- "prompt_en": "A black and white abstract video featuring mesmerizing bubbles",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "abstract",
- "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
- },
- {
- "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
- },
- {
- "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
- },
- {
- "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
- },
- {
- "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
- },
- {
- "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
- },
- {
- "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
- },
- {
- "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
- },
- {
- "prompt_en": "a blue and white smoke is swirly in the dark",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "abstract",
- "image_name": "a blue and white smoke is swirly in the dark.jpg"
- },
- {
- "prompt_en": "a blue and white smoke is swirly in the dark, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a blue and white smoke is swirly in the dark.jpg"
- },
- {
- "prompt_en": "a blue and white smoke is swirly in the dark, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a blue and white smoke is swirly in the dark.jpg"
- },
- {
- "prompt_en": "a blue and white smoke is swirly in the dark, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a blue and white smoke is swirly in the dark.jpg"
- },
- {
- "prompt_en": "a blue and white smoke is swirly in the dark, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a blue and white smoke is swirly in the dark.jpg"
- },
- {
- "prompt_en": "a blue and white smoke is swirly in the dark, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a blue and white smoke is swirly in the dark.jpg"
- },
- {
- "prompt_en": "a blue and white smoke is swirly in the dark, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a blue and white smoke is swirly in the dark.jpg"
- },
- {
- "prompt_en": "a blue and white smoke is swirly in the dark, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a blue and white smoke is swirly in the dark.jpg"
- },
- {
- "prompt_en": "a close-up view of a sea fan in the water",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "abstract",
- "image_name": "a close-up view of a sea fan in the water.jpg"
- },
- {
- "prompt_en": "a close-up view of a sea fan in the water, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close-up view of a sea fan in the water.jpg"
- },
- {
- "prompt_en": "a close-up view of a sea fan in the water, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close-up view of a sea fan in the water.jpg"
- },
- {
- "prompt_en": "a close-up view of a sea fan in the water, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close-up view of a sea fan in the water.jpg"
- },
- {
- "prompt_en": "a close-up view of a sea fan in the water, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close-up view of a sea fan in the water.jpg"
- },
- {
- "prompt_en": "a close-up view of a sea fan in the water, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close-up view of a sea fan in the water.jpg"
- },
- {
- "prompt_en": "a close-up view of a sea fan in the water, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close-up view of a sea fan in the water.jpg"
- },
- {
- "prompt_en": "a close-up view of a sea fan in the water, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a close-up view of a sea fan in the water.jpg"
- },
- {
- "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "abstract",
- "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
- },
- {
- "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
- },
- {
- "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
- },
- {
- "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
- },
- {
- "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
- },
- {
- "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
- },
- {
- "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
- },
- {
- "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
- },
- {
- "prompt_en": "a purple and yellow abstract painting with a black background",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "abstract",
- "image_name": "a purple and yellow abstract painting with a black background.jpg"
- },
- {
- "prompt_en": "a purple and yellow abstract painting with a black background, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a purple and yellow abstract painting with a black background.jpg"
- },
- {
- "prompt_en": "a purple and yellow abstract painting with a black background, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a purple and yellow abstract painting with a black background.jpg"
- },
- {
- "prompt_en": "a purple and yellow abstract painting with a black background, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a purple and yellow abstract painting with a black background.jpg"
- },
- {
- "prompt_en": "a purple and yellow abstract painting with a black background, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a purple and yellow abstract painting with a black background.jpg"
- },
- {
- "prompt_en": "a purple and yellow abstract painting with a black background, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a purple and yellow abstract painting with a black background.jpg"
- },
- {
- "prompt_en": "a purple and yellow abstract painting with a black background, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a purple and yellow abstract painting with a black background.jpg"
- },
- {
- "prompt_en": "a purple and yellow abstract painting with a black background, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a purple and yellow abstract painting with a black background.jpg"
- },
- {
- "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "abstract",
- "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
- },
- {
- "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
- },
- {
- "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
- },
- {
- "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
- },
- {
- "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
- },
- {
- "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
- },
- {
- "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
- },
- {
- "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
- },
- {
- "prompt_en": "a view of a star trail in the night sky",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "abstract",
- "image_name": "a view of a star trail in the night sky.jpg"
- },
- {
- "prompt_en": "a view of a star trail in the night sky, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a view of a star trail in the night sky.jpg"
- },
- {
- "prompt_en": "a view of a star trail in the night sky, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a view of a star trail in the night sky.jpg"
- },
- {
- "prompt_en": "a view of a star trail in the night sky, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a view of a star trail in the night sky.jpg"
- },
- {
- "prompt_en": "a view of a star trail in the night sky, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a view of a star trail in the night sky.jpg"
- },
- {
- "prompt_en": "a view of a star trail in the night sky, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a view of a star trail in the night sky.jpg"
- },
- {
- "prompt_en": "a view of a star trail in the night sky, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a view of a star trail in the night sky.jpg"
- },
- {
- "prompt_en": "a view of a star trail in the night sky, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "abstract",
- "image_name": "a view of a star trail in the night sky.jpg"
- },
- {
- "prompt_en": "an aerial view of a small town on the edge of the ocean",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
- },
- {
- "prompt_en": "an aerial view of a small town on the edge of the ocean, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
- },
- {
- "prompt_en": "an aerial view of a small town on the edge of the ocean, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
- },
- {
- "prompt_en": "an aerial view of a small town on the edge of the ocean, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
- },
- {
- "prompt_en": "an aerial view of a small town on the edge of the ocean, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
- },
- {
- "prompt_en": "an aerial view of a small town on the edge of the ocean, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
- },
- {
- "prompt_en": "an aerial view of a small town on the edge of the ocean, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
- },
- {
- "prompt_en": "an aerial view of a small town on the edge of the ocean, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
- },
- {
- "prompt_en": "Colorful buildings on the seaside cliffs",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "Colorful buildings on the seaside cliffs.jpg"
- },
- {
- "prompt_en": "Colorful buildings on the seaside cliffs, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "Colorful buildings on the seaside cliffs.jpg"
- },
- {
- "prompt_en": "Colorful buildings on the seaside cliffs, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "Colorful buildings on the seaside cliffs.jpg"
- },
- {
- "prompt_en": "Colorful buildings on the seaside cliffs, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "Colorful buildings on the seaside cliffs.jpg"
- },
- {
- "prompt_en": "Colorful buildings on the seaside cliffs, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "Colorful buildings on the seaside cliffs.jpg"
- },
- {
- "prompt_en": "Colorful buildings on the seaside cliffs, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "Colorful buildings on the seaside cliffs.jpg"
- },
- {
- "prompt_en": "Colorful buildings on the seaside cliffs, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "Colorful buildings on the seaside cliffs.jpg"
- },
- {
- "prompt_en": "Colorful buildings on the seaside cliffs, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "Colorful buildings on the seaside cliffs.jpg"
- },
- {
- "prompt_en": "a bunch of houses that are on a hillside",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a bunch of houses that are on a hillside.jpg"
- },
- {
- "prompt_en": "a bunch of houses that are on a hillside, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bunch of houses that are on a hillside.jpg"
- },
- {
- "prompt_en": "a bunch of houses that are on a hillside, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bunch of houses that are on a hillside.jpg"
- },
- {
- "prompt_en": "a bunch of houses that are on a hillside, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bunch of houses that are on a hillside.jpg"
- },
- {
- "prompt_en": "a bunch of houses that are on a hillside, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bunch of houses that are on a hillside.jpg"
- },
- {
- "prompt_en": "a bunch of houses that are on a hillside, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bunch of houses that are on a hillside.jpg"
- },
- {
- "prompt_en": "a bunch of houses that are on a hillside, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bunch of houses that are on a hillside.jpg"
- },
- {
- "prompt_en": "a bunch of houses that are on a hillside, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bunch of houses that are on a hillside.jpg"
- },
- {
- "prompt_en": "a building that is sitting on the side of a pond",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a building that is sitting on the side of a pond.jpg"
- },
- {
- "prompt_en": "a building that is sitting on the side of a pond, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a building that is sitting on the side of a pond.jpg"
- },
- {
- "prompt_en": "a building that is sitting on the side of a pond, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a building that is sitting on the side of a pond.jpg"
- },
- {
- "prompt_en": "a building that is sitting on the side of a pond, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a building that is sitting on the side of a pond.jpg"
- },
- {
- "prompt_en": "a building that is sitting on the side of a pond, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a building that is sitting on the side of a pond.jpg"
- },
- {
- "prompt_en": "a building that is sitting on the side of a pond, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a building that is sitting on the side of a pond.jpg"
- },
- {
- "prompt_en": "a building that is sitting on the side of a pond, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a building that is sitting on the side of a pond.jpg"
- },
- {
- "prompt_en": "a building that is sitting on the side of a pond, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a building that is sitting on the side of a pond.jpg"
- },
- {
- "prompt_en": "an aerial view of a busy city with a bridge in the background",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
- },
- {
- "prompt_en": "an aerial view of a busy city with a bridge in the background, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
- },
- {
- "prompt_en": "an aerial view of a busy city with a bridge in the background, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
- },
- {
- "prompt_en": "an aerial view of a busy city with a bridge in the background, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
- },
- {
- "prompt_en": "an aerial view of a busy city with a bridge in the background, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
- },
- {
- "prompt_en": "an aerial view of a busy city with a bridge in the background, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
- },
- {
- "prompt_en": "an aerial view of a busy city with a bridge in the background, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
- },
- {
- "prompt_en": "an aerial view of a busy city with a bridge in the background, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
- },
- {
- "prompt_en": "a bridge that is over a body of water",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a bridge that is over a body of water.jpg"
- },
- {
- "prompt_en": "a bridge that is over a body of water, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bridge that is over a body of water.jpg"
- },
- {
- "prompt_en": "a bridge that is over a body of water, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bridge that is over a body of water.jpg"
- },
- {
- "prompt_en": "a bridge that is over a body of water, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bridge that is over a body of water.jpg"
- },
- {
- "prompt_en": "a bridge that is over a body of water, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bridge that is over a body of water.jpg"
- },
- {
- "prompt_en": "a bridge that is over a body of water, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bridge that is over a body of water.jpg"
- },
- {
- "prompt_en": "a bridge that is over a body of water, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bridge that is over a body of water.jpg"
- },
- {
- "prompt_en": "a bridge that is over a body of water, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a bridge that is over a body of water.jpg"
- },
- {
- "prompt_en": "a pile of wood sitting next to a log house",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a pile of wood sitting next to a log house.jpg"
- },
- {
- "prompt_en": "a pile of wood sitting next to a log house, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pile of wood sitting next to a log house.jpg"
- },
- {
- "prompt_en": "a pile of wood sitting next to a log house, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pile of wood sitting next to a log house.jpg"
- },
- {
- "prompt_en": "a pile of wood sitting next to a log house, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pile of wood sitting next to a log house.jpg"
- },
- {
- "prompt_en": "a pile of wood sitting next to a log house, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pile of wood sitting next to a log house.jpg"
- },
- {
- "prompt_en": "a pile of wood sitting next to a log house, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pile of wood sitting next to a log house.jpg"
- },
- {
- "prompt_en": "a pile of wood sitting next to a log house, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pile of wood sitting next to a log house.jpg"
- },
- {
- "prompt_en": "a pile of wood sitting next to a log house, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pile of wood sitting next to a log house.jpg"
- },
- {
- "prompt_en": "a view of a snowy mountain side with many buildings",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a view of a snowy mountain side with many buildings.jpg"
- },
- {
- "prompt_en": "a view of a snowy mountain side with many buildings, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a view of a snowy mountain side with many buildings.jpg"
- },
- {
- "prompt_en": "a view of a snowy mountain side with many buildings, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a view of a snowy mountain side with many buildings.jpg"
- },
- {
- "prompt_en": "a view of a snowy mountain side with many buildings, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a view of a snowy mountain side with many buildings.jpg"
- },
- {
- "prompt_en": "a view of a snowy mountain side with many buildings, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a view of a snowy mountain side with many buildings.jpg"
- },
- {
- "prompt_en": "a view of a snowy mountain side with many buildings, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a view of a snowy mountain side with many buildings.jpg"
- },
- {
- "prompt_en": "a view of a snowy mountain side with many buildings, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a view of a snowy mountain side with many buildings.jpg"
- },
- {
- "prompt_en": "a view of a snowy mountain side with many buildings, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a view of a snowy mountain side with many buildings.jpg"
- },
- {
- "prompt_en": "san francisco skyline at sunset",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "san francisco skyline at sunset.jpg"
- },
- {
- "prompt_en": "san francisco skyline at sunset, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "san francisco skyline at sunset.jpg"
- },
- {
- "prompt_en": "san francisco skyline at sunset, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "san francisco skyline at sunset.jpg"
- },
- {
- "prompt_en": "san francisco skyline at sunset, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "san francisco skyline at sunset.jpg"
- },
- {
- "prompt_en": "san francisco skyline at sunset, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "san francisco skyline at sunset.jpg"
- },
- {
- "prompt_en": "san francisco skyline at sunset, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "san francisco skyline at sunset.jpg"
- },
- {
- "prompt_en": "san francisco skyline at sunset, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "san francisco skyline at sunset.jpg"
- },
- {
- "prompt_en": "san francisco skyline at sunset, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "san francisco skyline at sunset.jpg"
- },
- {
- "prompt_en": "a castle on top of a hill covered in snow",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a castle on top of a hill covered in snow.jpg"
- },
- {
- "prompt_en": "a castle on top of a hill covered in snow, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a castle on top of a hill covered in snow.jpg"
- },
- {
- "prompt_en": "a castle on top of a hill covered in snow, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a castle on top of a hill covered in snow.jpg"
- },
- {
- "prompt_en": "a castle on top of a hill covered in snow, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a castle on top of a hill covered in snow.jpg"
- },
- {
- "prompt_en": "a castle on top of a hill covered in snow, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a castle on top of a hill covered in snow.jpg"
- },
- {
- "prompt_en": "a castle on top of a hill covered in snow, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a castle on top of a hill covered in snow.jpg"
- },
- {
- "prompt_en": "a castle on top of a hill covered in snow, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a castle on top of a hill covered in snow.jpg"
- },
- {
- "prompt_en": "a castle on top of a hill covered in snow, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a castle on top of a hill covered in snow.jpg"
- },
- {
- "prompt_en": "an aerial view of big ben and the houses of parliament in london",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
- },
- {
- "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
- },
- {
- "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
- },
- {
- "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
- },
- {
- "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
- },
- {
- "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
- },
- {
- "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
- },
- {
- "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
- },
- {
- "prompt_en": "a beach with a lot of buildings on the side of a cliff",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
- },
- {
- "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
- },
- {
- "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
- },
- {
- "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
- },
- {
- "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
- },
- {
- "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
- },
- {
- "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
- },
- {
- "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
- },
- {
- "prompt_en": "an alley way in an old european city",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "an alley way in an old european city.jpg"
- },
- {
- "prompt_en": "an alley way in an old european city, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an alley way in an old european city.jpg"
- },
- {
- "prompt_en": "an alley way in an old european city, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an alley way in an old european city.jpg"
- },
- {
- "prompt_en": "an alley way in an old european city, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an alley way in an old european city.jpg"
- },
- {
- "prompt_en": "an alley way in an old european city, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an alley way in an old european city.jpg"
- },
- {
- "prompt_en": "an alley way in an old european city, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an alley way in an old european city.jpg"
- },
- {
- "prompt_en": "an alley way in an old european city, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an alley way in an old european city.jpg"
- },
- {
- "prompt_en": "an alley way in an old european city, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an alley way in an old european city.jpg"
- },
- {
- "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
- },
- {
- "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
- },
- {
- "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
- },
- {
- "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
- },
- {
- "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
- },
- {
- "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
- },
- {
- "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
- },
- {
- "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
- },
- {
- "prompt_en": "the great wall of china in autumn",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "the great wall of china in autumn.jpg"
- },
- {
- "prompt_en": "the great wall of china in autumn, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the great wall of china in autumn.jpg"
- },
- {
- "prompt_en": "the great wall of china in autumn, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the great wall of china in autumn.jpg"
- },
- {
- "prompt_en": "the great wall of china in autumn, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the great wall of china in autumn.jpg"
- },
- {
- "prompt_en": "the great wall of china in autumn, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the great wall of china in autumn.jpg"
- },
- {
- "prompt_en": "the great wall of china in autumn, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the great wall of china in autumn.jpg"
- },
- {
- "prompt_en": "the great wall of china in autumn, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the great wall of china in autumn.jpg"
- },
- {
- "prompt_en": "the great wall of china in autumn, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the great wall of china in autumn.jpg"
- },
- {
- "prompt_en": "the town of hallstatt is surrounded by mountains and water",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
- },
- {
- "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
- },
- {
- "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
- },
- {
- "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
- },
- {
- "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
- },
- {
- "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
- },
- {
- "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
- },
- {
- "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
- },
- {
- "prompt_en": "tokyo skyline at night",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "tokyo skyline at night.jpg"
- },
- {
- "prompt_en": "tokyo skyline at night, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tokyo skyline at night.jpg"
- },
- {
- "prompt_en": "tokyo skyline at night, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tokyo skyline at night.jpg"
- },
- {
- "prompt_en": "tokyo skyline at night, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tokyo skyline at night.jpg"
- },
- {
- "prompt_en": "tokyo skyline at night, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tokyo skyline at night.jpg"
- },
- {
- "prompt_en": "tokyo skyline at night, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tokyo skyline at night.jpg"
- },
- {
- "prompt_en": "tokyo skyline at night, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tokyo skyline at night.jpg"
- },
- {
- "prompt_en": "tokyo skyline at night, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tokyo skyline at night.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a large wave crashes into a lighthouse.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large wave crashes into a lighthouse.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large wave crashes into a lighthouse.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large wave crashes into a lighthouse.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large wave crashes into a lighthouse.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large wave crashes into a lighthouse.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large wave crashes into a lighthouse.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large wave crashes into a lighthouse.jpg"
- },
- {
- "prompt_en": "a church sits on top of a hill under a cloudy sky",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
- },
- {
- "prompt_en": "a church sits on top of a hill under a cloudy sky, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
- },
- {
- "prompt_en": "a church sits on top of a hill under a cloudy sky, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
- },
- {
- "prompt_en": "a church sits on top of a hill under a cloudy sky, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
- },
- {
- "prompt_en": "a church sits on top of a hill under a cloudy sky, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
- },
- {
- "prompt_en": "a church sits on top of a hill under a cloudy sky, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
- },
- {
- "prompt_en": "a church sits on top of a hill under a cloudy sky, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
- },
- {
- "prompt_en": "a church sits on top of a hill under a cloudy sky, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
- },
- {
- "prompt_en": "the parthenon in acropolis, greece",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "the parthenon in acropolis, greece.jpg"
- },
- {
- "prompt_en": "the parthenon in acropolis, greece, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the parthenon in acropolis, greece.jpg"
- },
- {
- "prompt_en": "the parthenon in acropolis, greece, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the parthenon in acropolis, greece.jpg"
- },
- {
- "prompt_en": "the parthenon in acropolis, greece, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the parthenon in acropolis, greece.jpg"
- },
- {
- "prompt_en": "the parthenon in acropolis, greece, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the parthenon in acropolis, greece.jpg"
- },
- {
- "prompt_en": "the parthenon in acropolis, greece, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the parthenon in acropolis, greece.jpg"
- },
- {
- "prompt_en": "the parthenon in acropolis, greece, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the parthenon in acropolis, greece.jpg"
- },
- {
- "prompt_en": "the parthenon in acropolis, greece, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the parthenon in acropolis, greece.jpg"
- },
- {
- "prompt_en": "a large crowd of people walking in a shopping mall",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a large crowd of people walking in a shopping mall.jpg"
- },
- {
- "prompt_en": "a large crowd of people walking in a shopping mall, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large crowd of people walking in a shopping mall.jpg"
- },
- {
- "prompt_en": "a large crowd of people walking in a shopping mall, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large crowd of people walking in a shopping mall.jpg"
- },
- {
- "prompt_en": "a large crowd of people walking in a shopping mall, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large crowd of people walking in a shopping mall.jpg"
- },
- {
- "prompt_en": "a large crowd of people walking in a shopping mall, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large crowd of people walking in a shopping mall.jpg"
- },
- {
- "prompt_en": "a large crowd of people walking in a shopping mall, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large crowd of people walking in a shopping mall.jpg"
- },
- {
- "prompt_en": "a large crowd of people walking in a shopping mall, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large crowd of people walking in a shopping mall.jpg"
- },
- {
- "prompt_en": "a large crowd of people walking in a shopping mall, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a large crowd of people walking in a shopping mall.jpg"
- },
- {
- "prompt_en": "the pyramids of giza, egypt",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "the pyramids of giza, egypt.jpg"
- },
- {
- "prompt_en": "the pyramids of giza, egypt, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the pyramids of giza, egypt.jpg"
- },
- {
- "prompt_en": "the pyramids of giza, egypt, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the pyramids of giza, egypt.jpg"
- },
- {
- "prompt_en": "the pyramids of giza, egypt, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the pyramids of giza, egypt.jpg"
- },
- {
- "prompt_en": "the pyramids of giza, egypt, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the pyramids of giza, egypt.jpg"
- },
- {
- "prompt_en": "the pyramids of giza, egypt, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the pyramids of giza, egypt.jpg"
- },
- {
- "prompt_en": "the pyramids of giza, egypt, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the pyramids of giza, egypt.jpg"
- },
- {
- "prompt_en": "the pyramids of giza, egypt, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the pyramids of giza, egypt.jpg"
- },
- {
- "prompt_en": "a stage door painted with a star on the side of a brick wall",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
- },
- {
- "prompt_en": "a stage door painted with a star on the side of a brick wall, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
- },
- {
- "prompt_en": "a stage door painted with a star on the side of a brick wall, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
- },
- {
- "prompt_en": "a stage door painted with a star on the side of a brick wall, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
- },
- {
- "prompt_en": "a stage door painted with a star on the side of a brick wall, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
- },
- {
- "prompt_en": "a stage door painted with a star on the side of a brick wall, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
- },
- {
- "prompt_en": "a stage door painted with a star on the side of a brick wall, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
- },
- {
- "prompt_en": "a stage door painted with a star on the side of a brick wall, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
- },
- {
- "prompt_en": "a light house on the edge of the water",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a light house on the edge of the water.jpg"
- },
- {
- "prompt_en": "a light house on the edge of the water, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a light house on the edge of the water.jpg"
- },
- {
- "prompt_en": "a light house on the edge of the water, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a light house on the edge of the water.jpg"
- },
- {
- "prompt_en": "a light house on the edge of the water, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a light house on the edge of the water.jpg"
- },
- {
- "prompt_en": "a light house on the edge of the water, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a light house on the edge of the water.jpg"
- },
- {
- "prompt_en": "a light house on the edge of the water, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a light house on the edge of the water.jpg"
- },
- {
- "prompt_en": "a light house on the edge of the water, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a light house on the edge of the water.jpg"
- },
- {
- "prompt_en": "a light house on the edge of the water, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a light house on the edge of the water.jpg"
- },
- {
- "prompt_en": "an asian city street at night with people and bicycles",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "an asian city street at night with people and bicycles.jpg"
- },
- {
- "prompt_en": "an asian city street at night with people and bicycles, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an asian city street at night with people and bicycles.jpg"
- },
- {
- "prompt_en": "an asian city street at night with people and bicycles, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an asian city street at night with people and bicycles.jpg"
- },
- {
- "prompt_en": "an asian city street at night with people and bicycles, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an asian city street at night with people and bicycles.jpg"
- },
- {
- "prompt_en": "an asian city street at night with people and bicycles, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an asian city street at night with people and bicycles.jpg"
- },
- {
- "prompt_en": "an asian city street at night with people and bicycles, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an asian city street at night with people and bicycles.jpg"
- },
- {
- "prompt_en": "an asian city street at night with people and bicycles, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an asian city street at night with people and bicycles.jpg"
- },
- {
- "prompt_en": "an asian city street at night with people and bicycles, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an asian city street at night with people and bicycles.jpg"
- },
- {
- "prompt_en": "a couple of wooden benches in the middle of a street",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a couple of wooden benches in the middle of a street.jpg"
- },
- {
- "prompt_en": "a couple of wooden benches in the middle of a street, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a couple of wooden benches in the middle of a street.jpg"
- },
- {
- "prompt_en": "a couple of wooden benches in the middle of a street, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a couple of wooden benches in the middle of a street.jpg"
- },
- {
- "prompt_en": "a couple of wooden benches in the middle of a street, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a couple of wooden benches in the middle of a street.jpg"
- },
- {
- "prompt_en": "a couple of wooden benches in the middle of a street, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a couple of wooden benches in the middle of a street.jpg"
- },
- {
- "prompt_en": "a couple of wooden benches in the middle of a street, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a couple of wooden benches in the middle of a street.jpg"
- },
- {
- "prompt_en": "a couple of wooden benches in the middle of a street, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a couple of wooden benches in the middle of a street.jpg"
- },
- {
- "prompt_en": "a couple of wooden benches in the middle of a street, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a couple of wooden benches in the middle of a street.jpg"
- },
- {
- "prompt_en": "a pagoda sits on top of a mountain in japan",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a pagoda sits on top of a mountain in japan.jpg"
- },
- {
- "prompt_en": "a pagoda sits on top of a mountain in japan, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pagoda sits on top of a mountain in japan.jpg"
- },
- {
- "prompt_en": "a pagoda sits on top of a mountain in japan, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pagoda sits on top of a mountain in japan.jpg"
- },
- {
- "prompt_en": "a pagoda sits on top of a mountain in japan, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pagoda sits on top of a mountain in japan.jpg"
- },
- {
- "prompt_en": "a pagoda sits on top of a mountain in japan, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pagoda sits on top of a mountain in japan.jpg"
- },
- {
- "prompt_en": "a pagoda sits on top of a mountain in japan, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pagoda sits on top of a mountain in japan.jpg"
- },
- {
- "prompt_en": "a pagoda sits on top of a mountain in japan, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pagoda sits on top of a mountain in japan.jpg"
- },
- {
- "prompt_en": "a pagoda sits on top of a mountain in japan, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a pagoda sits on top of a mountain in japan.jpg"
- },
- {
- "prompt_en": "a red bus driving down a snowy street at night",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a red bus driving down a snowy street at night.jpg"
- },
- {
- "prompt_en": "a red bus driving down a snowy street at night, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a red bus driving down a snowy street at night.jpg"
- },
- {
- "prompt_en": "a red bus driving down a snowy street at night, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a red bus driving down a snowy street at night.jpg"
- },
- {
- "prompt_en": "a red bus driving down a snowy street at night, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a red bus driving down a snowy street at night.jpg"
- },
- {
- "prompt_en": "a red bus driving down a snowy street at night, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a red bus driving down a snowy street at night.jpg"
- },
- {
- "prompt_en": "a red bus driving down a snowy street at night, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a red bus driving down a snowy street at night.jpg"
- },
- {
- "prompt_en": "a red bus driving down a snowy street at night, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a red bus driving down a snowy street at night.jpg"
- },
- {
- "prompt_en": "a red bus driving down a snowy street at night, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a red bus driving down a snowy street at night.jpg"
- },
- {
- "prompt_en": "a snow covered street",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a snow covered street.jpg"
- },
- {
- "prompt_en": "a snow covered street, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a snow covered street.jpg"
- },
- {
- "prompt_en": "a snow covered street, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a snow covered street.jpg"
- },
- {
- "prompt_en": "a snow covered street, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a snow covered street.jpg"
- },
- {
- "prompt_en": "a snow covered street, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a snow covered street.jpg"
- },
- {
- "prompt_en": "a snow covered street, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a snow covered street.jpg"
- },
- {
- "prompt_en": "a snow covered street, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a snow covered street.jpg"
- },
- {
- "prompt_en": "a snow covered street, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a snow covered street.jpg"
- },
- {
- "prompt_en": "a house with snow on the ground",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a house with snow on the ground.jpg"
- },
- {
- "prompt_en": "a house with snow on the ground, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a house with snow on the ground.jpg"
- },
- {
- "prompt_en": "a house with snow on the ground, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a house with snow on the ground.jpg"
- },
- {
- "prompt_en": "a house with snow on the ground, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a house with snow on the ground.jpg"
- },
- {
- "prompt_en": "a house with snow on the ground, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a house with snow on the ground.jpg"
- },
- {
- "prompt_en": "a house with snow on the ground, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a house with snow on the ground.jpg"
- },
- {
- "prompt_en": "a house with snow on the ground, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a house with snow on the ground.jpg"
- },
- {
- "prompt_en": "a house with snow on the ground, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a house with snow on the ground.jpg"
- },
- {
- "prompt_en": "cars parked on the side of the road during a snowstorm",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "cars parked on the side of the road during a snowstorm.jpg"
- },
- {
- "prompt_en": "cars parked on the side of the road during a snowstorm, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "cars parked on the side of the road during a snowstorm.jpg"
- },
- {
- "prompt_en": "cars parked on the side of the road during a snowstorm, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "cars parked on the side of the road during a snowstorm.jpg"
- },
- {
- "prompt_en": "cars parked on the side of the road during a snowstorm, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "cars parked on the side of the road during a snowstorm.jpg"
- },
- {
- "prompt_en": "cars parked on the side of the road during a snowstorm, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "cars parked on the side of the road during a snowstorm.jpg"
- },
- {
- "prompt_en": "cars parked on the side of the road during a snowstorm, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "cars parked on the side of the road during a snowstorm.jpg"
- },
- {
- "prompt_en": "cars parked on the side of the road during a snowstorm, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "cars parked on the side of the road during a snowstorm.jpg"
- },
- {
- "prompt_en": "cars parked on the side of the road during a snowstorm, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "cars parked on the side of the road during a snowstorm.jpg"
- },
- {
- "prompt_en": "a group of statues on the side of a building",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a group of statues on the side of a building.jpg"
- },
- {
- "prompt_en": "a group of statues on the side of a building, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a group of statues on the side of a building.jpg"
- },
- {
- "prompt_en": "a group of statues on the side of a building, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a group of statues on the side of a building.jpg"
- },
- {
- "prompt_en": "a group of statues on the side of a building, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a group of statues on the side of a building.jpg"
- },
- {
- "prompt_en": "a group of statues on the side of a building, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a group of statues on the side of a building.jpg"
- },
- {
- "prompt_en": "a group of statues on the side of a building, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a group of statues on the side of a building.jpg"
- },
- {
- "prompt_en": "a group of statues on the side of a building, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a group of statues on the side of a building.jpg"
- },
- {
- "prompt_en": "a group of statues on the side of a building, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a group of statues on the side of a building.jpg"
- },
- {
- "prompt_en": "a city street at night during a snow storm",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a city street at night during a snow storm.jpg"
- },
- {
- "prompt_en": "a city street at night during a snow storm, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a city street at night during a snow storm.jpg"
- },
- {
- "prompt_en": "a city street at night during a snow storm, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a city street at night during a snow storm.jpg"
- },
- {
- "prompt_en": "a city street at night during a snow storm, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a city street at night during a snow storm.jpg"
- },
- {
- "prompt_en": "a city street at night during a snow storm, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a city street at night during a snow storm.jpg"
- },
- {
- "prompt_en": "a city street at night during a snow storm, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a city street at night during a snow storm.jpg"
- },
- {
- "prompt_en": "a city street at night during a snow storm, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a city street at night during a snow storm.jpg"
- },
- {
- "prompt_en": "a city street at night during a snow storm, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a city street at night during a snow storm.jpg"
- },
- {
- "prompt_en": "tower bridge in london",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "tower bridge in london.jpg"
- },
- {
- "prompt_en": "tower bridge in london, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tower bridge in london.jpg"
- },
- {
- "prompt_en": "tower bridge in london, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tower bridge in london.jpg"
- },
- {
- "prompt_en": "tower bridge in london, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tower bridge in london.jpg"
- },
- {
- "prompt_en": "tower bridge in london, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tower bridge in london.jpg"
- },
- {
- "prompt_en": "tower bridge in london, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tower bridge in london.jpg"
- },
- {
- "prompt_en": "tower bridge in london, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tower bridge in london.jpg"
- },
- {
- "prompt_en": "tower bridge in london, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "tower bridge in london.jpg"
- },
- {
- "prompt_en": "chinese pagoda in the middle of a snowy day",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "chinese pagoda in the middle of a snowy day.jpg"
- },
- {
- "prompt_en": "chinese pagoda in the middle of a snowy day, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "chinese pagoda in the middle of a snowy day.jpg"
- },
- {
- "prompt_en": "chinese pagoda in the middle of a snowy day, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "chinese pagoda in the middle of a snowy day.jpg"
- },
- {
- "prompt_en": "chinese pagoda in the middle of a snowy day, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "chinese pagoda in the middle of a snowy day.jpg"
- },
- {
- "prompt_en": "chinese pagoda in the middle of a snowy day, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "chinese pagoda in the middle of a snowy day.jpg"
- },
- {
- "prompt_en": "chinese pagoda in the middle of a snowy day, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "chinese pagoda in the middle of a snowy day.jpg"
- },
- {
- "prompt_en": "chinese pagoda in the middle of a snowy day, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "chinese pagoda in the middle of a snowy day.jpg"
- },
- {
- "prompt_en": "chinese pagoda in the middle of a snowy day, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "chinese pagoda in the middle of a snowy day.jpg"
- },
- {
- "prompt_en": "a dark alleyway with a bus driving down it",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a dark alleyway with a bus driving down it.jpg"
- },
- {
- "prompt_en": "a dark alleyway with a bus driving down it, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a dark alleyway with a bus driving down it.jpg"
- },
- {
- "prompt_en": "a dark alleyway with a bus driving down it, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a dark alleyway with a bus driving down it.jpg"
- },
- {
- "prompt_en": "a dark alleyway with a bus driving down it, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a dark alleyway with a bus driving down it.jpg"
- },
- {
- "prompt_en": "a dark alleyway with a bus driving down it, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a dark alleyway with a bus driving down it.jpg"
- },
- {
- "prompt_en": "a dark alleyway with a bus driving down it, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a dark alleyway with a bus driving down it.jpg"
- },
- {
- "prompt_en": "a dark alleyway with a bus driving down it, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a dark alleyway with a bus driving down it.jpg"
- },
- {
- "prompt_en": "a dark alleyway with a bus driving down it, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a dark alleyway with a bus driving down it.jpg"
- },
- {
- "prompt_en": "a monastery sits on top of a cliff in bhutan",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
- },
- {
- "prompt_en": "a monastery sits on top of a cliff in bhutan, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
- },
- {
- "prompt_en": "a monastery sits on top of a cliff in bhutan, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
- },
- {
- "prompt_en": "a monastery sits on top of a cliff in bhutan, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
- },
- {
- "prompt_en": "a monastery sits on top of a cliff in bhutan, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
- },
- {
- "prompt_en": "a monastery sits on top of a cliff in bhutan, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
- },
- {
- "prompt_en": "a monastery sits on top of a cliff in bhutan, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
- },
- {
- "prompt_en": "a monastery sits on top of a cliff in bhutan, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
- },
- {
- "prompt_en": "the dome of the rock in jerusalem",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "the dome of the rock in jerusalem.jpg"
- },
- {
- "prompt_en": "the dome of the rock in jerusalem, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the dome of the rock in jerusalem.jpg"
- },
- {
- "prompt_en": "the dome of the rock in jerusalem, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the dome of the rock in jerusalem.jpg"
- },
- {
- "prompt_en": "the dome of the rock in jerusalem, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the dome of the rock in jerusalem.jpg"
- },
- {
- "prompt_en": "the dome of the rock in jerusalem, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the dome of the rock in jerusalem.jpg"
- },
- {
- "prompt_en": "the dome of the rock in jerusalem, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the dome of the rock in jerusalem.jpg"
- },
- {
- "prompt_en": "the dome of the rock in jerusalem, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the dome of the rock in jerusalem.jpg"
- },
- {
- "prompt_en": "the dome of the rock in jerusalem, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "the dome of the rock in jerusalem.jpg"
- },
- {
- "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
- },
- {
- "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
- },
- {
- "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
- },
- {
- "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
- },
- {
- "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
- },
- {
- "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
- },
- {
- "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
- },
- {
- "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
- },
- {
- "prompt_en": "a reflection of a city with buildings in the water",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "architecture",
- "image_name": "a reflection of a city with buildings in the water.jpg"
- },
- {
- "prompt_en": "a reflection of a city with buildings in the water, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a reflection of a city with buildings in the water.jpg"
- },
- {
- "prompt_en": "a reflection of a city with buildings in the water, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a reflection of a city with buildings in the water.jpg"
- },
- {
- "prompt_en": "a reflection of a city with buildings in the water, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a reflection of a city with buildings in the water.jpg"
- },
- {
- "prompt_en": "a reflection of a city with buildings in the water, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a reflection of a city with buildings in the water.jpg"
- },
- {
- "prompt_en": "a reflection of a city with buildings in the water, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a reflection of a city with buildings in the water.jpg"
- },
- {
- "prompt_en": "a reflection of a city with buildings in the water, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a reflection of a city with buildings in the water.jpg"
- },
- {
- "prompt_en": "a reflection of a city with buildings in the water, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "architecture",
- "image_name": "a reflection of a city with buildings in the water.jpg"
- },
- {
- "prompt_en": "a bar with chairs and a television on the wall",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a bar with chairs and a television on the wall.jpg"
- },
- {
- "prompt_en": "a bar with chairs and a television on the wall, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a bar with chairs and a television on the wall.jpg"
- },
- {
- "prompt_en": "a bar with chairs and a television on the wall, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a bar with chairs and a television on the wall.jpg"
- },
- {
- "prompt_en": "a bar with chairs and a television on the wall, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a bar with chairs and a television on the wall.jpg"
- },
- {
- "prompt_en": "a bar with chairs and a television on the wall, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a bar with chairs and a television on the wall.jpg"
- },
- {
- "prompt_en": "a bar with chairs and a television on the wall, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a bar with chairs and a television on the wall.jpg"
- },
- {
- "prompt_en": "a bar with chairs and a television on the wall, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a bar with chairs and a television on the wall.jpg"
- },
- {
- "prompt_en": "a bar with chairs and a television on the wall, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a bar with chairs and a television on the wall.jpg"
- },
- {
- "prompt_en": "a living room filled with lots of books on a wall",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with lots of books on a wall.jpg"
- },
- {
- "prompt_en": "a living room filled with lots of books on a wall, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with lots of books on a wall.jpg"
- },
- {
- "prompt_en": "a living room filled with lots of books on a wall, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with lots of books on a wall.jpg"
- },
- {
- "prompt_en": "a living room filled with lots of books on a wall, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with lots of books on a wall.jpg"
- },
- {
- "prompt_en": "a living room filled with lots of books on a wall, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with lots of books on a wall.jpg"
- },
- {
- "prompt_en": "a living room filled with lots of books on a wall, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with lots of books on a wall.jpg"
- },
- {
- "prompt_en": "a living room filled with lots of books on a wall, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with lots of books on a wall.jpg"
- },
- {
- "prompt_en": "a living room filled with lots of books on a wall, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with lots of books on a wall.jpg"
- },
- {
- "prompt_en": "a living room filled with furniture next to a stone wall",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with furniture next to a stone wall.jpg"
- },
- {
- "prompt_en": "a living room filled with furniture next to a stone wall, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with furniture next to a stone wall.jpg"
- },
- {
- "prompt_en": "a living room filled with furniture next to a stone wall, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with furniture next to a stone wall.jpg"
- },
- {
- "prompt_en": "a living room filled with furniture next to a stone wall, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with furniture next to a stone wall.jpg"
- },
- {
- "prompt_en": "a living room filled with furniture next to a stone wall, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with furniture next to a stone wall.jpg"
- },
- {
- "prompt_en": "a living room filled with furniture next to a stone wall, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with furniture next to a stone wall.jpg"
- },
- {
- "prompt_en": "a living room filled with furniture next to a stone wall, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with furniture next to a stone wall.jpg"
- },
- {
- "prompt_en": "a living room filled with furniture next to a stone wall, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room filled with furniture next to a stone wall.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with sunlight coming through the window",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
- },
- {
- "prompt_en": "a room filled with lots of shelves filled with books",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with lots of shelves filled with books.jpg"
- },
- {
- "prompt_en": "a room filled with lots of shelves filled with books, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with lots of shelves filled with books.jpg"
- },
- {
- "prompt_en": "a room filled with lots of shelves filled with books, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with lots of shelves filled with books.jpg"
- },
- {
- "prompt_en": "a room filled with lots of shelves filled with books, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with lots of shelves filled with books.jpg"
- },
- {
- "prompt_en": "a room filled with lots of shelves filled with books, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with lots of shelves filled with books.jpg"
- },
- {
- "prompt_en": "a room filled with lots of shelves filled with books, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with lots of shelves filled with books.jpg"
- },
- {
- "prompt_en": "a room filled with lots of shelves filled with books, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with lots of shelves filled with books.jpg"
- },
- {
- "prompt_en": "a room filled with lots of shelves filled with books, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with lots of shelves filled with books.jpg"
- },
- {
- "prompt_en": "an art gallery with paintings on the walls",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "an art gallery with paintings on the walls.jpg"
- },
- {
- "prompt_en": "an art gallery with paintings on the walls, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an art gallery with paintings on the walls.jpg"
- },
- {
- "prompt_en": "an art gallery with paintings on the walls, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an art gallery with paintings on the walls.jpg"
- },
- {
- "prompt_en": "an art gallery with paintings on the walls, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an art gallery with paintings on the walls.jpg"
- },
- {
- "prompt_en": "an art gallery with paintings on the walls, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an art gallery with paintings on the walls.jpg"
- },
- {
- "prompt_en": "an art gallery with paintings on the walls, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an art gallery with paintings on the walls.jpg"
- },
- {
- "prompt_en": "an art gallery with paintings on the walls, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an art gallery with paintings on the walls.jpg"
- },
- {
- "prompt_en": "an art gallery with paintings on the walls, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an art gallery with paintings on the walls.jpg"
- },
- {
- "prompt_en": "a room with a lot of pictures on the walls",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a room with a lot of pictures on the walls.jpg"
- },
- {
- "prompt_en": "a room with a lot of pictures on the walls, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a lot of pictures on the walls.jpg"
- },
- {
- "prompt_en": "a room with a lot of pictures on the walls, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a lot of pictures on the walls.jpg"
- },
- {
- "prompt_en": "a room with a lot of pictures on the walls, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a lot of pictures on the walls.jpg"
- },
- {
- "prompt_en": "a room with a lot of pictures on the walls, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a lot of pictures on the walls.jpg"
- },
- {
- "prompt_en": "a room with a lot of pictures on the walls, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a lot of pictures on the walls.jpg"
- },
- {
- "prompt_en": "a room with a lot of pictures on the walls, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a lot of pictures on the walls.jpg"
- },
- {
- "prompt_en": "a room with a lot of pictures on the walls, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a lot of pictures on the walls.jpg"
- },
- {
- "prompt_en": "a painting of a cloudy sky next to an easel",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a painting of a cloudy sky next to an easel.jpg"
- },
- {
- "prompt_en": "a painting of a cloudy sky next to an easel, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a painting of a cloudy sky next to an easel.jpg"
- },
- {
- "prompt_en": "a painting of a cloudy sky next to an easel, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a painting of a cloudy sky next to an easel.jpg"
- },
- {
- "prompt_en": "a painting of a cloudy sky next to an easel, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a painting of a cloudy sky next to an easel.jpg"
- },
- {
- "prompt_en": "a painting of a cloudy sky next to an easel, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a painting of a cloudy sky next to an easel.jpg"
- },
- {
- "prompt_en": "a painting of a cloudy sky next to an easel, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a painting of a cloudy sky next to an easel.jpg"
- },
- {
- "prompt_en": "a painting of a cloudy sky next to an easel, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a painting of a cloudy sky next to an easel.jpg"
- },
- {
- "prompt_en": "a painting of a cloudy sky next to an easel, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a painting of a cloudy sky next to an easel.jpg"
- },
- {
- "prompt_en": "a living room with a christmas tree and a rocking chair",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a christmas tree and a rocking chair.jpg"
- },
- {
- "prompt_en": "a living room with a christmas tree and a rocking chair, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a christmas tree and a rocking chair.jpg"
- },
- {
- "prompt_en": "a living room with a christmas tree and a rocking chair, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a christmas tree and a rocking chair.jpg"
- },
- {
- "prompt_en": "a living room with a christmas tree and a rocking chair, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a christmas tree and a rocking chair.jpg"
- },
- {
- "prompt_en": "a living room with a christmas tree and a rocking chair, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a christmas tree and a rocking chair.jpg"
- },
- {
- "prompt_en": "a living room with a christmas tree and a rocking chair, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a christmas tree and a rocking chair.jpg"
- },
- {
- "prompt_en": "a living room with a christmas tree and a rocking chair, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a christmas tree and a rocking chair.jpg"
- },
- {
- "prompt_en": "a living room with a christmas tree and a rocking chair, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a christmas tree and a rocking chair.jpg"
- },
- {
- "prompt_en": "a kitchen with a sink and a lot of glasses on the counter",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
- },
- {
- "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
- },
- {
- "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
- },
- {
- "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
- },
- {
- "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
- },
- {
- "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
- },
- {
- "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
- },
- {
- "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
- },
- {
- "prompt_en": "a wooden table in front of a brick wall with bottles on the wall",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
- },
- {
- "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
- },
- {
- "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
- },
- {
- "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
- },
- {
- "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
- },
- {
- "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
- },
- {
- "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
- },
- {
- "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
- },
- {
- "prompt_en": "a room filled with paintings and statues",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with paintings and statues.jpg"
- },
- {
- "prompt_en": "a room filled with paintings and statues, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with paintings and statues.jpg"
- },
- {
- "prompt_en": "a room filled with paintings and statues, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with paintings and statues.jpg"
- },
- {
- "prompt_en": "a room filled with paintings and statues, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with paintings and statues.jpg"
- },
- {
- "prompt_en": "a room filled with paintings and statues, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with paintings and statues.jpg"
- },
- {
- "prompt_en": "a room filled with paintings and statues, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with paintings and statues.jpg"
- },
- {
- "prompt_en": "a room filled with paintings and statues, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with paintings and statues.jpg"
- },
- {
- "prompt_en": "a room filled with paintings and statues, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with paintings and statues.jpg"
- },
- {
- "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
- },
- {
- "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
- },
- {
- "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
- },
- {
- "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
- },
- {
- "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
- },
- {
- "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
- },
- {
- "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
- },
- {
- "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
- },
- {
- "prompt_en": "a room filled with books and teddy bears",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with books and teddy bears.jpg"
- },
- {
- "prompt_en": "a room filled with books and teddy bears, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with books and teddy bears.jpg"
- },
- {
- "prompt_en": "a room filled with books and teddy bears, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with books and teddy bears.jpg"
- },
- {
- "prompt_en": "a room filled with books and teddy bears, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with books and teddy bears.jpg"
- },
- {
- "prompt_en": "a room filled with books and teddy bears, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with books and teddy bears.jpg"
- },
- {
- "prompt_en": "a room filled with books and teddy bears, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with books and teddy bears.jpg"
- },
- {
- "prompt_en": "a room filled with books and teddy bears, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with books and teddy bears.jpg"
- },
- {
- "prompt_en": "a room filled with books and teddy bears, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room filled with books and teddy bears.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with a plant in the corner",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with a plant in the corner.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with a plant in the corner, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with a plant in the corner.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with a plant in the corner, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with a plant in the corner.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with a plant in the corner, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with a plant in the corner.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with a plant in the corner, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with a plant in the corner.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with a plant in the corner, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with a plant in the corner.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with a plant in the corner, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with a plant in the corner.jpg"
- },
- {
- "prompt_en": "a table and chairs in a room with a plant in the corner, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a table and chairs in a room with a plant in the corner.jpg"
- },
- {
- "prompt_en": "a living room with a couch, table, and a window",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a couch, table, and a window.jpg"
- },
- {
- "prompt_en": "a living room with a couch, table, and a window, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a couch, table, and a window.jpg"
- },
- {
- "prompt_en": "a living room with a couch, table, and a window, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a couch, table, and a window.jpg"
- },
- {
- "prompt_en": "a living room with a couch, table, and a window, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a couch, table, and a window.jpg"
- },
- {
- "prompt_en": "a living room with a couch, table, and a window, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a couch, table, and a window.jpg"
- },
- {
- "prompt_en": "a living room with a couch, table, and a window, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a couch, table, and a window.jpg"
- },
- {
- "prompt_en": "a living room with a couch, table, and a window, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a couch, table, and a window.jpg"
- },
- {
- "prompt_en": "a living room with a couch, table, and a window, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with a couch, table, and a window.jpg"
- },
- {
- "prompt_en": "a modern living room with wood floors and a tv",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a modern living room with wood floors and a tv.jpg"
- },
- {
- "prompt_en": "a modern living room with wood floors and a tv, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a modern living room with wood floors and a tv.jpg"
- },
- {
- "prompt_en": "a modern living room with wood floors and a tv, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a modern living room with wood floors and a tv.jpg"
- },
- {
- "prompt_en": "a modern living room with wood floors and a tv, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a modern living room with wood floors and a tv.jpg"
- },
- {
- "prompt_en": "a modern living room with wood floors and a tv, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a modern living room with wood floors and a tv.jpg"
- },
- {
- "prompt_en": "a modern living room with wood floors and a tv, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a modern living room with wood floors and a tv.jpg"
- },
- {
- "prompt_en": "a modern living room with wood floors and a tv, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a modern living room with wood floors and a tv.jpg"
- },
- {
- "prompt_en": "a modern living room with wood floors and a tv, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a modern living room with wood floors and a tv.jpg"
- },
- {
- "prompt_en": "a room with a desk and a chair in it",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a room with a desk and a chair in it.jpg"
- },
- {
- "prompt_en": "a room with a desk and a chair in it, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a desk and a chair in it.jpg"
- },
- {
- "prompt_en": "a room with a desk and a chair in it, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a desk and a chair in it.jpg"
- },
- {
- "prompt_en": "a room with a desk and a chair in it, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a desk and a chair in it.jpg"
- },
- {
- "prompt_en": "a room with a desk and a chair in it, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a desk and a chair in it.jpg"
- },
- {
- "prompt_en": "a room with a desk and a chair in it, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a desk and a chair in it.jpg"
- },
- {
- "prompt_en": "a room with a desk and a chair in it, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a desk and a chair in it.jpg"
- },
- {
- "prompt_en": "a room with a desk and a chair in it, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a room with a desk and a chair in it.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a building",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a large waterfall in the middle of a building.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a building, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a large waterfall in the middle of a building.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a building, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a large waterfall in the middle of a building.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a building, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a large waterfall in the middle of a building.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a building, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a large waterfall in the middle of a building.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a building, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a large waterfall in the middle of a building.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a building, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a large waterfall in the middle of a building.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a building, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a large waterfall in the middle of a building.jpg"
- },
- {
- "prompt_en": "a chair in a room next to some drawings",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a chair in a room next to some drawings.jpg"
- },
- {
- "prompt_en": "a chair in a room next to some drawings, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a chair in a room next to some drawings.jpg"
- },
- {
- "prompt_en": "a chair in a room next to some drawings, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a chair in a room next to some drawings.jpg"
- },
- {
- "prompt_en": "a chair in a room next to some drawings, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a chair in a room next to some drawings.jpg"
- },
- {
- "prompt_en": "a chair in a room next to some drawings, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a chair in a room next to some drawings.jpg"
- },
- {
- "prompt_en": "a chair in a room next to some drawings, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a chair in a room next to some drawings.jpg"
- },
- {
- "prompt_en": "a chair in a room next to some drawings, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a chair in a room next to some drawings.jpg"
- },
- {
- "prompt_en": "a chair in a room next to some drawings, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a chair in a room next to some drawings.jpg"
- },
- {
- "prompt_en": "a living room with hardwood floors and a white couch",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "indoor",
- "image_name": "a living room with hardwood floors and a white couch.jpg"
- },
- {
- "prompt_en": "a living room with hardwood floors and a white couch, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with hardwood floors and a white couch.jpg"
- },
- {
- "prompt_en": "a living room with hardwood floors and a white couch, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with hardwood floors and a white couch.jpg"
- },
- {
- "prompt_en": "a living room with hardwood floors and a white couch, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with hardwood floors and a white couch.jpg"
- },
- {
- "prompt_en": "a living room with hardwood floors and a white couch, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with hardwood floors and a white couch.jpg"
- },
- {
- "prompt_en": "a living room with hardwood floors and a white couch, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with hardwood floors and a white couch.jpg"
- },
- {
- "prompt_en": "a living room with hardwood floors and a white couch, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with hardwood floors and a white couch.jpg"
- },
- {
- "prompt_en": "a living room with hardwood floors and a white couch, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "indoor",
- "image_name": "a living room with hardwood floors and a white couch.jpg"
- },
- {
- "prompt_en": "two people in a canoe on a lake with mountains in the background",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
- },
- {
- "prompt_en": "two people in a canoe on a lake with mountains in the background, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
- },
- {
- "prompt_en": "two people in a canoe on a lake with mountains in the background, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
- },
- {
- "prompt_en": "two people in a canoe on a lake with mountains in the background, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
- },
- {
- "prompt_en": "two people in a canoe on a lake with mountains in the background, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
- },
- {
- "prompt_en": "two people in a canoe on a lake with mountains in the background, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
- },
- {
- "prompt_en": "two people in a canoe on a lake with mountains in the background, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
- },
- {
- "prompt_en": "two people in a canoe on a lake with mountains in the background, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
- },
- {
- "prompt_en": "an aerial view of a snowy road in a forest",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a snowy road in a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a snowy road in a forest, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a snowy road in a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a snowy road in a forest, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a snowy road in a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a snowy road in a forest, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a snowy road in a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a snowy road in a forest, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a snowy road in a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a snowy road in a forest, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a snowy road in a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a snowy road in a forest, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a snowy road in a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a snowy road in a forest, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a snowy road in a forest.jpg"
- },
- {
- "prompt_en": "a view of a waterfall from a distance",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a view of a waterfall from a distance.jpg"
- },
- {
- "prompt_en": "a view of a waterfall from a distance, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a view of a waterfall from a distance.jpg"
- },
- {
- "prompt_en": "a view of a waterfall from a distance, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a view of a waterfall from a distance.jpg"
- },
- {
- "prompt_en": "a view of a waterfall from a distance, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a view of a waterfall from a distance.jpg"
- },
- {
- "prompt_en": "a view of a waterfall from a distance, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a view of a waterfall from a distance.jpg"
- },
- {
- "prompt_en": "a view of a waterfall from a distance, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a view of a waterfall from a distance.jpg"
- },
- {
- "prompt_en": "a view of a waterfall from a distance, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a view of a waterfall from a distance.jpg"
- },
- {
- "prompt_en": "a view of a waterfall from a distance, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a view of a waterfall from a distance.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a valley",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a valley.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a valley, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a valley.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a valley, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a valley.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a valley, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a valley.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a valley, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a valley.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a valley, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a valley.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a valley, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a valley.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a valley, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a valley.jpg"
- },
- {
- "prompt_en": "an aerial view of a group of islands in the middle of a lake",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
- },
- {
- "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
- },
- {
- "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
- },
- {
- "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
- },
- {
- "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
- },
- {
- "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
- },
- {
- "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
- },
- {
- "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
- },
- {
- "prompt_en": "an aerial view of a rocky beach in indonesia",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a rocky beach in indonesia.jpg"
- },
- {
- "prompt_en": "an aerial view of a rocky beach in indonesia, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a rocky beach in indonesia.jpg"
- },
- {
- "prompt_en": "an aerial view of a rocky beach in indonesia, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a rocky beach in indonesia.jpg"
- },
- {
- "prompt_en": "an aerial view of a rocky beach in indonesia, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a rocky beach in indonesia.jpg"
- },
- {
- "prompt_en": "an aerial view of a rocky beach in indonesia, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a rocky beach in indonesia.jpg"
- },
- {
- "prompt_en": "an aerial view of a rocky beach in indonesia, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a rocky beach in indonesia.jpg"
- },
- {
- "prompt_en": "an aerial view of a rocky beach in indonesia, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a rocky beach in indonesia.jpg"
- },
- {
- "prompt_en": "an aerial view of a rocky beach in indonesia, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a rocky beach in indonesia.jpg"
- },
- {
- "prompt_en": "fireworks in the night sky over a city",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "fireworks in the night sky over a city.jpg"
- },
- {
- "prompt_en": "fireworks in the night sky over a city, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "fireworks in the night sky over a city.jpg"
- },
- {
- "prompt_en": "fireworks in the night sky over a city, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "fireworks in the night sky over a city.jpg"
- },
- {
- "prompt_en": "fireworks in the night sky over a city, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "fireworks in the night sky over a city.jpg"
- },
- {
- "prompt_en": "fireworks in the night sky over a city, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "fireworks in the night sky over a city.jpg"
- },
- {
- "prompt_en": "fireworks in the night sky over a city, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "fireworks in the night sky over a city.jpg"
- },
- {
- "prompt_en": "fireworks in the night sky over a city, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "fireworks in the night sky over a city.jpg"
- },
- {
- "prompt_en": "fireworks in the night sky over a city, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "fireworks in the night sky over a city.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse on a stormy day",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
- },
- {
- "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
- },
- {
- "prompt_en": "a mountain range with a sky background",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with a sky background.jpg"
- },
- {
- "prompt_en": "a mountain range with a sky background, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with a sky background.jpg"
- },
- {
- "prompt_en": "a mountain range with a sky background, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with a sky background.jpg"
- },
- {
- "prompt_en": "a mountain range with a sky background, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with a sky background.jpg"
- },
- {
- "prompt_en": "a mountain range with a sky background, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with a sky background.jpg"
- },
- {
- "prompt_en": "a mountain range with a sky background, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with a sky background.jpg"
- },
- {
- "prompt_en": "a mountain range with a sky background, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with a sky background.jpg"
- },
- {
- "prompt_en": "a mountain range with a sky background, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with a sky background.jpg"
- },
- {
- "prompt_en": "a large bonfire is burning in the night sky",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a large bonfire is burning in the night sky.jpg"
- },
- {
- "prompt_en": "a large bonfire is burning in the night sky, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large bonfire is burning in the night sky.jpg"
- },
- {
- "prompt_en": "a large bonfire is burning in the night sky, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large bonfire is burning in the night sky.jpg"
- },
- {
- "prompt_en": "a large bonfire is burning in the night sky, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large bonfire is burning in the night sky.jpg"
- },
- {
- "prompt_en": "a large bonfire is burning in the night sky, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large bonfire is burning in the night sky.jpg"
- },
- {
- "prompt_en": "a large bonfire is burning in the night sky, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large bonfire is burning in the night sky.jpg"
- },
- {
- "prompt_en": "a large bonfire is burning in the night sky, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large bonfire is burning in the night sky.jpg"
- },
- {
- "prompt_en": "a large bonfire is burning in the night sky, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large bonfire is burning in the night sky.jpg"
- },
- {
- "prompt_en": "a close-up view of the flames of a fireplace",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a close-up view of the flames of a fireplace.jpg"
- },
- {
- "prompt_en": "a close-up view of the flames of a fireplace, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a close-up view of the flames of a fireplace.jpg"
- },
- {
- "prompt_en": "a close-up view of the flames of a fireplace, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a close-up view of the flames of a fireplace.jpg"
- },
- {
- "prompt_en": "a close-up view of the flames of a fireplace, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a close-up view of the flames of a fireplace.jpg"
- },
- {
- "prompt_en": "a close-up view of the flames of a fireplace, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a close-up view of the flames of a fireplace.jpg"
- },
- {
- "prompt_en": "a close-up view of the flames of a fireplace, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a close-up view of the flames of a fireplace.jpg"
- },
- {
- "prompt_en": "a close-up view of the flames of a fireplace, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a close-up view of the flames of a fireplace.jpg"
- },
- {
- "prompt_en": "a close-up view of the flames of a fireplace, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a close-up view of the flames of a fireplace.jpg"
- },
- {
- "prompt_en": "a farm in the middle of the day",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a farm in the middle of the day.jpg"
- },
- {
- "prompt_en": "a farm in the middle of the day, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a farm in the middle of the day.jpg"
- },
- {
- "prompt_en": "a farm in the middle of the day, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a farm in the middle of the day.jpg"
- },
- {
- "prompt_en": "a farm in the middle of the day, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a farm in the middle of the day.jpg"
- },
- {
- "prompt_en": "a farm in the middle of the day, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a farm in the middle of the day.jpg"
- },
- {
- "prompt_en": "a farm in the middle of the day, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a farm in the middle of the day.jpg"
- },
- {
- "prompt_en": "a farm in the middle of the day, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a farm in the middle of the day.jpg"
- },
- {
- "prompt_en": "a farm in the middle of the day, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a farm in the middle of the day.jpg"
- },
- {
- "prompt_en": "a flock of birds flying over a tree at sunset",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a flock of birds flying over a tree at sunset.jpg"
- },
- {
- "prompt_en": "a flock of birds flying over a tree at sunset, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a flock of birds flying over a tree at sunset.jpg"
- },
- {
- "prompt_en": "a flock of birds flying over a tree at sunset, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a flock of birds flying over a tree at sunset.jpg"
- },
- {
- "prompt_en": "a flock of birds flying over a tree at sunset, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a flock of birds flying over a tree at sunset.jpg"
- },
- {
- "prompt_en": "a flock of birds flying over a tree at sunset, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a flock of birds flying over a tree at sunset.jpg"
- },
- {
- "prompt_en": "a flock of birds flying over a tree at sunset, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a flock of birds flying over a tree at sunset.jpg"
- },
- {
- "prompt_en": "a flock of birds flying over a tree at sunset, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a flock of birds flying over a tree at sunset.jpg"
- },
- {
- "prompt_en": "a flock of birds flying over a tree at sunset, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a flock of birds flying over a tree at sunset.jpg"
- },
- {
- "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
- },
- {
- "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
- },
- {
- "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
- },
- {
- "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
- },
- {
- "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
- },
- {
- "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
- },
- {
- "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
- },
- {
- "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
- },
- {
- "prompt_en": "a mountain with snow on it",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a mountain with snow on it.jpg"
- },
- {
- "prompt_en": "a mountain with snow on it, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain with snow on it.jpg"
- },
- {
- "prompt_en": "a mountain with snow on it, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain with snow on it.jpg"
- },
- {
- "prompt_en": "a mountain with snow on it, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain with snow on it.jpg"
- },
- {
- "prompt_en": "a mountain with snow on it, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain with snow on it.jpg"
- },
- {
- "prompt_en": "a mountain with snow on it, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain with snow on it.jpg"
- },
- {
- "prompt_en": "a mountain with snow on it, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain with snow on it.jpg"
- },
- {
- "prompt_en": "a mountain with snow on it, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain with snow on it.jpg"
- },
- {
- "prompt_en": "a bridge that is in the middle of a river",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a bridge that is in the middle of a river.jpg"
- },
- {
- "prompt_en": "a bridge that is in the middle of a river, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a bridge that is in the middle of a river.jpg"
- },
- {
- "prompt_en": "a bridge that is in the middle of a river, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a bridge that is in the middle of a river.jpg"
- },
- {
- "prompt_en": "a bridge that is in the middle of a river, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a bridge that is in the middle of a river.jpg"
- },
- {
- "prompt_en": "a bridge that is in the middle of a river, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a bridge that is in the middle of a river.jpg"
- },
- {
- "prompt_en": "a bridge that is in the middle of a river, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a bridge that is in the middle of a river.jpg"
- },
- {
- "prompt_en": "a bridge that is in the middle of a river, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a bridge that is in the middle of a river.jpg"
- },
- {
- "prompt_en": "a bridge that is in the middle of a river, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a bridge that is in the middle of a river.jpg"
- },
- {
- "prompt_en": "a group of people standing on top of a green hill",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a group of people standing on top of a green hill.jpg"
- },
- {
- "prompt_en": "a group of people standing on top of a green hill, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of people standing on top of a green hill.jpg"
- },
- {
- "prompt_en": "a group of people standing on top of a green hill, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of people standing on top of a green hill.jpg"
- },
- {
- "prompt_en": "a group of people standing on top of a green hill, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of people standing on top of a green hill.jpg"
- },
- {
- "prompt_en": "a group of people standing on top of a green hill, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of people standing on top of a green hill.jpg"
- },
- {
- "prompt_en": "a group of people standing on top of a green hill, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of people standing on top of a green hill.jpg"
- },
- {
- "prompt_en": "a group of people standing on top of a green hill, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of people standing on top of a green hill.jpg"
- },
- {
- "prompt_en": "a group of people standing on top of a green hill, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of people standing on top of a green hill.jpg"
- },
- {
- "prompt_en": "a sandy beach with a wooden pier in the water",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with a wooden pier in the water.jpg"
- },
- {
- "prompt_en": "a sandy beach with a wooden pier in the water, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with a wooden pier in the water.jpg"
- },
- {
- "prompt_en": "a sandy beach with a wooden pier in the water, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with a wooden pier in the water.jpg"
- },
- {
- "prompt_en": "a sandy beach with a wooden pier in the water, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with a wooden pier in the water.jpg"
- },
- {
- "prompt_en": "a sandy beach with a wooden pier in the water, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with a wooden pier in the water.jpg"
- },
- {
- "prompt_en": "a sandy beach with a wooden pier in the water, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with a wooden pier in the water.jpg"
- },
- {
- "prompt_en": "a sandy beach with a wooden pier in the water, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with a wooden pier in the water.jpg"
- },
- {
- "prompt_en": "a sandy beach with a wooden pier in the water, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with a wooden pier in the water.jpg"
- },
- {
- "prompt_en": "a lake surrounded by mountains and flowers",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a lake surrounded by mountains and flowers.jpg"
- },
- {
- "prompt_en": "a lake surrounded by mountains and flowers, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a lake surrounded by mountains and flowers.jpg"
- },
- {
- "prompt_en": "a lake surrounded by mountains and flowers, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a lake surrounded by mountains and flowers.jpg"
- },
- {
- "prompt_en": "a lake surrounded by mountains and flowers, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a lake surrounded by mountains and flowers.jpg"
- },
- {
- "prompt_en": "a lake surrounded by mountains and flowers, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a lake surrounded by mountains and flowers.jpg"
- },
- {
- "prompt_en": "a lake surrounded by mountains and flowers, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a lake surrounded by mountains and flowers.jpg"
- },
- {
- "prompt_en": "a lake surrounded by mountains and flowers, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a lake surrounded by mountains and flowers.jpg"
- },
- {
- "prompt_en": "a lake surrounded by mountains and flowers, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a lake surrounded by mountains and flowers.jpg"
- },
- {
- "prompt_en": "a hot-air balloon flying over a desert landscape",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a hot-air balloon flying over a desert landscape.jpg"
- },
- {
- "prompt_en": "a hot-air balloon flying over a desert landscape, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a hot-air balloon flying over a desert landscape.jpg"
- },
- {
- "prompt_en": "a hot-air balloon flying over a desert landscape, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a hot-air balloon flying over a desert landscape.jpg"
- },
- {
- "prompt_en": "a hot-air balloon flying over a desert landscape, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a hot-air balloon flying over a desert landscape.jpg"
- },
- {
- "prompt_en": "a hot-air balloon flying over a desert landscape, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a hot-air balloon flying over a desert landscape.jpg"
- },
- {
- "prompt_en": "a hot-air balloon flying over a desert landscape, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a hot-air balloon flying over a desert landscape.jpg"
- },
- {
- "prompt_en": "a hot-air balloon flying over a desert landscape, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a hot-air balloon flying over a desert landscape.jpg"
- },
- {
- "prompt_en": "a hot-air balloon flying over a desert landscape, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a hot-air balloon flying over a desert landscape.jpg"
- },
- {
- "prompt_en": "several hot air balloons flying over a city",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "several hot air balloons flying over a city.jpg"
- },
- {
- "prompt_en": "several hot air balloons flying over a city, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "several hot air balloons flying over a city.jpg"
- },
- {
- "prompt_en": "several hot air balloons flying over a city, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "several hot air balloons flying over a city.jpg"
- },
- {
- "prompt_en": "several hot air balloons flying over a city, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "several hot air balloons flying over a city.jpg"
- },
- {
- "prompt_en": "several hot air balloons flying over a city, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "several hot air balloons flying over a city.jpg"
- },
- {
- "prompt_en": "several hot air balloons flying over a city, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "several hot air balloons flying over a city.jpg"
- },
- {
- "prompt_en": "several hot air balloons flying over a city, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "several hot air balloons flying over a city.jpg"
- },
- {
- "prompt_en": "several hot air balloons flying over a city, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "several hot air balloons flying over a city.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a field",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a field.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a field, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a field.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a field, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a field.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a field, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a field.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a field, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a field.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a field, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a field.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a field, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a field.jpg"
- },
- {
- "prompt_en": "a group of hot air balloons flying over a field, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a group of hot air balloons flying over a field.jpg"
- },
- {
- "prompt_en": "a large wave crashes over a rocky cliff",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes over a rocky cliff.jpg"
- },
- {
- "prompt_en": "a large wave crashes over a rocky cliff, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes over a rocky cliff.jpg"
- },
- {
- "prompt_en": "a large wave crashes over a rocky cliff, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes over a rocky cliff.jpg"
- },
- {
- "prompt_en": "a large wave crashes over a rocky cliff, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes over a rocky cliff.jpg"
- },
- {
- "prompt_en": "a large wave crashes over a rocky cliff, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes over a rocky cliff.jpg"
- },
- {
- "prompt_en": "a large wave crashes over a rocky cliff, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes over a rocky cliff.jpg"
- },
- {
- "prompt_en": "a large wave crashes over a rocky cliff, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes over a rocky cliff.jpg"
- },
- {
- "prompt_en": "a large wave crashes over a rocky cliff, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave crashes over a rocky cliff.jpg"
- },
- {
- "prompt_en": "the sun is setting over a lake in the mountains",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "the sun is setting over a lake in the mountains.jpg"
- },
- {
- "prompt_en": "the sun is setting over a lake in the mountains, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is setting over a lake in the mountains.jpg"
- },
- {
- "prompt_en": "the sun is setting over a lake in the mountains, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is setting over a lake in the mountains.jpg"
- },
- {
- "prompt_en": "the sun is setting over a lake in the mountains, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is setting over a lake in the mountains.jpg"
- },
- {
- "prompt_en": "the sun is setting over a lake in the mountains, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is setting over a lake in the mountains.jpg"
- },
- {
- "prompt_en": "the sun is setting over a lake in the mountains, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is setting over a lake in the mountains.jpg"
- },
- {
- "prompt_en": "the sun is setting over a lake in the mountains, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is setting over a lake in the mountains.jpg"
- },
- {
- "prompt_en": "the sun is setting over a lake in the mountains, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is setting over a lake in the mountains.jpg"
- },
- {
- "prompt_en": "a mountain range with snow on the ground",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with snow on the ground.jpg"
- },
- {
- "prompt_en": "a mountain range with snow on the ground, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with snow on the ground.jpg"
- },
- {
- "prompt_en": "a mountain range with snow on the ground, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with snow on the ground.jpg"
- },
- {
- "prompt_en": "a mountain range with snow on the ground, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with snow on the ground.jpg"
- },
- {
- "prompt_en": "a mountain range with snow on the ground, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with snow on the ground.jpg"
- },
- {
- "prompt_en": "a mountain range with snow on the ground, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with snow on the ground.jpg"
- },
- {
- "prompt_en": "a mountain range with snow on the ground, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with snow on the ground.jpg"
- },
- {
- "prompt_en": "a mountain range with snow on the ground, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain range with snow on the ground.jpg"
- },
- {
- "prompt_en": "sun rays shining through clouds over a lake",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "sun rays shining through clouds over a lake.jpg"
- },
- {
- "prompt_en": "sun rays shining through clouds over a lake, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "sun rays shining through clouds over a lake.jpg"
- },
- {
- "prompt_en": "sun rays shining through clouds over a lake, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "sun rays shining through clouds over a lake.jpg"
- },
- {
- "prompt_en": "sun rays shining through clouds over a lake, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "sun rays shining through clouds over a lake.jpg"
- },
- {
- "prompt_en": "sun rays shining through clouds over a lake, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "sun rays shining through clouds over a lake.jpg"
- },
- {
- "prompt_en": "sun rays shining through clouds over a lake, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "sun rays shining through clouds over a lake.jpg"
- },
- {
- "prompt_en": "sun rays shining through clouds over a lake, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "sun rays shining through clouds over a lake.jpg"
- },
- {
- "prompt_en": "sun rays shining through clouds over a lake, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "sun rays shining through clouds over a lake.jpg"
- },
- {
- "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
- },
- {
- "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
- },
- {
- "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
- },
- {
- "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
- },
- {
- "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
- },
- {
- "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
- },
- {
- "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
- },
- {
- "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
- },
- {
- "prompt_en": "a foggy road with trees in the distance",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a foggy road with trees in the distance.jpg"
- },
- {
- "prompt_en": "a foggy road with trees in the distance, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy road with trees in the distance.jpg"
- },
- {
- "prompt_en": "a foggy road with trees in the distance, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy road with trees in the distance.jpg"
- },
- {
- "prompt_en": "a foggy road with trees in the distance, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy road with trees in the distance.jpg"
- },
- {
- "prompt_en": "a foggy road with trees in the distance, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy road with trees in the distance.jpg"
- },
- {
- "prompt_en": "a foggy road with trees in the distance, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy road with trees in the distance.jpg"
- },
- {
- "prompt_en": "a foggy road with trees in the distance, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy road with trees in the distance.jpg"
- },
- {
- "prompt_en": "a foggy road with trees in the distance, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy road with trees in the distance.jpg"
- },
- {
- "prompt_en": "two swans swimming on a lake in the fog",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "two swans swimming on a lake in the fog.jpg"
- },
- {
- "prompt_en": "two swans swimming on a lake in the fog, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two swans swimming on a lake in the fog.jpg"
- },
- {
- "prompt_en": "two swans swimming on a lake in the fog, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two swans swimming on a lake in the fog.jpg"
- },
- {
- "prompt_en": "two swans swimming on a lake in the fog, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two swans swimming on a lake in the fog.jpg"
- },
- {
- "prompt_en": "two swans swimming on a lake in the fog, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two swans swimming on a lake in the fog.jpg"
- },
- {
- "prompt_en": "two swans swimming on a lake in the fog, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two swans swimming on a lake in the fog.jpg"
- },
- {
- "prompt_en": "two swans swimming on a lake in the fog, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two swans swimming on a lake in the fog.jpg"
- },
- {
- "prompt_en": "two swans swimming on a lake in the fog, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "two swans swimming on a lake in the fog.jpg"
- },
- {
- "prompt_en": "the sun is shining through the trees near a waterfall",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "the sun is shining through the trees near a waterfall.jpg"
- },
- {
- "prompt_en": "the sun is shining through the trees near a waterfall, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is shining through the trees near a waterfall.jpg"
- },
- {
- "prompt_en": "the sun is shining through the trees near a waterfall, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is shining through the trees near a waterfall.jpg"
- },
- {
- "prompt_en": "the sun is shining through the trees near a waterfall, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is shining through the trees near a waterfall.jpg"
- },
- {
- "prompt_en": "the sun is shining through the trees near a waterfall, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is shining through the trees near a waterfall.jpg"
- },
- {
- "prompt_en": "the sun is shining through the trees near a waterfall, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is shining through the trees near a waterfall.jpg"
- },
- {
- "prompt_en": "the sun is shining through the trees near a waterfall, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is shining through the trees near a waterfall.jpg"
- },
- {
- "prompt_en": "the sun is shining through the trees near a waterfall, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "the sun is shining through the trees near a waterfall.jpg"
- },
- {
- "prompt_en": "a sandy beach with palm trees on the shore",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with palm trees on the shore.jpg"
- },
- {
- "prompt_en": "a sandy beach with palm trees on the shore, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with palm trees on the shore.jpg"
- },
- {
- "prompt_en": "a sandy beach with palm trees on the shore, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with palm trees on the shore.jpg"
- },
- {
- "prompt_en": "a sandy beach with palm trees on the shore, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with palm trees on the shore.jpg"
- },
- {
- "prompt_en": "a sandy beach with palm trees on the shore, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with palm trees on the shore.jpg"
- },
- {
- "prompt_en": "a sandy beach with palm trees on the shore, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with palm trees on the shore.jpg"
- },
- {
- "prompt_en": "a sandy beach with palm trees on the shore, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with palm trees on the shore.jpg"
- },
- {
- "prompt_en": "a sandy beach with palm trees on the shore, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a sandy beach with palm trees on the shore.jpg"
- },
- {
- "prompt_en": "an aerial view of a body of water and a beach",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a body of water and a beach.jpg"
- },
- {
- "prompt_en": "an aerial view of a body of water and a beach, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a body of water and a beach.jpg"
- },
- {
- "prompt_en": "an aerial view of a body of water and a beach, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a body of water and a beach.jpg"
- },
- {
- "prompt_en": "an aerial view of a body of water and a beach, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a body of water and a beach.jpg"
- },
- {
- "prompt_en": "an aerial view of a body of water and a beach, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a body of water and a beach.jpg"
- },
- {
- "prompt_en": "an aerial view of a body of water and a beach, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a body of water and a beach.jpg"
- },
- {
- "prompt_en": "an aerial view of a body of water and a beach, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a body of water and a beach.jpg"
- },
- {
- "prompt_en": "an aerial view of a body of water and a beach, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a body of water and a beach.jpg"
- },
- {
- "prompt_en": "a foggy field that has trees in the grass",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a foggy field that has trees in the grass.jpg"
- },
- {
- "prompt_en": "a foggy field that has trees in the grass, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy field that has trees in the grass.jpg"
- },
- {
- "prompt_en": "a foggy field that has trees in the grass, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy field that has trees in the grass.jpg"
- },
- {
- "prompt_en": "a foggy field that has trees in the grass, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy field that has trees in the grass.jpg"
- },
- {
- "prompt_en": "a foggy field that has trees in the grass, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy field that has trees in the grass.jpg"
- },
- {
- "prompt_en": "a foggy field that has trees in the grass, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy field that has trees in the grass.jpg"
- },
- {
- "prompt_en": "a foggy field that has trees in the grass, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy field that has trees in the grass.jpg"
- },
- {
- "prompt_en": "a foggy field that has trees in the grass, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy field that has trees in the grass.jpg"
- },
- {
- "prompt_en": "a foggy landscape with trees and hills in the distance",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a foggy landscape with trees and hills in the distance.jpg"
- },
- {
- "prompt_en": "a foggy landscape with trees and hills in the distance, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy landscape with trees and hills in the distance.jpg"
- },
- {
- "prompt_en": "a foggy landscape with trees and hills in the distance, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy landscape with trees and hills in the distance.jpg"
- },
- {
- "prompt_en": "a foggy landscape with trees and hills in the distance, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy landscape with trees and hills in the distance.jpg"
- },
- {
- "prompt_en": "a foggy landscape with trees and hills in the distance, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy landscape with trees and hills in the distance.jpg"
- },
- {
- "prompt_en": "a foggy landscape with trees and hills in the distance, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy landscape with trees and hills in the distance.jpg"
- },
- {
- "prompt_en": "a foggy landscape with trees and hills in the distance, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy landscape with trees and hills in the distance.jpg"
- },
- {
- "prompt_en": "a foggy landscape with trees and hills in the distance, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a foggy landscape with trees and hills in the distance.jpg"
- },
- {
- "prompt_en": "a large wave in the ocean with a lot of spray coming from it",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
- },
- {
- "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
- },
- {
- "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
- },
- {
- "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
- },
- {
- "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
- },
- {
- "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
- },
- {
- "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
- },
- {
- "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
- },
- {
- "prompt_en": "a waterfall is shown in the middle of a lush green hillside",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "an aerial view of a curvy road in the middle of a forest",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
- },
- {
- "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
- },
- {
- "prompt_en": "a mountain covered in snow with evergreen trees",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a mountain covered in snow with evergreen trees.jpg"
- },
- {
- "prompt_en": "a mountain covered in snow with evergreen trees, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain covered in snow with evergreen trees.jpg"
- },
- {
- "prompt_en": "a mountain covered in snow with evergreen trees, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain covered in snow with evergreen trees.jpg"
- },
- {
- "prompt_en": "a mountain covered in snow with evergreen trees, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain covered in snow with evergreen trees.jpg"
- },
- {
- "prompt_en": "a mountain covered in snow with evergreen trees, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain covered in snow with evergreen trees.jpg"
- },
- {
- "prompt_en": "a mountain covered in snow with evergreen trees, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain covered in snow with evergreen trees.jpg"
- },
- {
- "prompt_en": "a mountain covered in snow with evergreen trees, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain covered in snow with evergreen trees.jpg"
- },
- {
- "prompt_en": "a mountain covered in snow with evergreen trees, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a mountain covered in snow with evergreen trees.jpg"
- },
- {
- "prompt_en": "a very large waterfall in the middle of the day",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a very large waterfall in the middle of the day.jpg"
- },
- {
- "prompt_en": "a very large waterfall in the middle of the day, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a very large waterfall in the middle of the day.jpg"
- },
- {
- "prompt_en": "a very large waterfall in the middle of the day, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a very large waterfall in the middle of the day.jpg"
- },
- {
- "prompt_en": "a very large waterfall in the middle of the day, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a very large waterfall in the middle of the day.jpg"
- },
- {
- "prompt_en": "a very large waterfall in the middle of the day, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a very large waterfall in the middle of the day.jpg"
- },
- {
- "prompt_en": "a very large waterfall in the middle of the day, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a very large waterfall in the middle of the day.jpg"
- },
- {
- "prompt_en": "a very large waterfall in the middle of the day, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a very large waterfall in the middle of the day.jpg"
- },
- {
- "prompt_en": "a very large waterfall in the middle of the day, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a very large waterfall in the middle of the day.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a lush green hillside",
- "dimension": [
- "i2v_background"
- ],
- "image_type": "scenery",
- "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a lush green hillside, camera pans left",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a lush green hillside, camera pans right",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a lush green hillside, camera tilts up",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a lush green hillside, camera tilts down",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a lush green hillside, camera zooms in",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a lush green hillside, camera zooms out",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a large waterfall in the middle of a lush green hillside, camera static",
- "dimension": [
- "camera_motion"
- ],
- "image_type": "scenery",
- "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
- },
- {
- "prompt_en": "a brown bear in the water with a fish in its mouth",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a brown bear in the water with a fish in its mouth.jpg"
- },
- {
- "prompt_en": "a close-up of a hippopotamus eating grass in a field",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a close-up of a hippopotamus eating grass in a field.jpg"
- },
- {
- "prompt_en": "a sea turtle swimming in the ocean under the water",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a sea turtle swimming in the ocean under the water.jpg"
- },
- {
- "prompt_en": "two bees are flying over a lavender plant",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "two bees are flying over a lavender plant.jpg"
- },
- {
- "prompt_en": "the otter is standing in the water",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "the otter is standing in the water.jpg"
- },
- {
- "prompt_en": "a dog carrying a soccer ball in its mouth",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a dog carrying a soccer ball in its mouth.jpg"
- },
- {
- "prompt_en": "an eagle is flying over a mountain with trees in the background",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "an eagle is flying over a mountain with trees in the background.jpg"
- },
- {
- "prompt_en": "a couple of horses are running in the dirt",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a couple of horses are running in the dirt.jpg"
- },
- {
- "prompt_en": "a highland cow with long horns standing in a field",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a highland cow with long horns standing in a field.jpg"
- },
- {
- "prompt_en": "a monkey is holding a banana in its mouth",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a monkey is holding a banana in its mouth.jpg"
- },
- {
- "prompt_en": "a large rhino grazing in the grass near a bush",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a large rhino grazing in the grass near a bush.jpg"
- },
- {
- "prompt_en": "a butterfly sits on top of a purple flower",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a butterfly sits on top of a purple flower.jpg"
- },
- {
- "prompt_en": "an alligator is covered in green plants in the water",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "an alligator is covered in green plants in the water.jpg"
- },
- {
- "prompt_en": "a red panda eating bamboo in a zoo",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a red panda eating bamboo in a zoo.jpg"
- },
- {
- "prompt_en": "a monochromatic video capturing a cat's gaze into the camera",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a monochromatic video capturing a cat's gaze into the camera.jpg"
- },
- {
- "prompt_en": "a frog sitting on top of water lily leaves",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a frog sitting on top of water lily leaves.jpg"
- },
- {
- "prompt_en": "a lion is roaring in the wild",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a lion is roaring in the wild.jpg"
- },
- {
- "prompt_en": "a seagull is flying towards a person's hand",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a seagull is flying towards a person's hand.jpg"
- },
- {
- "prompt_en": "a yellow and white jellyfish is floating in the ocean",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a yellow and white jellyfish is floating in the ocean.jpg"
- },
- {
- "prompt_en": "a group of jellyfish swimming in an aquarium",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a group of jellyfish swimming in an aquarium.jpg"
- },
- {
- "prompt_en": "a clown fish hiding in a purple anemone",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a clown fish hiding in a purple anemone.jpg"
- },
- {
- "prompt_en": "a snake sitting on the ground next to a bowl",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a snake sitting on the ground next to a bowl.jpg"
- },
- {
- "prompt_en": "a brown and white cow eating hay",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a brown and white cow eating hay.jpg"
- },
- {
- "prompt_en": "a seal swimming in the water",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a seal swimming in the water.jpg"
- },
- {
- "prompt_en": "a panda bear is eating a piece of bamboo",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a panda bear is eating a piece of bamboo.jpg"
- },
- {
- "prompt_en": "a small bird sits on a moss covered branch",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a small bird sits on a moss covered branch.jpg"
- },
- {
- "prompt_en": "a bird with a fish in its beak flying over a field",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a bird with a fish in its beak flying over a field.jpg"
- },
- {
- "prompt_en": "a large flock of birds flying in the sky",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a large flock of birds flying in the sky.jpg"
- },
- {
- "prompt_en": "a bald eagle flying over a tree filled forest",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a bald eagle flying over a tree filled forest.jpg"
- },
- {
- "prompt_en": "a giraffe walking in a field",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a giraffe walking in a field.jpg"
- },
- {
- "prompt_en": "a lioness yawning in a field",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a lioness yawning in a field.jpg"
- },
- {
- "prompt_en": "a little crab scurried on the sandy beach",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a little crab scurried on the sandy beach.jpg"
- },
- {
- "prompt_en": "a warthog is walking in the grass",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a warthog is walking in the grass.jpg"
- },
- {
- "prompt_en": "a penguin walking on a beach near the water",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a penguin walking on a beach near the water.jpg"
- },
- {
- "prompt_en": "a tiger walking through a wooded area",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a tiger walking through a wooded area.jpg"
- },
- {
- "prompt_en": "a tiger walking on a dirt path in the woods",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a tiger walking on a dirt path in the woods.jpg"
- },
- {
- "prompt_en": "a small monkey holding a piece of food in it's mouth",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a small monkey holding a piece of food in it's mouth.jpg"
- },
- {
- "prompt_en": "a squirrel sitting on the ground eating a piece of bread",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a squirrel sitting on the ground eating a piece of bread.jpg"
- },
- {
- "prompt_en": "a group of fish swimming over a coral reef",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a group of fish swimming over a coral reef.jpg"
- },
- {
- "prompt_en": "a toad is sitting on top of some moss",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a toad is sitting on top of some moss.jpg"
- },
- {
- "prompt_en": "a great white shark swimming in the ocean",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a great white shark swimming in the ocean.jpg"
- },
- {
- "prompt_en": "a group of camels resting in the desert",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a group of camels resting in the desert.jpg"
- },
- {
- "prompt_en": "two sheep grazing in the grass next to a wooden bridge",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "two sheep grazing in the grass next to a wooden bridge.jpg"
- },
- {
- "prompt_en": "an elephant walking through a forest",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "an elephant walking through a forest.jpg"
- },
- {
- "prompt_en": "a white rooster standing in a grassy field",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a white rooster standing in a grassy field.jpg"
- },
- {
- "prompt_en": "a zebra walking across a dirt road near a field",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "animal",
- "image_name": "a zebra walking across a dirt road near a field.jpg"
- },
- {
- "prompt_en": "cars are driving down a street lined with tall trees",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "cars are driving down a street lined with tall trees.jpg"
- },
- {
- "prompt_en": "the cars on the street are waiting for the traffic lights",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "the cars on the street are waiting for the traffic lights.jpg"
- },
- {
- "prompt_en": "a bicycle leaning against a fence in the snow",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a bicycle leaning against a fence in the snow.jpg"
- },
- {
- "prompt_en": "a blue fishing boat is navigating in the ocean next to a cruise ship",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a blue fishing boat is navigating in the ocean next to a cruise ship.jpg"
- },
- {
- "prompt_en": "a blue car driving down a dirt road near train tracks",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a blue car driving down a dirt road near train tracks.jpg"
- },
- {
- "prompt_en": "a sailboat is drifting on the ocean",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a sailboat is drifting on the ocean.jpg"
- },
- {
- "prompt_en": "a couple of boats floating on a body of water",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a couple of boats floating on a body of water.jpg"
- },
- {
- "prompt_en": "a city street with cars driving in the rain",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a city street with cars driving in the rain.jpg"
- },
- {
- "prompt_en": "a red and white tram traveling down a snowy street",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a red and white tram traveling down a snowy street.jpg"
- },
- {
- "prompt_en": "a city bus driving down a snowy street at night",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a city bus driving down a snowy street at night.jpg"
- },
- {
- "prompt_en": "a green toy car is sitting on the ground",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a green toy car is sitting on the ground.jpg"
- },
- {
- "prompt_en": "a train traveling down tracks through the woods with leaves on the ground",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a train traveling down tracks through the woods with leaves on the ground.jpg"
- },
- {
- "prompt_en": "a man in a small boat fishing in the ocean",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a man in a small boat fishing in the ocean.jpg"
- },
- {
- "prompt_en": "an airplane is flying through the sky at sunset",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "an airplane is flying through the sky at sunset.jpg"
- },
- {
- "prompt_en": "an old rusty car sits in the middle of a field",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "an old rusty car sits in the middle of a field.jpg"
- },
- {
- "prompt_en": "a motorcycle driving down a road",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a motorcycle driving down a road.jpg"
- },
- {
- "prompt_en": "a blue train traveling through a lush green area",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a blue train traveling through a lush green area.jpg"
- },
- {
- "prompt_en": "a white car is swiftly driving on a dirt road near a bush, kicking up dust",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a white car is swiftly driving on a dirt road near a bush, kicking up dust.jpg"
- },
- {
- "prompt_en": "a large cargo ship sailing in the water",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a large cargo ship sailing in the water.jpg"
- },
- {
- "prompt_en": "the red Alfa sports car is speeding down the road",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "the red Alfa sports car is speeding down the road.jpg"
- },
- {
- "prompt_en": "two cars that have been involved in a violent collision",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "two cars that have been involved in a violent collision.jpg"
- },
- {
- "prompt_en": "a red double decker bus driving down a street",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a red double decker bus driving down a street.jpg"
- },
- {
- "prompt_en": "A red sports car driving through sand, kicking up a large amount of dust",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "A red sports car driving through sand, kicking up a large amount of dust.jpg"
- },
- {
- "prompt_en": "a yellow toy car parked on a rock near the water",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a yellow toy car parked on a rock near the water.jpg"
- },
- {
- "prompt_en": "a space shuttle taking off into the sky",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a space shuttle taking off into the sky.jpg"
- },
- {
- "prompt_en": "a steam train traveling through the woods",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a steam train traveling through the woods.jpg"
- },
- {
- "prompt_en": "a group of buses parked at a bus station",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a group of buses parked at a bus station.jpg"
- },
- {
- "prompt_en": "A bunch of cars are driving on a highway",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "A bunch of cars are driving on a highway.jpg"
- },
- {
- "prompt_en": "a white and blue airplane flying in the sky",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "a white and blue airplane flying in the sky.jpg"
- },
- {
- "prompt_en": "A space station orbited above the Earth",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "A space station orbited above the Earth.jpg"
- },
- {
- "prompt_en": "A yellow boat is cruising in front of a bridge",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "transportation",
- "image_name": "A yellow boat is cruising in front of a bridge.jpg"
- },
- {
- "prompt_en": "tangerines in a metal bowl on a table",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "tangerines in a metal bowl on a table.jpg"
- },
- {
- "prompt_en": "a shadow of a hand reaching for a leaf",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "other",
- "image_name": "a shadow of a hand reaching for a leaf.jpg"
- },
- {
- "prompt_en": "A teddy bear is climbing over a wooden fence",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "other",
- "image_name": "A teddy bear is climbing over a wooden fence.jpg"
- },
- {
- "prompt_en": "a book on fire with flames coming out of it",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "other",
- "image_name": "a book on fire with flames coming out of it.jpg"
- },
- {
- "prompt_en": "a close-up of a pink rose with water droplets on it",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a close-up of a pink rose with water droplets on it.jpg"
- },
- {
- "prompt_en": "a person is cooking meat on a grill with flames",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a person is cooking meat on a grill with flames.jpg"
- },
- {
- "prompt_en": "a snowman wearing a santa hat and scarf",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "other",
- "image_name": "a snowman wearing a santa hat and scarf.jpg"
- },
- {
- "prompt_en": "a person holding a sparkler in their hand",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "other",
- "image_name": "a person holding a sparkler in their hand.jpg"
- },
- {
- "prompt_en": "a teddy bear sitting on a moss covered ground",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "other",
- "image_name": "a teddy bear sitting on a moss covered ground.jpg"
- },
- {
- "prompt_en": "a statue of a lion is sitting on a pedestal",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "other",
- "image_name": "a statue of a lion is sitting on a pedestal.jpg"
- },
- {
- "prompt_en": "metal balls are suspended in the air",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "other",
- "image_name": "metal balls are suspended in the air.jpg"
- },
- {
- "prompt_en": "a close up of a bunch of green grapes",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a close up of a bunch of green grapes.jpg"
- },
- {
- "prompt_en": "a close-up view of a green plant with unfurled fronds",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a close-up view of a green plant with unfurled fronds.jpg"
- },
- {
- "prompt_en": "an orange mushroom sitting on top of a tree stump in the woods",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "an orange mushroom sitting on top of a tree stump in the woods.jpg"
- },
- {
- "prompt_en": "a stack of pancakes covered in syrup and fruit",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a stack of pancakes covered in syrup and fruit.jpg"
- },
- {
- "prompt_en": "a plate of spaghetti with spinach and tomatoes",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a plate of spaghetti with spinach and tomatoes.jpg"
- },
- {
- "prompt_en": "a pink lotus flower in the middle of a pond",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a pink lotus flower in the middle of a pond.jpg"
- },
- {
- "prompt_en": "a person holding a sparkler in front of a sunset",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "other",
- "image_name": "a person holding a sparkler in front of a sunset.jpg"
- },
- {
- "prompt_en": "a pink rose is blooming in a garden",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a pink rose is blooming in a garden.jpg"
- },
- {
- "prompt_en": "a snow man holding a lantern in the snow",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "other",
- "image_name": "a snow man holding a lantern in the snow.jpg"
- },
- {
- "prompt_en": "a stack of chocolate cookies with a bite taken out of it",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a stack of chocolate cookies with a bite taken out of it.jpg"
- },
- {
- "prompt_en": "a white plate topped with eggs, toast, tomatoes, and a sausage",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a white plate topped with eggs, toast, tomatoes, and a sausage.jpg"
- },
- {
- "prompt_en": "a yellow water lily is floating in a pond",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a yellow water lily is floating in a pond.jpg"
- },
- {
- "prompt_en": "an astronaut floating in space with the earth in the background",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "an astronaut floating in space with the earth in the background.jpg"
- },
- {
- "prompt_en": "A little girl, lost in thought, is quietly sitting on the bus",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "A little girl, lost in thought, is quietly sitting on the bus.jpg"
- },
- {
- "prompt_en": "a man holding a tray in front of a brick wall",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man holding a tray in front of a brick wall.jpg"
- },
- {
- "prompt_en": "an older man playing a saxophone on the street",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "an older man playing a saxophone on the street.jpg"
- },
- {
- "prompt_en": "an older man jogging by the water",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "an older man jogging by the water.jpg"
- },
- {
- "prompt_en": "a person riding a skateboard on a concrete floor",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a person riding a skateboard on a concrete floor.jpg"
- },
- {
- "prompt_en": "a woman with long black hair is posing for a picture",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman with long black hair is posing for a picture.jpg"
- },
- {
- "prompt_en": "a woman sitting on the ground in front of a guitar",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman sitting on the ground in front of a guitar.jpg"
- },
- {
- "prompt_en": "a little girl wearing a purple helmet riding a blue bike",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a little girl wearing a purple helmet riding a blue bike.jpg"
- },
- {
- "prompt_en": "a young boy is jumping in the mud",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a young boy is jumping in the mud.jpg"
- },
- {
- "prompt_en": "a man sitting in the driver's seat of a car wearing sunglasses",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man sitting in the driver's seat of a car wearing sunglasses.jpg"
- },
- {
- "prompt_en": "a little boy jumping in the air over a puddle of water",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a little boy jumping in the air over a puddle of water.jpg"
- },
- {
- "prompt_en": "a woman with afro hair is smiling while wearing earphones",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman with afro hair is smiling while wearing earphones.jpg"
- },
- {
- "prompt_en": "a smiling woman with her hands clasped",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a smiling woman with her hands clasped.jpg"
- },
- {
- "prompt_en": "a young boy standing in a field with horses in the background",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a young boy standing in a field with horses in the background.jpg"
- },
- {
- "prompt_en": "a young man is covered in colored powder",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a young man is covered in colored powder.jpg"
- },
- {
- "prompt_en": "a woman with curly hair is drinking a beer",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman with curly hair is drinking a beer.jpg"
- },
- {
- "prompt_en": "an old man standing in the middle of a field holding a bunch of plants",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "an old man standing in the middle of a field holding a bunch of plants.jpg"
- },
- {
- "prompt_en": "a man standing on a boat with a net",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man standing on a boat with a net.jpg"
- },
- {
- "prompt_en": "a woman in a hat is putting salt into a basket",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman in a hat is putting salt into a basket.jpg"
- },
- {
- "prompt_en": "a young girl smelling a pink flower",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a young girl smelling a pink flower.jpg"
- },
- {
- "prompt_en": "a young boy leaning on a wooden pole",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a young boy leaning on a wooden pole.jpg"
- },
- {
- "prompt_en": "a man in a hat sitting in front of a brick oven",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man in a hat sitting in front of a brick oven.jpg"
- },
- {
- "prompt_en": "a man in a mexican outfit holding an acoustic guitar",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man in a mexican outfit holding an acoustic guitar.jpg"
- },
- {
- "prompt_en": "a snowboarder is in the air doing a trick",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a snowboarder is in the air doing a trick.jpg"
- },
- {
- "prompt_en": "a man riding a horse with a spear in his hand",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man riding a horse with a spear in his hand.jpg"
- },
- {
- "prompt_en": "a woman carrying a bundle of plants over their head",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman carrying a bundle of plants over their head.jpg"
- },
- {
- "prompt_en": "a person jumping in the air over a fence",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a person jumping in the air over a fence.jpg"
- },
- {
- "prompt_en": "a man on a surfboard riding a wave in the ocean",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man on a surfboard riding a wave in the ocean.jpg"
- },
- {
- "prompt_en": "a man sitting on steps playing an acoustic guitar",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man sitting on steps playing an acoustic guitar.jpg"
- },
- {
- "prompt_en": "a man swinging a tennis racquet at a tennis ball",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man swinging a tennis racquet at a tennis ball.jpg"
- },
- {
- "prompt_en": "a man riding a mountain bike on top of a rocky hill",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man riding a mountain bike on top of a rocky hill.jpg"
- },
- {
- "prompt_en": "a man riding a bike down a street",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man riding a bike down a street.jpg"
- },
- {
- "prompt_en": "a man is running on a dirt road",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man is running on a dirt road.jpg"
- },
- {
- "prompt_en": "A man in a black suit and a sombrero, shouting loudly",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "A man in a black suit and a sombrero, shouting loudly.jpg"
- },
- {
- "prompt_en": "a man standing on top of a sand dune in the desert",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man standing on top of a sand dune in the desert.jpg"
- },
- {
- "prompt_en": "a person riding a motorcycle down a road",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a person riding a motorcycle down a road.jpg"
- },
- {
- "prompt_en": "a man standing on top of a mountain with a backpack",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man standing on top of a mountain with a backpack.jpg"
- },
- {
- "prompt_en": "a man with a skull face paint smoking a cigar and holding a guitar",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man with a skull face paint smoking a cigar and holding a guitar.jpg"
- },
- {
- "prompt_en": "a man in sunglasses laying on a wooden bench",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man in sunglasses laying on a wooden bench.jpg"
- },
- {
- "prompt_en": "an older woman sitting in a room with a cigarette in her hand",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "an older woman sitting in a room with a cigarette in her hand.jpg"
- },
- {
- "prompt_en": "a man sitting on the ground playing a musical instrument",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man sitting on the ground playing a musical instrument.jpg"
- },
- {
- "prompt_en": "a person riding a horse in a polo match",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a person riding a horse in a polo match.jpg"
- },
- {
- "prompt_en": "a woman in a kimono holding an umbrella",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman in a kimono holding an umbrella.jpg"
- },
- {
- "prompt_en": "a person riding a dirt bike",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a person riding a dirt bike.jpg"
- },
- {
- "prompt_en": "a person riding an atv on a dirt track",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a person riding an atv on a dirt track.jpg"
- },
- {
- "prompt_en": "a person riding a wave on a surfboard",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a person riding a wave on a surfboard.jpg"
- },
- {
- "prompt_en": "a woman in a wetsuit is swimming in the ocean",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman in a wetsuit is swimming in the ocean.jpg"
- },
- {
- "prompt_en": "a man snorkling in the ocean",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a man snorkling in the ocean.jpg"
- },
- {
- "prompt_en": "a beautiful woman in a blue sari posing in front of a wall",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a beautiful woman in a blue sari posing in front of a wall.jpg"
- },
- {
- "prompt_en": "a woman wearing a shawl in front of a mountain",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman wearing a shawl in front of a mountain.jpg"
- },
- {
- "prompt_en": "a woman is making bread in an oven",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman is making bread in an oven.jpg"
- },
- {
- "prompt_en": "a woman smiles while holding a yellow flower",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman smiles while holding a yellow flower.jpg"
- },
- {
- "prompt_en": "A young boy is lifting a bundle of dry grass wrapped in waterproof fabric over his head",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "A young boy is lifting a bundle of dry grass wrapped in waterproof fabric over his head.jpg"
- },
- {
- "prompt_en": "two people performing a sword fight in front of a forest",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "two people performing a sword fight in front of a forest.jpg"
- },
- {
- "prompt_en": "a woman in a colorful shirt is cooking food",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman in a colorful shirt is cooking food.jpg"
- },
- {
- "prompt_en": "an older woman is drinking a bottle of water",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "an older woman is drinking a bottle of water.jpg"
- },
- {
- "prompt_en": "a smiling woman sitting at a table with food and drinks",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a smiling woman sitting at a table with food and drinks.jpg"
- },
- {
- "prompt_en": "a woman wearing a hijab reading a book on the beach",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman wearing a hijab reading a book on the beach.jpg"
- },
- {
- "prompt_en": "a woman wearing a headscarf is reaching for an olive tree",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman wearing a headscarf is reaching for an olive tree.jpg"
- },
- {
- "prompt_en": "a woman in a white dress jumping in the air in a field of pink flowers",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman in a white dress jumping in the air in a field of pink flowers.jpg"
- },
- {
- "prompt_en": "a woman wearing a conical hat sits on a boat",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman wearing a conical hat sits on a boat.jpg"
- },
- {
- "prompt_en": "an older woman sitting in front of an old building",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "an older woman sitting in front of an old building.jpg"
- },
- {
- "prompt_en": "a woman is praying in front of a buddhist temple",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman is praying in front of a buddhist temple.jpg"
- },
- {
- "prompt_en": "a woman with green hair smiling for the camera",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "single-human",
- "image_name": "a woman with green hair smiling for the camera.jpg"
- },
- {
- "prompt_en": "A group of people in a yellow raft is rowing through turbulent waters",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "A group of people in a yellow raft is rowing through turbulent waters.jpg"
- },
- {
- "prompt_en": "a man carrying a woman on his back in a field",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a man carrying a woman on his back in a field.jpg"
- },
- {
- "prompt_en": "an indian police officer talking to an old woman",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "an indian police officer talking to an old woman.jpg"
- },
- {
- "prompt_en": "two people scuba diving in the ocean",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "two people scuba diving in the ocean.jpg"
- },
- {
- "prompt_en": "A man and woman dressed as sugar skulls in a field of flowers, sharing a loving gaze with each other",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "A man and woman dressed as sugar skulls in a field of flowers, sharing a loving gaze with each other.jpg"
- },
- {
- "prompt_en": "a group of people watching a cow race",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a group of people watching a cow race.jpg"
- },
- {
- "prompt_en": "a man and a child riding bumper cars in an amusement park",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a man and a child riding bumper cars in an amusement park.jpg"
- },
- {
- "prompt_en": "a group of motorcyclists racing on a dirt track",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a group of motorcyclists racing on a dirt track.jpg"
- },
- {
- "prompt_en": "a man and a woman are boxing in a boxing ring",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a man and a woman are boxing in a boxing ring.jpg"
- },
- {
- "prompt_en": "a man holding a baby in his arms",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a man holding a baby in his arms.jpg"
- },
- {
- "prompt_en": "a man and a woman sitting on a bench playing instruments",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a man and a woman sitting on a bench playing instruments.jpg"
- },
- {
- "prompt_en": "two men are standing next to each other with a bicycle",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "two men are standing next to each other with a bicycle.jpg"
- },
- {
- "prompt_en": "a man and a boy sitting on a beach near the ocean",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a man and a boy sitting on a beach near the ocean.jpg"
- },
- {
- "prompt_en": "two men in white clothing standing next to each other",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "two men in white clothing standing next to each other.jpg"
- },
- {
- "prompt_en": "a group of men riding horses in a dusty arena",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a group of men riding horses in a dusty arena.jpg"
- },
- {
- "prompt_en": "a soccer player in a yellow and black shirt is chasing a soccer ball",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a soccer player in a yellow and black shirt is chasing a soccer ball.jpg"
- },
- {
- "prompt_en": "a group of women sitting on the steps of a building",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a group of women sitting on the steps of a building.jpg"
- },
- {
- "prompt_en": "a group of people gathered around a red checkered blanket",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a group of people gathered around a red checkered blanket.jpg"
- },
- {
- "prompt_en": "a group of people in orange jumpsuits running along a river",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a group of people in orange jumpsuits running along a river.jpg"
- },
- {
- "prompt_en": "a woman walking down a sidewalk with a bag",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a woman walking down a sidewalk with a bag.jpg"
- },
- {
- "prompt_en": "a busy street with cars and people on motorcycles",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a busy street with cars and people on motorcycles.jpg"
- },
- {
- "prompt_en": "a man in a mask is walking through a crowd of people",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a man in a mask is walking through a crowd of people.jpg"
- },
- {
- "prompt_en": "a man and a woman walking under an umbrella next to a brick wall",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a man and a woman walking under an umbrella next to a brick wall.jpg"
- },
- {
- "prompt_en": "a group of people riding bikes down a street",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "a group of people riding bikes down a street.jpg"
- },
- {
- "prompt_en": "An old person is holding a cup on the street, and people around are curiously looking at him",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "An old person is holding a cup on the street, and people around are curiously looking at him.jpg"
- },
- {
- "prompt_en": "two young girls playing with leaves in the woods",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "two young girls playing with leaves in the woods.jpg"
- },
- {
- "prompt_en": "One person is riding on the back of a horse led by another person",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "One person is riding on the back of a horse led by another person.jpg"
- },
- {
- "prompt_en": "an older woman and a young girl are knitting together",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "an older woman and a young girl are knitting together.jpg"
- },
- {
- "prompt_en": "three geishas walking down the street in traditional clothing",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "three geishas walking down the street in traditional clothing.jpg"
- },
- {
- "prompt_en": "two men riding bikes down a road near a forest",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "two men riding bikes down a road near a forest.jpg"
- },
- {
- "prompt_en": "two women carrying bowls on their heads",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "two women carrying bowls on their heads.jpg"
- },
- {
- "prompt_en": "two women eating pizza at a restaurant",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "two women eating pizza at a restaurant.jpg"
- },
- {
- "prompt_en": "two young women studying in a library",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "multiple-human",
- "image_name": "two young women studying in a library.jpg"
- },
- {
- "prompt_en": "pink water lilies in a pond with leaves",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "pink water lilies in a pond with leaves.jpg"
- },
- {
- "prompt_en": "a group of succulents in a rock garden",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a group of succulents in a rock garden.jpg"
- },
- {
- "prompt_en": "a close up view of a bunch of snowdrop flowers",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a close up view of a bunch of snowdrop flowers.jpg"
- },
- {
- "prompt_en": "a close up of leaves with water droplets on them",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a close up of leaves with water droplets on them.jpg"
- },
- {
- "prompt_en": "a close-up of a sea anemone in the water",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a close-up of a sea anemone in the water.jpg"
- },
- {
- "prompt_en": "a plant with water droplets on it",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a plant with water droplets on it.jpg"
- },
- {
- "prompt_en": "a group of cactus plants in the desert",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a group of cactus plants in the desert.jpg"
- },
- {
- "prompt_en": "a close-up view of a plant with spiky leaves",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a close-up view of a plant with spiky leaves.jpg"
- },
- {
- "prompt_en": "A budding and blossoming flower bud seedling",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "A budding and blossoming flower bud seedling.jpg"
- },
- {
- "prompt_en": "a field of orange flowers near the ocean'",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a field of orange flowers near the ocean'.jpg"
- },
- {
- "prompt_en": "a close-up view of a bunch of pink flowers",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a close-up view of a bunch of pink flowers.jpg"
- },
- {
- "prompt_en": "pink water lilies in a pond",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "pink water lilies in a pond.jpg"
- },
- {
- "prompt_en": "reeds blowing in the wind against a cloudy sky",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "reeds blowing in the wind against a cloudy sky.jpg"
- },
- {
- "prompt_en": "two tall cacti in the middle of the desert",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "two tall cacti in the middle of the desert.jpg"
- },
- {
- "prompt_en": "a sea anemone on a coral reef",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a sea anemone on a coral reef.jpg"
- },
- {
- "prompt_en": "a dandelion blowing in the wind",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "plant",
- "image_name": "a dandelion blowing in the wind.jpg"
- },
- {
- "prompt_en": "A boiling pot cooking vegetables",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "A boiling pot cooking vegetables.jpg"
- },
- {
- "prompt_en": "a woman stirring food in a pan on the stove",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a woman stirring food in a pan on the stove.jpg"
- },
- {
- "prompt_en": "two eggs are fried in a frying pan on the stove",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "two eggs are fried in a frying pan on the stove.jpg"
- },
- {
- "prompt_en": "fried onion rings in a basket",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "fried onion rings in a basket.jpg"
- },
- {
- "prompt_en": "a pot is sitting on top of a campfire",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a pot is sitting on top of a campfire.jpg"
- },
- {
- "prompt_en": "a chef is preparing a dish with mushrooms on a wooden board",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a chef is preparing a dish with mushrooms on a wooden board.jpg"
- },
- {
- "prompt_en": "a hand holding a slice of pizza",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a hand holding a slice of pizza.jpg"
- },
- {
- "prompt_en": "A person is using tongs to pick up meat from a plate",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "A person is using tongs to pick up meat from a plate.jpg"
- },
- {
- "prompt_en": "The meat is picked up from the grill with tongs",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "The meat is picked up from the grill with tongs.jpg"
- },
- {
- "prompt_en": "A person is whisking eggs, and the egg whites and yolks are gently streaming out",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "A person is whisking eggs, and the egg whites and yolks are gently streaming out.jpg"
- },
- {
- "prompt_en": "a person is putting sauce on a burger",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a person is putting sauce on a burger.jpg"
- },
- {
- "prompt_en": "A person is making dumplings",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "A person is making dumplings.jpg"
- },
- {
- "prompt_en": "a pan filled with fried food",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a pan filled with fried food.jpg"
- },
- {
- "prompt_en": "Chopsticks are slowly picking up the buns from the plastic container",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "Chopsticks are slowly picking up the buns from the plastic container.jpg"
- },
- {
- "prompt_en": "a basket of french fries in a fryer",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a basket of french fries in a fryer.jpg"
- },
- {
- "prompt_en": "a table with lobsters and drinks on it",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a table with lobsters and drinks on it.jpg"
- },
- {
- "prompt_en": "a person pouring coffee into a pot on a stove",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a person pouring coffee into a pot on a stove.jpg"
- },
- {
- "prompt_en": "a kettle is sitting on top of a campfire",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a kettle is sitting on top of a campfire.jpg"
- },
- {
- "prompt_en": "Chopsticks are picking up noodles from the bowl",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "Chopsticks are picking up noodles from the bowl.jpg"
- },
- {
- "prompt_en": "a person is cooking eggs on an outdoor grill",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a person is cooking eggs on an outdoor grill.jpg"
- },
- {
- "prompt_en": "a person is cooking food in a wok on a stove",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a person is cooking food in a wok on a stove.jpg"
- },
- {
- "prompt_en": "a person is holding up a burger with his hands",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a person is holding up a burger with his hands.jpg"
- },
- {
- "prompt_en": "A person is pouring water into a teacup",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "A person is pouring water into a teacup.jpg"
- },
- {
- "prompt_en": "a person pouring seasoning into a pot of food",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a person pouring seasoning into a pot of food.jpg"
- },
- {
- "prompt_en": "a person holding a taco in their hand",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a person holding a taco in their hand.jpg"
- },
- {
- "prompt_en": "a person slicing salmon on a cutting board",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a person slicing salmon on a cutting board.jpg"
- },
- {
- "prompt_en": "a bunch of food is cooking on a grill over an open fire",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a bunch of food is cooking on a grill over an open fire.jpg"
- },
- {
- "prompt_en": "a close up of a piece of sushi on chopsticks",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a close up of a piece of sushi on chopsticks.jpg"
- },
- {
- "prompt_en": "a group of pots on a stove with flames in the background",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a group of pots on a stove with flames in the background.jpg"
- },
- {
- "prompt_en": "a person cooking vegetables in a pan on a stove",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a person cooking vegetables in a pan on a stove.jpg"
- },
- {
- "prompt_en": "a large pot of soup filled with vegetables and meat",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a large pot of soup filled with vegetables and meat.jpg"
- },
- {
- "prompt_en": "a person holding chopsticks over a bowl of food",
- "dimension": [
- "i2v_subject"
- ],
- "image_type": "food",
- "image_name": "a person holding chopsticks over a bowl of food.jpg"
- }
-]
diff --git a/PyTorch/built-in/mm/OpenSora1.1/eval/vbench_i2v/vbench_i2v.py b/PyTorch/built-in/mm/OpenSora1.1/eval/vbench_i2v/vbench_i2v.py
deleted file mode 100644
index b021e430964c0815712088ac352770027d93507b..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/eval/vbench_i2v/vbench_i2v.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from vbench2_beta_i2v import VBenchI2V
-
-VIDEO_PATH = ""
-DIMENSIONS = ["i2v_subject", "i2v_background", "camera_motion"]
-
-my_VBench = VBenchI2V("cuda", "vbench2_beta_i2v/vbench2_i2v_full_info.json", "evaluation_results")
-my_VBench.evaluate(videos_path=VIDEO_PATH, name="vbench_i2v", dimension_list=DIMENSIONS, resolution="1-1")
diff --git a/PyTorch/built-in/mm/OpenSora1.1/gradio/README.md b/PyTorch/built-in/mm/OpenSora1.1/gradio/README.md
deleted file mode 100644
index e671f86d263f96c8c6c3d03b17c6510b3bfa6b07..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/gradio/README.md
+++ /dev/null
@@ -1,63 +0,0 @@
-# 🕹 Gradio Demo
-
-We have provided a Gradio demo app for you to generate videos via a web interface. You can choose to run it locally or deploy it to Hugging Face by following the instructions given below.
-
-## 🚀 Run Gradio Locally
-
-We assume that you have already installed `opensora` based on the instructions given in the [main README](../README.md). Follow the steps below to run this app on your local machine.
-
-1. First, install `gradio` and `spaces`.
-
-```bash
-pip install gradio spaces
-```
-
-2. Afterwards, you can use the following commands to launch different models. Remember to run them from the project root directory rather than from the `gradio` folder.
-
-```bash
-# run the default model (v1.1-stage3)
-python gradio/app.py
-
-# run another model checkpoint
-python gradio/app.py --model-type v1.1-stage2
-
-# run with a different host and port
-python gradio/app.py --port 8000 --host 0.0.0.0
-
-# run with acceleration such as flash attention and fused norm
-python gradio/app.py --enable-optimization
-
-# run with a sharable Gradio link
-python gradio/app.py --share
-```
-
-3. You should then be able to access this demo via the link which appears in your terminal.
-
-
-## 📦 Deploy Gradio to Hugging Face Space
-
-We have also tested this Gradio app on Hugging Face Spaces. You can follow the steps below.
-
-1. Create a Space on Hugging Face. Remember to choose the `Gradio SDK` and GPU Space hardware.
-
-2. Clone the Space repository to your local machine.
-
-3. Copy the `configs` folder, `gradio/app.py`, and `gradio/requirements.txt` into the repository you just cloned. The file structure should look like this:
-
-```text
-- configs
- - opensora
- - inference
- - 16x256x256.py
- - 16x512x512.py
- - 64x512x512.py
- ...
- ...
-- app.py
-- requirements.txt
-- README.md
-- LICENSE
-- ...
-```
-
-4. Push the files to your remote Hugging Face Spaces repository. The application will be built and run automatically.
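-
-As a minimal sketch of this final step (the exact commit message is only illustrative):
-
-```bash
-git add configs app.py requirements.txt
-git commit -m "Add Open-Sora Gradio demo"
-git push
-```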
diff --git a/PyTorch/built-in/mm/OpenSora1.1/gradio/app.py b/PyTorch/built-in/mm/OpenSora1.1/gradio/app.py
deleted file mode 100644
index 181c059dd7037cd45f647609198a9c7f2ab3a897..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/gradio/app.py
+++ /dev/null
@@ -1,598 +0,0 @@
-#!/usr/bin/env python
-"""
-This script runs a Gradio App for the Open-Sora model.
-
-Usage:
-    python gradio/app.py
-"""
-
-import argparse
-import importlib
-import os
-import subprocess
-import sys
-import re
-import json
-import math
-
-import spaces
-import torch
-
-import gradio as gr
-from tempfile import NamedTemporaryFile
-import datetime
-
-
-
-MODEL_TYPES = ["v1.1-stage2", "v1.1-stage3"]
-CONFIG_MAP = {
- "v1.1-stage2": "configs/opensora-v1-1/inference/sample-ref.py",
- "v1.1-stage3": "configs/opensora-v1-1/inference/sample-ref.py",
-}
-HF_STDIT_MAP = {
- "v1.1-stage2": "hpcai-tech/OpenSora-STDiT-v2-stage2",
- "v1.1-stage3": "hpcai-tech/OpenSora-STDiT-v2-stage3",
-}
-RESOLUTION_MAP = {
- "144p": {
- "16:9": (256, 144),
- "9:16": (144, 256),
- "4:3": (221, 165),
- "3:4": (165, 221),
- "1:1": (192, 192),
- },
- "240p": {
- "16:9": (426, 240),
- "9:16": (240, 426),
- "4:3": (370, 278),
- "3:4": (278, 370),
- "1:1": (320, 320),
- },
- "360p": {
- "16:9": (640, 360),
- "9:16": (360, 640),
- "4:3": (554, 416),
- "3:4": (416, 554),
- "1:1": (480, 480),
- },
- "480p": {
- "16:9": (854, 480),
- "9:16": (480, 854),
- "4:3": (740, 555),
- "3:4": (555, 740),
- "1:1": (640, 640),
- },
- "720p": {
- "16:9": (1280, 720),
- "9:16": (720, 1280),
- "4:3": (1108, 832),
- "3:4": (832, 1110),
- "1:1": (960, 960),
- },
-}
-
-
-# ============================
-# Utils
-# ============================
-def collect_references_batch(reference_paths, vae, image_size):
- from opensora.datasets.utils import read_from_path
-
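-    # each entry in reference_paths may hold several paths separated by ";"; every
-    # reference is resize-cropped to image_size and VAE-encoded into a [C, T, H, W] latent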
- refs_x = []
- for reference_path in reference_paths:
- if reference_path is None:
- refs_x.append([])
- continue
- ref_path = reference_path.split(";")
- ref = []
- for r_path in ref_path:
- r = read_from_path(r_path, image_size, transform_name="resize_crop")
- r_x = vae.encode(r.unsqueeze(0).to(vae.device, vae.dtype))
- r_x = r_x.squeeze(0)
- ref.append(r_x)
- refs_x.append(ref)
- # refs_x: [batch, ref_num, C, T, H, W]
- return refs_x
-
-
-def process_mask_strategy(mask_strategy):
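-    # each ";"-separated entry is "loop_id,ref_id,ref_start,target_start,length,edit_ratio";
-    # omitted trailing fields default to 0, except length, which defaults to 1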
- mask_batch = []
- mask_strategy = mask_strategy.split(";")
- for mask in mask_strategy:
- mask_group = mask.split(",")
- assert len(mask_group) >= 1 and len(mask_group) <= 6, f"Invalid mask strategy: {mask}"
- if len(mask_group) == 1:
- mask_group.extend(["0", "0", "0", "1", "0"])
- elif len(mask_group) == 2:
- mask_group.extend(["0", "0", "1", "0"])
- elif len(mask_group) == 3:
- mask_group.extend(["0", "1", "0"])
- elif len(mask_group) == 4:
- mask_group.extend(["1", "0"])
- elif len(mask_group) == 5:
- mask_group.append("0")
- mask_batch.append(mask_group)
- return mask_batch
-
-
-def apply_mask_strategy(z, refs_x, mask_strategys, loop_i):
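-    # copy the selected reference latent frames into z and record each frame's
-    # edit_ratio in the mask; frames that are not conditioned keep a mask value of 1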
- masks = []
- for i, mask_strategy in enumerate(mask_strategys):
- mask = torch.ones(z.shape[2], dtype=torch.float, device=z.device)
- if mask_strategy is None:
- masks.append(mask)
- continue
- mask_strategy = process_mask_strategy(mask_strategy)
- for mst in mask_strategy:
- loop_id, m_id, m_ref_start, m_target_start, m_length, edit_ratio = mst
- loop_id = int(loop_id)
- if loop_id != loop_i:
- continue
- m_id = int(m_id)
- m_ref_start = int(m_ref_start)
- m_length = int(m_length)
- m_target_start = int(m_target_start)
- edit_ratio = float(edit_ratio)
- ref = refs_x[i][m_id] # [C, T, H, W]
- if m_ref_start < 0:
- m_ref_start = ref.shape[1] + m_ref_start
- if m_target_start < 0:
- # z: [B, C, T, H, W]
- m_target_start = z.shape[2] + m_target_start
- z[i, :, m_target_start : m_target_start + m_length] = ref[:, m_ref_start : m_ref_start + m_length]
- mask[m_target_start : m_target_start + m_length] = edit_ratio
- masks.append(mask)
- masks = torch.stack(masks)
- return masks
-
-
-def process_prompts(prompts, num_loop):
- from opensora.models.text_encoder.t5 import text_preprocessing
-
- ret_prompts = []
- for prompt in prompts:
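-        # "|0|first text|2|second text" assigns "first text" to loops [0, 2) and
-        # "second text" to the remaining loops; a plain prompt is repeated for every loop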
- if prompt.startswith("|0|"):
- prompt_list = prompt.split("|")[1:]
- text_list = []
- for i in range(0, len(prompt_list), 2):
- start_loop = int(prompt_list[i])
- text = prompt_list[i + 1]
- text = text_preprocessing(text)
- end_loop = int(prompt_list[i + 2]) if i + 2 < len(prompt_list) else num_loop
- text_list.extend([text] * (end_loop - start_loop))
- assert len(text_list) == num_loop, f"Prompt loop mismatch: {len(text_list)} != {num_loop}"
- ret_prompts.append(text_list)
- else:
- prompt = text_preprocessing(prompt)
- ret_prompts.append([prompt] * num_loop)
- return ret_prompts
-
-
-def extract_json_from_prompts(prompts):
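-    # a prompt may end with a JSON object (or array) of extra options; split it off and
-    # return the plain text together with the parsed info (empty dict if no JSON is present)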
- additional_infos = []
- ret_prompts = []
- for prompt in prompts:
- parts = re.split(r"(?=[{\[])", prompt)
- assert len(parts) <= 2, f"Invalid prompt: {prompt}"
- ret_prompts.append(parts[0])
- if len(parts) == 1:
- additional_infos.append({})
- else:
- additional_infos.append(json.loads(parts[1]))
- return ret_prompts, additional_infos
-
-
-# ============================
-# Runtime Environment
-# ============================
-def install_dependencies(enable_optimization=False):
- """
- Install the required dependencies for the demo if they are not already installed.
- """
-
- def _is_package_available(name) -> bool:
- try:
- importlib.import_module(name)
- return True
- except (ImportError, ModuleNotFoundError):
- return False
-
-    # flash attention is needed whether or not optimization is enabled,
-    # because Hugging Face transformers detects flash_attn as a dependency of STDiT;
-    # thus, we need to install it in either case
- if not _is_package_available("flash_attn"):
- subprocess.run(
- f"{sys.executable} -m pip install flash-attn --no-build-isolation",
- env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
- shell=True,
- )
-
- if enable_optimization:
- # install apex for fused layernorm
- if not _is_package_available("apex"):
- subprocess.run(
- f'{sys.executable} -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git',
- shell=True,
- )
-
- # install ninja
- if not _is_package_available("ninja"):
- subprocess.run(f"{sys.executable} -m pip install ninja", shell=True)
-
- # install xformers
- if not _is_package_available("xformers"):
- subprocess.run(
- f"{sys.executable} -m pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers",
- shell=True,
- )
-
-
-# ============================
-# Model-related
-# ============================
-def read_config(config_path):
- """
- Read the configuration file.
- """
- from mmengine.config import Config
-
- return Config.fromfile(config_path)
-
-
-def build_models(model_type, config, enable_optimization=False):
- """
- Build the models for the given model type and configuration.
- """
- # build vae
- from opensora.registry import MODELS, build_module
-
- vae = build_module(config.vae, MODELS).cuda()
-
- # build text encoder
- text_encoder = build_module(config.text_encoder, MODELS) # T5 must be fp32
- text_encoder.t5.model = text_encoder.t5.model.cuda()
-
- # build stdit
- # we load model from HuggingFace directly so that we don't need to
- # handle model download logic in HuggingFace Space
- from transformers import AutoModel
-
- stdit = AutoModel.from_pretrained(
- HF_STDIT_MAP[model_type],
- enable_flash_attn=enable_optimization,
- trust_remote_code=True,
- ).cuda()
-
- # build scheduler
- from opensora.registry import SCHEDULERS
-
- scheduler = build_module(config.scheduler, SCHEDULERS)
-
- # hack for classifier-free guidance
- text_encoder.y_embedder = stdit.y_embedder
-
-    # move models to device
- vae = vae.to(torch.bfloat16).eval()
- text_encoder.t5.model = text_encoder.t5.model.eval() # t5 must be in fp32
- stdit = stdit.to(torch.bfloat16).eval()
-
- # clear cuda
- torch.cuda.empty_cache()
- return vae, text_encoder, stdit, scheduler
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument(
- "--model-type",
- default="v1.1-stage3",
- choices=MODEL_TYPES,
- help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}",
- )
- parser.add_argument("--output", default="./outputs", type=str, help="The path to the output folder")
- parser.add_argument("--port", default=None, type=int, help="The port to run the Gradio App on.")
- parser.add_argument("--host", default=None, type=str, help="The host to run the Gradio App on.")
- parser.add_argument("--share", action="store_true", help="Whether to share this gradio demo.")
- parser.add_argument(
- "--enable-optimization",
- action="store_true",
- help="Whether to enable optimization such as flash attention and fused layernorm",
- )
- return parser.parse_args()
-
-
-# ============================
-# Main Gradio Script
-# ============================
-# as `run_inference` needs to be wrapped by `spaces.GPU` and the input can only be the prompt text
-# so we can't pass the models to `run_inference` as arguments.
-# instead, we need to define them globally so that we can access these models inside `run_inference`
-
-# read config
-args = parse_args()
-config = read_config(CONFIG_MAP[args.model_type])
-
-# make outputs dir
-os.makedirs(args.output, exist_ok=True)
-
-# disable torch jit as it can cause failure in gradio SDK
-# gradio sdk uses torch with cuda 11.3
-torch.jit._state.disable()
-
-# set up
-install_dependencies(enable_optimization=args.enable_optimization)
-
-# import after installation
-from opensora.datasets import IMG_FPS, save_sample
-from opensora.utils.misc import to_torch_dtype
-
-# some global variables
-dtype = to_torch_dtype(config.dtype)
-device = torch.device("cuda")
-
-# build model
-vae, text_encoder, stdit, scheduler = build_models(args.model_type, config, enable_optimization=args.enable_optimization)
-
-
-def run_inference(mode, prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale):
- torch.manual_seed(seed)
- with torch.inference_mode():
- # ======================
- # 1. Preparation
- # ======================
- # parse the inputs
- resolution = RESOLUTION_MAP[resolution][aspect_ratio]
-
- # gather args from config
- num_frames = config.num_frames
- frame_interval = config.frame_interval
- fps = config.fps
- condition_frame_length = config.condition_frame_length
-
- # compute number of loops
- if mode == "Text2Image":
- num_frames = 1
- num_loop = 1
- else:
- num_seconds = int(length.rstrip('s'))
- if num_seconds <= 16:
- num_frames = num_seconds * fps // frame_interval
- num_loop = 1
- else:
- config.num_frames = 16
- total_number_of_frames = num_seconds * fps / frame_interval
- num_loop = math.ceil((total_number_of_frames - condition_frame_length) / (num_frames - condition_frame_length))
-
- # prepare model args
- if config.num_frames == 1:
- fps = IMG_FPS
-
- model_args = dict()
- height_tensor = torch.tensor([resolution[0]], device=device, dtype=dtype)
- width_tensor = torch.tensor([resolution[1]], device=device, dtype=dtype)
- num_frames_tensor = torch.tensor([num_frames], device=device, dtype=dtype)
- ar_tensor = torch.tensor([resolution[0] / resolution[1]], device=device, dtype=dtype)
- fps_tensor = torch.tensor([fps], device=device, dtype=dtype)
- model_args["height"] = height_tensor
- model_args["width"] = width_tensor
- model_args["num_frames"] = num_frames_tensor
- model_args["ar"] = ar_tensor
- model_args["fps"] = fps_tensor
-
- # compute latent size
- input_size = (num_frames, *resolution)
- latent_size = vae.get_latent_size(input_size)
-
- # process prompt
- prompt_raw = [prompt_text]
- prompt_raw, _ = extract_json_from_prompts(prompt_raw)
- prompt_loops = process_prompts(prompt_raw, num_loop)
- video_clips = []
-
- # prepare mask strategy
- if mode == "Text2Image":
- mask_strategy = [None]
- elif mode == "Text2Video":
- if reference_image is not None:
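-                # "0" conditions loop 0 on reference 0: with the default fields, the
-                # reference image latent is inserted as the first frame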
- mask_strategy = ['0']
- else:
- mask_strategy = [None]
- else:
- raise ValueError(f"Invalid mode: {mode}")
-
- # =========================
- # 2. Load reference images
- # =========================
- if mode == "Text2Image":
- refs_x = collect_references_batch([None], vae, resolution)
- elif mode == "Text2Video":
- if reference_image is not None:
- # save image to disk
- from PIL import Image
- im = Image.fromarray(reference_image)
- idx = os.environ['CUDA_VISIBLE_DEVICES']
-
- with NamedTemporaryFile(suffix=".jpg") as temp_file:
- im.save(temp_file.name)
- refs_x = collect_references_batch([temp_file.name], vae, resolution)
- else:
- refs_x = collect_references_batch([None], vae, resolution)
- else:
- raise ValueError(f"Invalid mode: {mode}")
-
- # 4.3. long video generation
- for loop_i in range(num_loop):
- # 4.4 sample in hidden space
- batch_prompts = [prompt[loop_i] for prompt in prompt_loops]
- z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
-
- # 4.5. apply mask strategy
- masks = None
-
- # if cfg.reference_path is not None:
- if loop_i > 0:
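-                # condition this loop on the previous clip: re-encode it and reference its
-                # last condition_frame_length latent frames at the start of the new sample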
- ref_x = vae.encode(video_clips[-1])
- for j, refs in enumerate(refs_x):
- if refs is None:
- refs_x[j] = [ref_x[j]]
- else:
- refs.append(ref_x[j])
- if mask_strategy[j] is None:
- mask_strategy[j] = ""
- else:
- mask_strategy[j] += ";"
-                    mask_strategy[j] += f"{loop_i},{len(refs)-1},-{condition_frame_length},0,{condition_frame_length}"
-
- masks = apply_mask_strategy(z, refs_x, mask_strategy, loop_i)
-
- # 4.6. diffusion sampling
- # hack to update num_sampling_steps and cfg_scale
- scheduler_kwargs = config.scheduler.copy()
- scheduler_kwargs.pop('type')
- scheduler_kwargs['num_sampling_steps'] = sampling_steps
- scheduler_kwargs['cfg_scale'] = cfg_scale
-
- scheduler.__init__(
- **scheduler_kwargs
- )
- samples = scheduler.sample(
- stdit,
- text_encoder,
- z=z,
- prompts=batch_prompts,
- device=device,
- additional_args=model_args,
- mask=masks, # scheduler must support mask
- )
- samples = vae.decode(samples.to(dtype))
- video_clips.append(samples)
-
- # 4.7. save video
- if loop_i == num_loop - 1:
-                video_clips_list = [video_clips[0][0]] + [
-                    video_clips[i][0][:, config.condition_frame_length:] for i in range(1, num_loop)
-                ]
- video = torch.cat(video_clips_list, dim=1)
- current_datetime = datetime.datetime.now()
- timestamp = current_datetime.timestamp()
- save_path = os.path.join(args.output, f"output_{timestamp}")
- saved_path = save_sample(video, save_path=save_path, fps=config.fps // config.frame_interval)
- return saved_path
-
-@spaces.GPU(duration=200)
-def run_image_inference(prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale):
- return run_inference("Text2Image", prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale)
-
-@spaces.GPU(duration=200)
-def run_video_inference(prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale):
- return run_inference("Text2Video", prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale)
-
-
-def main():
- # create demo
- with gr.Blocks() as demo:
- with gr.Row():
- with gr.Column():
- gr.HTML(
- """
-                    <div style="text-align: center;">
-                        <h1>Open-Sora: Democratizing Efficient Video Production for All</h1>
-                    </div>
- """
- )
-
- with gr.Row():
- with gr.Column():
- prompt_text = gr.Textbox(
- label="Prompt",
- placeholder="Describe your video here",
- lines=4,
- )
- resolution = gr.Radio(
- choices=["144p", "240p", "360p", "480p", "720p"],
- value="240p",
- label="Resolution",
- )
- aspect_ratio = gr.Radio(
- choices=["9:16", "16:9", "3:4", "4:3", "1:1"],
- value="9:16",
- label="Aspect Ratio (H:W)",
- )
- length = gr.Radio(
- choices=["2s", "4s", "8s", "16s"],
- value="2s",
- label="Video Length (only effective for video generation)",
- info="8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time."
- )
-
- with gr.Row():
- seed = gr.Slider(
- value=1024,
- minimum=1,
- maximum=2048,
- step=1,
- label="Seed"
- )
-
- sampling_steps = gr.Slider(
- value=100,
- minimum=1,
- maximum=200,
- step=1,
- label="Sampling steps"
- )
- cfg_scale = gr.Slider(
- value=7.0,
- minimum=0.0,
- maximum=10.0,
- step=0.1,
- label="CFG Scale"
- )
-
- reference_image = gr.Image(
- label="Reference Image (Optional)",
- )
-
- with gr.Column():
- output_video = gr.Video(
- label="Output Video",
- height="100%"
- )
-
- with gr.Row():
- image_gen_button = gr.Button("Generate image")
- video_gen_button = gr.Button("Generate video")
-
-
- image_gen_button.click(
- fn=run_image_inference,
- inputs=[prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale],
- outputs=reference_image
- )
- video_gen_button.click(
- fn=run_video_inference,
- inputs=[prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale],
- outputs=output_video
- )
-
- # launch
- demo.launch(server_port=args.port, server_name=args.host, share=args.share)
-
-
-if __name__ == "__main__":
- main()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/gradio/requirements.txt b/PyTorch/built-in/mm/OpenSora1.1/gradio/requirements.txt
deleted file mode 100644
index f0c5b943c429bd23f409bf46f32f3363f1df0001..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/gradio/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-xformers
-transformers
-git+https://github.com/hpcaitech/Open-Sora.git#egg=opensora
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/__init__.py
deleted file mode 100644
index a3175b2df160a6b1215dc75eb1cc1a91bfc51ae0..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .acceleration import *
-from .datasets import *
-from .models import *
-from .registry import *
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/checkpoint.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/checkpoint.py
deleted file mode 100644
index d832a0105ac278982feee34109bc585b4bf4d9d0..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/checkpoint.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from collections.abc import Iterable
-
-import torch.nn as nn
-from torch.utils.checkpoint import checkpoint, checkpoint_sequential
-
-
-def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1):
- assert isinstance(model, nn.Module)
-
- def set_attr(module):
- module.grad_checkpointing = True
- module.fp32_attention = use_fp32_attention
- module.grad_checkpointing_step = gc_step
-
- model.apply(set_attr)
-
-
-def auto_grad_checkpoint(module, *args, **kwargs):
- if getattr(module, "grad_checkpointing", False):
- if not isinstance(module, Iterable):
- return checkpoint(module, *args, **kwargs)
- gc_step = module[0].grad_checkpointing_step
- return checkpoint_sequential(module, gc_step, *args, **kwargs)
- return module(*args, **kwargs)
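
A minimal sketch of how the two helpers above are intended to be combined; TinyBlock and the tensor shapes are illustrative stand-ins, not part of the deleted file.

import torch
import torch.nn as nn

class TinyBlock(nn.Module):
    def __init__(self, dim=64):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):
        return torch.relu(self.proj(x))

block = TinyBlock()
set_grad_checkpoint(block)          # flags every submodule with grad_checkpointing=True
x = torch.randn(2, 64, requires_grad=True)
y = auto_grad_checkpoint(block, x)  # recomputes activations during backward instead of storing them
y.sum().backward()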
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/communications.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/communications.py
deleted file mode 100644
index d0900d20841248a250b5aeb31755fac689474ff8..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/communications.py
+++ /dev/null
@@ -1,188 +0,0 @@
-import torch
-import torch.distributed as dist
-
-
-# ====================
-# All-To-All
-# ====================
-def _all_to_all(
- input_: torch.Tensor,
- world_size: int,
- group: dist.ProcessGroup,
- scatter_dim: int,
- gather_dim: int,
-):
- input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)]
- output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
- dist.all_to_all(output_list, input_list, group=group)
- return torch.cat(output_list, dim=gather_dim).contiguous()
-
-
-class _AllToAll(torch.autograd.Function):
- """All-to-all communication.
-
- Args:
- input_: input matrix
- process_group: communication group
- scatter_dim: scatter dimension
- gather_dim: gather dimension
- """
-
- @staticmethod
- def forward(ctx, input_, process_group, scatter_dim, gather_dim):
- ctx.process_group = process_group
- ctx.scatter_dim = scatter_dim
- ctx.gather_dim = gather_dim
- ctx.world_size = dist.get_world_size(process_group)
- output = _all_to_all(input_, ctx.world_size, process_group, scatter_dim, gather_dim)
- return output
-
- @staticmethod
- def backward(ctx, grad_output):
- grad_output = _all_to_all(
- grad_output,
- ctx.world_size,
- ctx.process_group,
- ctx.gather_dim,
- ctx.scatter_dim,
- )
- return (
- grad_output,
- None,
- None,
- None,
- )
-
-
-def all_to_all(
- input_: torch.Tensor,
- process_group: dist.ProcessGroup,
- scatter_dim: int = 2,
- gather_dim: int = 1,
-):
- return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim)
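
The scatter/gather dimensions are what make this usable for sequence parallelism; a shape sketch (assuming an initialized process group `sp_group` of size `sp`) rather than a runnable example:

# input_:  [B, S/sp, H,    D]   each rank holds a slice of the sequence, all heads
# output:  [B, S,    H/sp, D]   after all_to_all(scatter_dim=2, gather_dim=1)
# so attention sees the full sequence while each rank computes only a subset of heads.
#
# x = torch.randn(B, S // sp, H, D, device="cuda")
# y = all_to_all(x, sp_group, scatter_dim=2, gather_dim=1)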
-
-
-def _gather(
- input_: torch.Tensor,
- world_size: int,
- group: dist.ProcessGroup,
- gather_dim: int,
-):
- # all-gather the tensor from every rank, then concatenate along gather_dim
- gather_list = [torch.empty_like(input_) for _ in range(world_size)]
- dist.all_gather(gather_list, input_.contiguous(), group=group)
- return torch.cat(gather_list, dim=gather_dim).contiguous()
-
-
-# ====================
-# Gather-Split
-# ====================
-
-
-def _split(input_, pg: dist.ProcessGroup, dim=-1):
- # skip if only one rank involved
- world_size = dist.get_world_size(pg)
- rank = dist.get_rank(pg)
- if world_size == 1:
- return input_
-
- # Split along last dimension.
- dim_size = input_.size(dim)
- assert dim_size % world_size == 0, (
- f"The dimension to split ({dim_size}) is not a multiple of world size ({world_size}), "
- f"cannot split tensor evenly"
- )
-
- tensor_list = torch.split(input_, dim_size // world_size, dim=dim)
- output = tensor_list[rank].contiguous()
-
- return output
-
-
-def _gather(input_, pg: dist.ProcessGroup, dim=-1):
- # skip if only one rank involved
- input_ = input_.contiguous()
- world_size = dist.get_world_size(pg)
- dist.get_rank(pg)
-
- if world_size == 1:
- return input_
-
- # all gather
- tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
- assert input_.device.type == "cuda"
- torch.distributed.all_gather(tensor_list, input_, group=pg)
-
- # concat
- output = torch.cat(tensor_list, dim=dim).contiguous()
-
- return output
-
-
-class _GatherForwardSplitBackward(torch.autograd.Function):
- """Gather the input from model parallel region and concatenate.
-
- Args:
- input_: input matrix.
- process_group: parallel mode.
- dim: dimension
- """
-
- @staticmethod
- def symbolic(graph, input_):
- return _gather(input_)
-
- @staticmethod
- def forward(ctx, input_, process_group, dim, grad_scale):
- ctx.mode = process_group
- ctx.dim = dim
- ctx.grad_scale = grad_scale
- return _gather(input_, process_group, dim)
-
- @staticmethod
- def backward(ctx, grad_output):
- if ctx.grad_scale == "up":
- grad_output = grad_output * dist.get_world_size(ctx.mode)
- elif ctx.grad_scale == "down":
- grad_output = grad_output / dist.get_world_size(ctx.mode)
-
- return _split(grad_output, ctx.mode, ctx.dim), None, None, None
-
-
-class _SplitForwardGatherBackward(torch.autograd.Function):
- """
- Split the input and keep only the corresponding chuck to the rank.
-
- Args:
- input_: input matrix.
- process_group: parallel mode.
- dim: dimension
- """
-
- @staticmethod
- def symbolic(graph, input_):
- return _split(input_)
-
- @staticmethod
- def forward(ctx, input_, process_group, dim, grad_scale):
- ctx.mode = process_group
- ctx.dim = dim
- ctx.grad_scale = grad_scale
- return _split(input_, process_group, dim)
-
- @staticmethod
- def backward(ctx, grad_output):
- if ctx.grad_scale == "up":
- grad_output = grad_output * dist.get_world_size(ctx.mode)
- elif ctx.grad_scale == "down":
- grad_output = grad_output / dist.get_world_size(ctx.mode)
- return _gather(grad_output, ctx.mode, ctx.dim), None, None, None
-
-
-def split_forward_gather_backward(input_, process_group, dim, grad_scale=1.0):
- return _SplitForwardGatherBackward.apply(input_, process_group, dim, grad_scale)
-
-
-def gather_forward_split_backward(input_, process_group, dim, grad_scale=None):
- return _GatherForwardSplitBackward.apply(input_, process_group, dim, grad_scale)
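
A hedged sketch of the intended calling pattern for the split/gather pair inside a sequence-parallel transformer block; `sp_group` and `block` are assumptions taken from the surrounding codebase, not definitions in this file.

# x = split_forward_gather_backward(x, sp_group, dim=1, grad_scale="down")  # shard the sequence dim
# x = block(x)                                                              # rank-local computation
# x = gather_forward_split_backward(x, sp_group, dim=1, grad_scale="up")    # restore the full sequence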
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/parallel_states.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/parallel_states.py
deleted file mode 100644
index ff2893e33c86da4cb8a5170566917355af882825..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/parallel_states.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import torch.distributed as dist
-
-_GLOBAL_PARALLEL_GROUPS = dict()
-
-
-def set_data_parallel_group(group: dist.ProcessGroup):
- _GLOBAL_PARALLEL_GROUPS["data"] = group
-
-
-def get_data_parallel_group():
- return _GLOBAL_PARALLEL_GROUPS.get("data", None)
-
-
-def set_sequence_parallel_group(group: dist.ProcessGroup):
- _GLOBAL_PARALLEL_GROUPS["sequence"] = group
-
-
-def get_sequence_parallel_group():
- return _GLOBAL_PARALLEL_GROUPS.get("sequence", None)
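
A single-process smoke test of the registry above, assuming the gloo backend is available; in real training the groups would come from the plugin below.

import os
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
if not dist.is_initialized():
    dist.init_process_group("gloo", rank=0, world_size=1)

set_data_parallel_group(dist.group.WORLD)         # register once at startup
set_sequence_parallel_group(dist.group.WORLD)
assert get_sequence_parallel_group() is not None  # read anywhere in the model code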
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/plugin.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/plugin.py
deleted file mode 100644
index c657a9539d8fb1f0d65e8f452777a4bb73a84d4d..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/plugin.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import random
-from typing import Optional
-
-import numpy as np
-import torch
-from colossalai.booster.plugin import LowLevelZeroPlugin
-from colossalai.cluster import ProcessGroupMesh
-from torch.utils.data import DataLoader
-from torch.utils.data.distributed import DistributedSampler
-
-DP_AXIS, SP_AXIS = 0, 1
-
-
-class ZeroSeqParallelPlugin(LowLevelZeroPlugin):
- def __init__(
- self,
- sp_size: int = 1,
- stage: int = 2,
- precision: str = "fp16",
- initial_scale: float = 2**32,
- min_scale: float = 1,
- growth_factor: float = 2,
- backoff_factor: float = 0.5,
- growth_interval: int = 1000,
- hysteresis: int = 2,
- max_scale: float = 2**32,
- max_norm: float = 0.0,
- norm_type: float = 2.0,
- reduce_bucket_size_in_m: int = 12,
- communication_dtype: Optional[torch.dtype] = None,
- overlap_communication: bool = True,
- cpu_offload: bool = False,
- master_weights: bool = True,
- verbose: bool = False,
- ) -> None:
- super().__init__(
- stage=stage,
- precision=precision,
- initial_scale=initial_scale,
- min_scale=min_scale,
- growth_factor=growth_factor,
- backoff_factor=backoff_factor,
- growth_interval=growth_interval,
- hysteresis=hysteresis,
- max_scale=max_scale,
- max_norm=max_norm,
- norm_type=norm_type,
- reduce_bucket_size_in_m=reduce_bucket_size_in_m,
- communication_dtype=communication_dtype,
- overlap_communication=overlap_communication,
- cpu_offload=cpu_offload,
- master_weights=master_weights,
- verbose=verbose,
- )
- self.sp_size = sp_size
- assert self.world_size % sp_size == 0, "world_size must be divisible by sp_size"
- self.dp_size = self.world_size // sp_size
- self.pg_mesh = ProcessGroupMesh(self.dp_size, self.sp_size)
- self.dp_group = self.pg_mesh.get_group_along_axis(DP_AXIS)
- self.sp_group = self.pg_mesh.get_group_along_axis(SP_AXIS)
- self.dp_rank = self.pg_mesh.coordinate(DP_AXIS)
- self.sp_rank = self.pg_mesh.coordinate(SP_AXIS)
-
- def __del__(self):
- """Destroy the prcess groups in ProcessGroupMesh"""
- self.pg_mesh.destroy_mesh_process_groups()
-
- def prepare_dataloader(
- self,
- dataset,
- batch_size,
- shuffle=False,
- seed=1024,
- drop_last=False,
- pin_memory=False,
- num_workers=0,
- distributed_sampler_cls=None,
- **kwargs,
- ):
- _kwargs = kwargs.copy()
- distributed_sampler_cls = distributed_sampler_cls or DistributedSampler
- sampler = distributed_sampler_cls(dataset, num_replicas=self.dp_size, rank=self.dp_rank, shuffle=shuffle)
-
- # Deterministic dataloader
- def seed_worker(worker_id):
- worker_seed = seed
- np.random.seed(worker_seed)
- torch.manual_seed(worker_seed)
- random.seed(worker_seed)
-
- return DataLoader(
- dataset,
- batch_size=batch_size,
- sampler=sampler,
- worker_init_fn=seed_worker,
- drop_last=drop_last,
- pin_memory=pin_memory,
- num_workers=num_workers,
- **_kwargs,
- )
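
A hedged, non-runnable sketch of how this plugin is typically wired into ColossalAI's Booster; it assumes colossalai.launch_from_torch() has already set up a world size divisible by sp_size, and model/optimizer/dataloader are placeholders.

# from colossalai.booster import Booster
#
# plugin = ZeroSeqParallelPlugin(sp_size=2, stage=2, precision="bf16")
# set_sequence_parallel_group(plugin.sp_group)
# set_data_parallel_group(plugin.dp_group)
# booster = Booster(plugin=plugin)
# model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)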
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/shardformer/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/shardformer/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/shardformer/modeling/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/shardformer/modeling/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/shardformer/modeling/t5.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/shardformer/modeling/t5.py
deleted file mode 100644
index eb3ead046bfda2d22e3c56b6c8ed97415f350b3b..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/shardformer/modeling/t5.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-import torch
-import torch.nn as nn
-from opensora.utils.device_utils import is_npu_available
-if is_npu_available():
- import torch_npu
-
-
-class NpuRMSNorm(torch.nn.Module):
- def __init__(self, hidden_size, eps=1e-6):
- """
- Initialize NPU RMSNorm normalization layer
- """
- super().__init__()
- self.weight = nn.Parameter(torch.ones(hidden_size))
- self.eps = eps
-
- def forward(self, x):
- return torch_npu.npu_rms_norm(x, self.weight, epsilon=self.eps)[0]
-
-
-class T5LayerNorm(nn.Module):
- def __init__(self, hidden_size, eps=1e-6):
- """
- Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
- """
- super().__init__()
- self.weight = nn.Parameter(torch.ones(hidden_size))
- self.variance_epsilon = eps
-
- def forward(self, hidden_states):
- # T5 uses a layer norm that only scales and does not shift, also known as Root Mean Square
- # Layer Normalization (https://arxiv.org/abs/1910.07467); the variance is therefore computed
- # without subtracting the mean and there is no bias. We also make sure that the accumulation
- # for half-precision inputs is done in fp32.
-
- variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-
- # convert into half-precision if necessary
- if self.weight.dtype in [torch.float16, torch.bfloat16]:
- hidden_states = hidden_states.to(self.weight.dtype)
-
- return self.weight * hidden_states
-
- @staticmethod
- def from_native_module(module, *args, **kwargs):
- if is_npu_available():
- normalized_shape = module.weight.shape[0]
- layer_norm = NpuRMSNorm(normalized_shape, eps=module.variance_epsilon)
- else:
- assert module.__class__.__name__ == "FusedRMSNorm", (
- "Recovering T5LayerNorm requires the original layer to be apex's Fused RMS Norm."
- "Apex's fused norm is automatically used by Hugging Face Transformers https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L265C5-L265C48"
- )
-
- layer_norm = T5LayerNorm(module.normalized_shape, eps=module.eps)
- layer_norm.weight.data.copy_(module.weight.data)
- layer_norm = layer_norm.to(module.weight.device)
- return layer_norm
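
A quick numerical sanity check for the scale-only norm above against a hand-rolled RMSNorm reference; CPU-only and independent of the NPU path.

import torch

torch.manual_seed(0)
norm = T5LayerNorm(hidden_size=8, eps=1e-6)
x = torch.randn(2, 3, 8)

# reference: scale by 1/RMS, no mean subtraction, no bias
rms = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
assert torch.allclose(norm(x), norm.weight * rms, atol=1e-6)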
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/shardformer/policy/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/shardformer/policy/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/shardformer/policy/t5_encoder.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/shardformer/policy/t5_encoder.py
deleted file mode 100644
index 85c994ecc1a911da5f76b23819148cb1e17b16fa..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/acceleration/shardformer/policy/t5_encoder.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from colossalai.shardformer.modeling.jit import get_jit_fused_dropout_add_func
-from colossalai.shardformer.modeling.t5 import get_jit_fused_T5_layer_ff_forward, get_T5_layer_self_attention_forward
-from colossalai.shardformer.policies.base_policy import Policy, SubModuleReplacementDescription
-
-
-class T5EncoderPolicy(Policy):
- def config_sanity_check(self):
- assert not self.shard_config.enable_tensor_parallelism
- assert not self.shard_config.enable_flash_attention
-
- def preprocess(self):
- return self.model
-
- def module_policy(self):
- from transformers.models.t5.modeling_t5 import T5LayerFF, T5LayerSelfAttention, T5Stack
-
- policy = {}
-
- # check whether apex is installed
- try:
- from opensora.acceleration.shardformer.modeling.t5 import T5LayerNorm
-
- # recover hf from fused rms norm to T5 norm which is faster
- self.append_or_create_submodule_replacement(
- description=SubModuleReplacementDescription(
- suffix="layer_norm",
- target_module=T5LayerNorm,
- ),
- policy=policy,
- target_key=T5LayerFF,
- )
- self.append_or_create_submodule_replacement(
- description=SubModuleReplacementDescription(suffix="layer_norm", target_module=T5LayerNorm),
- policy=policy,
- target_key=T5LayerSelfAttention,
- )
- self.append_or_create_submodule_replacement(
- description=SubModuleReplacementDescription(suffix="final_layer_norm", target_module=T5LayerNorm),
- policy=policy,
- target_key=T5Stack,
- )
- except (ImportError, ModuleNotFoundError):
- pass
-
- # use jit operator
- if self.shard_config.enable_jit_fused:
- self.append_or_create_method_replacement(
- description={
- "forward": get_jit_fused_T5_layer_ff_forward(),
- "dropout_add": get_jit_fused_dropout_add_func(),
- },
- policy=policy,
- target_key=T5LayerFF,
- )
- self.append_or_create_method_replacement(
- description={
- "forward": get_T5_layer_self_attention_forward(),
- "dropout_add": get_jit_fused_dropout_add_func(),
- },
- policy=policy,
- target_key=T5LayerSelfAttention,
- )
-
- return policy
-
- def postprocess(self):
- return self.model
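
A hedged sketch of how a policy like this is usually handed to ColossalAI's ShardFormer; the ShardConfig flags and the optimize() call are assumptions about that API, not part of this file.

# from colossalai.shardformer import ShardConfig, ShardFormer
#
# shard_config = ShardConfig(enable_tensor_parallelism=False, enable_jit_fused=True)
# shard_former = ShardFormer(shard_config=shard_config)
# t5_encoder, _ = shard_former.optimize(t5_encoder, policy=T5EncoderPolicy())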
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/__init__.py
deleted file mode 100644
index 545eb17be11c207ac90723507107eb45d1eea30e..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .dataloader import prepare_dataloader, prepare_variable_dataloader
-from .datasets import IMG_FPS, VariableVideoTextDataset, VideoTextDataset
-from .utils import get_transforms_image, get_transforms_video, save_sample
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/aspect.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/aspect.py
deleted file mode 100644
index 57e33f297a683d22b0c9fa6977e25d5eeca37f60..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/aspect.py
+++ /dev/null
@@ -1,341 +0,0 @@
-import math
-
-# Ours
-
-
-def get_h_w(a, ts, eps=1e-4):
- h = (ts * a) ** 0.5
- h = h + eps
- h = math.ceil(h) if math.ceil(h) % 2 == 0 else math.floor(h)
- w = h / a
- w = w + eps
- w = math.ceil(w) if math.ceil(w) % 2 == 0 else math.floor(w)
- return h, w
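
A worked example of the rounding above at the 360p token budget:

# a = 9 / 16 = 0.5625, ts = 360 * 640 = 230400
# h = sqrt(230400 * 0.5625) = 360 (ceil would give 361, which is odd, so floor keeps 360)
# w = 360 / 0.5625 = 640
assert get_h_w(9 / 16, 360 * 640) == (360, 640)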
-
-
-AR = [
- 3 / 8,
- 9 / 21,
- 0.48,
- 1 / 2,
- 9 / 17,
- 1 / 1.85,
- 9 / 16,
- 5 / 8,
- 2 / 3,
- 3 / 4,
- 1 / 1,
- 4 / 3,
- 3 / 2,
- 16 / 9,
- 17 / 9,
- 2 / 1,
- 1 / 0.48,
-]
-ARV = [0.375, 0.43, 0.48, 0.50, 0.53, 0.54, 0.56, 0.62, 0.67, 0.75, 1.0, 1.33, 1.50, 1.78, 1.89, 2.0, 2.08]
-
-
-def get_aspect_ratios_dict(ts=360 * 640, ars=AR):
- est = {f"{a:.2f}": get_h_w(a, ts) for a in ars}
- return est
-
-
-# S = 8294400
-ASPECT_RATIO_4K = {
- "0.38": (1764, 4704),
- "0.43": (1886, 4400),
- "0.48": (1996, 4158),
- "0.50": (2036, 4072),
- "0.53": (2096, 3960),
- "0.54": (2118, 3918),
- "0.62": (2276, 3642),
- "0.56": (2160, 3840), # base
- "0.67": (2352, 3528),
- "0.75": (2494, 3326),
- "1.00": (2880, 2880),
- "1.33": (3326, 2494),
- "1.50": (3528, 2352),
- "1.78": (3840, 2160),
- "1.89": (3958, 2096),
- "2.00": (4072, 2036),
- "2.08": (4156, 1994),
-}
-
-# S = 2073600
-ASPECT_RATIO_1080P = {
- "0.38": (882, 2352),
- "0.43": (942, 2198),
- "0.48": (998, 2080),
- "0.50": (1018, 2036),
- "0.53": (1048, 1980),
- "0.54": (1058, 1958),
- "0.56": (1080, 1920), # base
- "0.62": (1138, 1820),
- "0.67": (1176, 1764),
- "0.75": (1248, 1664),
- "1.00": (1440, 1440),
- "1.33": (1662, 1246),
- "1.50": (1764, 1176),
- "1.78": (1920, 1080),
- "1.89": (1980, 1048),
- "2.00": (2036, 1018),
- "2.08": (2078, 998),
-}
-
-# S = 921600
-ASPECT_RATIO_720P = {
- "0.38": (588, 1568),
- "0.43": (628, 1466),
- "0.48": (666, 1388),
- "0.50": (678, 1356),
- "0.53": (698, 1318),
- "0.54": (706, 1306),
- "0.56": (720, 1280), # base
- "0.62": (758, 1212),
- "0.67": (784, 1176),
- "0.75": (832, 1110),
- "1.00": (960, 960),
- "1.33": (1108, 832),
- "1.50": (1176, 784),
- "1.78": (1280, 720),
- "1.89": (1320, 698),
- "2.00": (1358, 680),
- "2.08": (1386, 666),
-}
-
-# S = 409920
-ASPECT_RATIO_480P = {
- "0.38": (392, 1046),
- "0.43": (420, 980),
- "0.48": (444, 925),
- "0.50": (452, 904),
- "0.53": (466, 880),
- "0.54": (470, 870),
- "0.56": (480, 854), # base
- "0.62": (506, 810),
- "0.67": (522, 784),
- "0.75": (554, 738),
- "1.00": (640, 640),
- "1.33": (740, 555),
- "1.50": (784, 522),
- "1.78": (854, 480),
- "1.89": (880, 466),
- "2.00": (906, 454),
- "2.08": (924, 444),
-}
-
-# S = 230400
-ASPECT_RATIO_360P = {
- "0.38": (294, 784),
- "0.43": (314, 732),
- "0.48": (332, 692),
- "0.50": (340, 680),
- "0.53": (350, 662),
- "0.54": (352, 652),
- "0.56": (360, 640), # base
- "0.62": (380, 608),
- "0.67": (392, 588),
- "0.75": (416, 554),
- "1.00": (480, 480),
- "1.33": (554, 416),
- "1.50": (588, 392),
- "1.78": (640, 360),
- "1.89": (660, 350),
- "2.00": (678, 340),
- "2.08": (692, 332),
-}
-
-# S = 102240
-ASPECT_RATIO_240P = {
- "0.38": (196, 522),
- "0.43": (210, 490),
- "0.48": (222, 462),
- "0.50": (226, 452),
- "0.53": (232, 438),
- "0.54": (236, 436),
- "0.56": (240, 426), # base
- "0.62": (252, 404),
- "0.67": (262, 393),
- "0.75": (276, 368),
- "1.00": (320, 320),
- "1.33": (370, 278),
- "1.50": (392, 262),
- "1.78": (426, 240),
- "1.89": (440, 232),
- "2.00": (452, 226),
- "2.08": (462, 222),
-}
-
-# S = 36864
-ASPECT_RATIO_144P = {
- "0.38": (117, 312),
- "0.43": (125, 291),
- "0.48": (133, 277),
- "0.50": (135, 270),
- "0.53": (139, 262),
- "0.54": (141, 260),
- "0.56": (144, 256), # base
- "0.62": (151, 241),
- "0.67": (156, 234),
- "0.75": (166, 221),
- "1.00": (192, 192),
- "1.33": (221, 165),
- "1.50": (235, 156),
- "1.78": (256, 144),
- "1.89": (263, 139),
- "2.00": (271, 135),
- "2.08": (277, 132),
-}
-
-# from PixArt
-# S = 1048576
-ASPECT_RATIO_1024 = {
- "0.25": (512, 2048),
- "0.26": (512, 1984),
- "0.27": (512, 1920),
- "0.28": (512, 1856),
- "0.32": (576, 1792),
- "0.33": (576, 1728),
- "0.35": (576, 1664),
- "0.4": (640, 1600),
- "0.42": (640, 1536),
- "0.48": (704, 1472),
- "0.5": (704, 1408),
- "0.52": (704, 1344),
- "0.57": (768, 1344),
- "0.6": (768, 1280),
- "0.68": (832, 1216),
- "0.72": (832, 1152),
- "0.78": (896, 1152),
- "0.82": (896, 1088),
- "0.88": (960, 1088),
- "0.94": (960, 1024),
- "1.0": (1024, 1024),
- "1.07": (1024, 960),
- "1.13": (1088, 960),
- "1.21": (1088, 896),
- "1.29": (1152, 896),
- "1.38": (1152, 832),
- "1.46": (1216, 832),
- "1.67": (1280, 768),
- "1.75": (1344, 768),
- "2.0": (1408, 704),
- "2.09": (1472, 704),
- "2.4": (1536, 640),
- "2.5": (1600, 640),
- "2.89": (1664, 576),
- "3.0": (1728, 576),
- "3.11": (1792, 576),
- "3.62": (1856, 512),
- "3.75": (1920, 512),
- "3.88": (1984, 512),
- "4.0": (2048, 512),
-}
-
-# S = 262144
-ASPECT_RATIO_512 = {
- "0.25": (256, 1024),
- "0.26": (256, 992),
- "0.27": (256, 960),
- "0.28": (256, 928),
- "0.32": (288, 896),
- "0.33": (288, 864),
- "0.35": (288, 832),
- "0.4": (320, 800),
- "0.42": (320, 768),
- "0.48": (352, 736),
- "0.5": (352, 704),
- "0.52": (352, 672),
- "0.57": (384, 672),
- "0.6": (384, 640),
- "0.68": (416, 608),
- "0.72": (416, 576),
- "0.78": (448, 576),
- "0.82": (448, 544),
- "0.88": (480, 544),
- "0.94": (480, 512),
- "1.0": (512, 512),
- "1.07": (512, 480),
- "1.13": (544, 480),
- "1.21": (544, 448),
- "1.29": (576, 448),
- "1.38": (576, 416),
- "1.46": (608, 416),
- "1.67": (640, 384),
- "1.75": (672, 384),
- "2.0": (704, 352),
- "2.09": (736, 352),
- "2.4": (768, 320),
- "2.5": (800, 320),
- "2.89": (832, 288),
- "3.0": (864, 288),
- "3.11": (896, 288),
- "3.62": (928, 256),
- "3.75": (960, 256),
- "3.88": (992, 256),
- "4.0": (1024, 256),
-}
-
-# S = 65536
-ASPECT_RATIO_256 = {
- "0.25": (128, 512),
- "0.26": (128, 496),
- "0.27": (128, 480),
- "0.28": (128, 464),
- "0.32": (144, 448),
- "0.33": (144, 432),
- "0.35": (144, 416),
- "0.4": (160, 400),
- "0.42": (160, 384),
- "0.48": (176, 368),
- "0.5": (176, 352),
- "0.52": (176, 336),
- "0.57": (192, 336),
- "0.6": (192, 320),
- "0.68": (208, 304),
- "0.72": (208, 288),
- "0.78": (224, 288),
- "0.82": (224, 272),
- "0.88": (240, 272),
- "0.94": (240, 256),
- "1.0": (256, 256),
- "1.07": (256, 240),
- "1.13": (272, 240),
- "1.21": (272, 224),
- "1.29": (288, 224),
- "1.38": (288, 208),
- "1.46": (304, 208),
- "1.67": (320, 192),
- "1.75": (336, 192),
- "2.0": (352, 176),
- "2.09": (368, 176),
- "2.4": (384, 160),
- "2.5": (400, 160),
- "2.89": (416, 144),
- "3.0": (432, 144),
- "3.11": (448, 144),
- "3.62": (464, 128),
- "3.75": (480, 128),
- "3.88": (496, 128),
- "4.0": (512, 128),
-}
-
-
-def get_closest_ratio(height: float, width: float, ratios: dict):
- aspect_ratio = height / width
- closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - aspect_ratio))
- return closest_ratio
-
-
-ASPECT_RATIOS = {
- "144p": (36864, ASPECT_RATIO_144P),
- "256": (65536, ASPECT_RATIO_256),
- "240p": (102240, ASPECT_RATIO_240P),
- "360p": (230400, ASPECT_RATIO_360P),
- "512": (262144, ASPECT_RATIO_512),
- "480p": (409920, ASPECT_RATIO_480P),
- "720p": (921600, ASPECT_RATIO_720P),
- "1024": (1048576, ASPECT_RATIO_1024),
- "1080p": (2073600, ASPECT_RATIO_1080P),
- "4k": (8294400, ASPECT_RATIO_4K),
-}
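
An illustrative lookup that ties get_closest_ratio to the tables above: mapping a raw 1080x1920 portrait clip onto the 240p bucket.

num_pixels, table = ASPECT_RATIOS["240p"]         # (102240, ASPECT_RATIO_240P)
ratio_key = get_closest_ratio(1080, 1920, table)  # 1080 / 1920 = 0.5625 -> "0.56"
assert table[ratio_key] == (240, 426)             # target (height, width) for this bucket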
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/bucket.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/bucket.py
deleted file mode 100644
index 01d9650727745266af0c5c4df255c775031c31f5..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/bucket.py
+++ /dev/null
@@ -1,134 +0,0 @@
-from collections import OrderedDict
-
-import numpy as np
-
-from .aspect import ASPECT_RATIOS, get_closest_ratio
-
-
-def find_approximate_hw(hw, hw_dict, approx=0.8):
- for k, v in hw_dict.items():
- if hw >= v * approx:
- return k
- return None
-
-
-def find_closet_smaller_bucket(t, t_dict, frame_interval):
- # process image
- if t == 1:
- if 1 in t_dict:
- return 1
- else:
- return None
- # process video
- for k, v in t_dict.items():
- if t >= v * frame_interval and v != 1:
- return k
- return None
-
-
-class Bucket:
- def __init__(self, bucket_config):
- for key in bucket_config:
- assert key in ASPECT_RATIOS, f"Aspect ratio {key} not found."
- # wrap config with OrderedDict
- bucket_probs = OrderedDict()
- bucket_bs = OrderedDict()
- bucket_names = sorted(bucket_config.keys(), key=lambda x: ASPECT_RATIOS[x][0], reverse=True)
- for key in bucket_names:
- bucket_time_names = sorted(bucket_config[key].keys(), key=lambda x: x, reverse=True)
- bucket_probs[key] = OrderedDict({k: bucket_config[key][k][0] for k in bucket_time_names})
- bucket_bs[key] = OrderedDict({k: bucket_config[key][k][1] for k in bucket_time_names})
-
- # first level: HW
- num_bucket = 0
- hw_criteria = dict()
- t_criteria = dict()
- ar_criteria = dict()
- bucket_id = OrderedDict()
- bucket_id_cnt = 0
- for k1, v1 in bucket_probs.items():
- hw_criteria[k1] = ASPECT_RATIOS[k1][0]
- t_criteria[k1] = dict()
- ar_criteria[k1] = dict()
- bucket_id[k1] = dict()
- for k2, _ in v1.items():
- t_criteria[k1][k2] = k2
- bucket_id[k1][k2] = bucket_id_cnt
- bucket_id_cnt += 1
- ar_criteria[k1][k2] = dict()
- for k3, v3 in ASPECT_RATIOS[k1][1].items():
- ar_criteria[k1][k2][k3] = v3
- num_bucket += 1
-
- self.bucket_probs = bucket_probs
- self.bucket_bs = bucket_bs
- self.bucket_id = bucket_id
- self.hw_criteria = hw_criteria
- self.t_criteria = t_criteria
- self.ar_criteria = ar_criteria
- self.num_bucket = num_bucket
- print(f"Number of buckets: {num_bucket}")
-
- def get_bucket_id(self, T, H, W, frame_interval=1, seed=None):
- resolution = H * W
- approx = 0.8
-
- fail = True
- for hw_id, t_criteria in self.bucket_probs.items():
- if resolution < self.hw_criteria[hw_id] * approx:
- continue
-
- # if sample is an image
- if T == 1:
- if 1 in t_criteria:
- rng = np.random.default_rng(seed + self.bucket_id[hw_id][1])
- if rng.random() < t_criteria[1]:
- fail = False
- t_id = 1
- break
- else:
- continue
-
- # otherwise, find suitable t_id for video
- t_fail = True
- for t_id, prob in t_criteria.items():
- if T > t_id * frame_interval and t_id != 1:
- t_fail = False
- break
- if t_fail:
- continue
-
- # leave the loop if prob is high enough
- rng = np.random.default_rng(seed + self.bucket_id[hw_id][t_id])
- if prob == 1 or rng.random() < prob:
- fail = False
- break
- if fail:
- return None
-
- # get aspect ratio id
- ar_criteria = self.ar_criteria[hw_id][t_id]
- ar_id = get_closest_ratio(H, W, ar_criteria)
- return hw_id, t_id, ar_id
-
- def get_thw(self, bucket_id):
- assert len(bucket_id) == 3
- T = self.t_criteria[bucket_id[0]][bucket_id[1]]
- H, W = self.ar_criteria[bucket_id[0]][bucket_id[1]][bucket_id[2]]
- return T, H, W
-
- def get_prob(self, bucket_id):
- return self.bucket_probs[bucket_id[0]][bucket_id[1]]
-
- def get_batch_size(self, bucket_id):
- return self.bucket_bs[bucket_id[0]][bucket_id[1]]
-
- def __len__(self):
- return self.num_bucket
-
-
-def closet_smaller_bucket(value, bucket):
- for i in range(1, len(bucket)):
- if value < bucket[i]:
- return bucket[i - 1]
- return bucket[-1]
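
A hedged sketch of the expected bucket_config layout, {resolution: {num_frames: (keep_prob, batch_size)}}, with probabilities set to 1.0 so the random keep/drop step is deterministic.

bucket_config = {
    "240p": {1: (1.0, 64), 16: (1.0, 8)},
    "480p": {1: (1.0, 32), 16: (1.0, 2)},
}
bucket = Bucket(bucket_config)

# a 32-frame 480x854 clip fills the largest bucket it can: ("480p", 16 frames, closest ratio "0.56")
bucket_id = bucket.get_bucket_id(T=32, H=480, W=854, frame_interval=1, seed=42)
assert bucket_id == ("480p", 16, "0.56")
assert bucket.get_thw(bucket_id) == (16, 480, 854)
assert bucket.get_batch_size(bucket_id) == 2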
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/dataloader.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/dataloader.py
deleted file mode 100644
index bf360b1de4fb16cf7124b9ce2a7a34812614ef09..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/dataloader.py
+++ /dev/null
@@ -1,141 +0,0 @@
-import random
-from typing import Iterator, Optional
-
-import numpy as np
-import torch
-from torch.distributed import ProcessGroup
-from torch.distributed.distributed_c10d import _get_default_group
-from torch.utils.data import DataLoader, Dataset
-from torch.utils.data.distributed import DistributedSampler
-
-from .sampler import VariableVideoBatchSampler
-
-
-class StatefulDistributedSampler(DistributedSampler):
- def __init__(
- self,
- dataset: Dataset,
- num_replicas: Optional[int] = None,
- rank: Optional[int] = None,
- shuffle: bool = True,
- seed: int = 0,
- drop_last: bool = False,
- ) -> None:
- super().__init__(dataset, num_replicas, rank, shuffle, seed, drop_last)
- self.start_index: int = 0
-
- def __iter__(self) -> Iterator:
- iterator = super().__iter__()
- indices = list(iterator)
- indices = indices[self.start_index :]
- return iter(indices)
-
- def __len__(self) -> int:
- return self.num_samples - self.start_index
-
- def set_start_index(self, start_index: int) -> None:
- self.start_index = start_index
-
-
-def prepare_dataloader(
- dataset,
- batch_size,
- shuffle=False,
- seed=1024,
- drop_last=False,
- pin_memory=False,
- num_workers=0,
- process_group: Optional[ProcessGroup] = None,
- **kwargs,
-):
- r"""
- Prepare a dataloader for distributed training. The dataloader will be wrapped by
- `torch.utils.data.DataLoader` and `StatefulDistributedSampler`.
-
-
- Args:
- dataset (`torch.utils.data.Dataset`): The dataset to be loaded.
- shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
- seed (int, optional): Random worker seed for sampling, defaults to 1024.
- add_sampler: Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True.
- drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
- is not divisible by the batch size. If False and the size of dataset is not divisible by
- the batch size, then the last batch will be smaller, defaults to False.
- pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
- num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
- kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
- `DataLoader `_.
-
- Returns:
- :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
- """
- _kwargs = kwargs.copy()
- process_group = process_group or _get_default_group()
- sampler = StatefulDistributedSampler(
- dataset,
- num_replicas=process_group.size(),
- rank=process_group.rank(),
- shuffle=shuffle,
- )
-
- # Deterministic dataloader
- def seed_worker(worker_id):
- worker_seed = seed
- np.random.seed(worker_seed)
- torch.manual_seed(worker_seed)
- random.seed(worker_seed)
-
- return DataLoader(
- dataset,
- batch_size=batch_size,
- sampler=sampler,
- worker_init_fn=seed_worker,
- drop_last=drop_last,
- pin_memory=pin_memory,
- num_workers=num_workers,
- **_kwargs,
- )
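
A single-process smoke test (gloo backend) for prepare_dataloader; the TensorDataset stands in for VideoTextDataset.

import os
import torch
import torch.distributed as dist
from torch.utils.data import TensorDataset

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
if not dist.is_initialized():
    dist.init_process_group("gloo", rank=0, world_size=1)

toy = TensorDataset(torch.arange(8).float())
loader = prepare_dataloader(toy, batch_size=4, shuffle=True)
loader.sampler.set_start_index(0)   # StatefulDistributedSampler supports mid-epoch resume
for (batch,) in loader:
    print(batch.shape)              # torch.Size([4])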
-
-
-def prepare_variable_dataloader(
- dataset,
- batch_size,
- bucket_config,
- shuffle=False,
- seed=1024,
- drop_last=False,
- pin_memory=False,
- num_workers=0,
- process_group=None,
- num_bucket_build_workers=1,
- **kwargs,
-):
- _kwargs = kwargs.copy()
- process_group = process_group or _get_default_group()
- batch_sampler = VariableVideoBatchSampler(
- dataset,
- bucket_config,
- num_replicas=process_group.size(),
- rank=process_group.rank(),
- shuffle=shuffle,
- seed=seed,
- drop_last=drop_last,
- verbose=True,
- num_bucket_build_workers=num_bucket_build_workers,
- )
-
- # Deterministic dataloader
- def seed_worker(worker_id):
- worker_seed = seed
- np.random.seed(worker_seed)
- torch.manual_seed(worker_seed)
- random.seed(worker_seed)
-
- return torch.utils.data.DataLoader(
- dataset,
- batch_sampler=batch_sampler,
- worker_init_fn=seed_worker,
- pin_memory=pin_memory,
- num_workers=num_workers,
- **_kwargs,
- )
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/datasets.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/datasets.py
deleted file mode 100644
index 4c2b3164ac014aa0c1e6032b97d7211bc681177e..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/datasets.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import os
-
-import numpy as np
-import torch
-import torchvision
-from torchvision.datasets.folder import IMG_EXTENSIONS, pil_loader
-
-from opensora.registry import DATASETS
-
-from .utils import VID_EXTENSIONS, get_transforms_image, get_transforms_video, read_file, temporal_random_crop
-
-IMG_FPS = 120
-
-
-@DATASETS.register_module()
-class VideoTextDataset(torch.utils.data.Dataset):
- """load video according to the csv file.
-
- Args:
- target_video_len (int): the number of video frames will be load.
- align_transform (callable): Align different videos in a specified size.
- temporal_sample (callable): Sample the target length of a video.
- """
-
- def __init__(
- self,
- data_path,
- num_frames=16,
- frame_interval=1,
- image_size=(256, 256),
- transform_name="center",
- ):
- self.data_path = data_path
- self.data = read_file(data_path)
- self.num_frames = num_frames
- self.frame_interval = frame_interval
- self.image_size = image_size
- self.transforms = {
- "image": get_transforms_image(transform_name, image_size),
- "video": get_transforms_video(transform_name, image_size),
- }
-
- def _print_data_number(self):
- num_videos = 0
- num_images = 0
- for path in self.data["path"]:
- if self.get_type(path) == "video":
- num_videos += 1
- else:
- num_images += 1
- print(f"Dataset contains {num_videos} videos and {num_images} images.")
-
- def get_type(self, path):
- ext = os.path.splitext(path)[-1].lower()
- if ext.lower() in VID_EXTENSIONS:
- return "video"
- else:
- assert ext.lower() in IMG_EXTENSIONS, f"Unsupported file format: {ext}"
- return "image"
-
- def getitem(self, index):
- sample = self.data.iloc[index]
- path = sample["path"]
- text = sample["text"]
- file_type = self.get_type(path)
-
- if file_type == "video":
- # loading
- vframes, _, _ = torchvision.io.read_video(filename=path, pts_unit="sec", output_format="TCHW")
-
- # Sampling video frames
- video = temporal_random_crop(vframes, self.num_frames, self.frame_interval)
-
- # transform
- transform = self.transforms["video"]
- video = transform(video) # T C H W
- else:
- # loading
- image = pil_loader(path)
-
- # transform
- transform = self.transforms["image"]
- image = transform(image)
-
- # repeat
- video = image.unsqueeze(0).repeat(self.num_frames, 1, 1, 1)
-
- # TCHW -> CTHW
- video = video.permute(1, 0, 2, 3)
- return {"video": video, "text": text}
-
- def __getitem__(self, index):
- for _ in range(10):
- try:
- return self.getitem(index)
- except Exception as e:
- path = self.data.iloc[index]["path"]
- print(f"data {path}: {e}")
- index = np.random.randint(len(self))
- raise RuntimeError("Too many bad data.")
-
- def __len__(self):
- return len(self.data)
-
-
-@DATASETS.register_module()
-class VariableVideoTextDataset(VideoTextDataset):
- def __init__(
- self,
- data_path,
- num_frames=None,
- frame_interval=1,
- image_size=None,
- transform_name=None,
- ):
- super().__init__(data_path, num_frames, frame_interval, image_size, transform_name=None)
- self.transform_name = transform_name
- self.data["id"] = np.arange(len(self.data))
-
- def get_data_info(self, index):
- T = self.data.iloc[index]["num_frames"]
- H = self.data.iloc[index]["height"]
- W = self.data.iloc[index]["width"]
- return T, H, W
-
- def getitem(self, index):
- # a hack to pass in the (time, height, width) info from sampler
- index, num_frames, height, width = [int(val) for val in index.split("-")]
-
- sample = self.data.iloc[index]
- path = sample["path"]
- text = sample["text"]
- file_type = self.get_type(path)
- ar = width / height
-
- video_fps = 24 # default fps
- if file_type == "video":
- # loading
- vframes, _, infos = torchvision.io.read_video(filename=path, pts_unit="sec", output_format="TCHW")
- if "video_fps" in infos:
- video_fps = infos["video_fps"]
-
- # Sampling video frames
- video = temporal_random_crop(vframes, num_frames, self.frame_interval)
-
- # transform
- transform = get_transforms_video(self.transform_name, (height, width))
- video = transform(video) # T C H W
- else:
- # loading
- image = pil_loader(path)
- video_fps = IMG_FPS
-
- # transform
- transform = get_transforms_image(self.transform_name, (height, width))
- image = transform(image)
-
- # repeat
- video = image.unsqueeze(0)
-
- # TCHW -> CTHW
- video = video.permute(1, 0, 2, 3)
- return {
- "video": video,
- "text": text,
- "num_frames": num_frames,
- "height": height,
- "width": width,
- "ar": ar,
- "fps": video_fps,
- }
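
VariableVideoTextDataset relies on the batch sampler (sampler.py below) to smuggle the chosen bucket shape through the index; a minimal sketch of that string contract:

encoded = "42-16-240-426"   # "<row id>-<num_frames>-<height>-<width>", built by the sampler
index, num_frames, height, width = [int(v) for v in encoded.split("-")]
assert (index, num_frames, height, width) == (42, 16, 240, 426)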
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/sampler.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/sampler.py
deleted file mode 100644
index bdb1b76055fe6805fc662e62309e22b952c7fd42..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/sampler.py
+++ /dev/null
@@ -1,241 +0,0 @@
-import warnings
-from collections import OrderedDict, defaultdict
-from pprint import pprint
-from typing import Iterator, List, Optional
-
-import torch
-import torch.distributed as dist
-from pandarallel import pandarallel
-from torch.utils.data import DistributedSampler
-
-from .bucket import Bucket
-from .datasets import VariableVideoTextDataset
-
-
-# HACK: use pandarallel
-# pandarallel should only access local variables
-def apply(data, method=None, frame_interval=None, seed=None, num_bucket=None):
- return method(
- data["num_frames"],
- data["height"],
- data["width"],
- frame_interval,
- seed + data["id"] * num_bucket,
- )
-
-
-class VariableVideoBatchSampler(DistributedSampler):
- def __init__(
- self,
- dataset: VariableVideoTextDataset,
- bucket_config: dict,
- num_replicas: Optional[int] = None,
- rank: Optional[int] = None,
- shuffle: bool = True,
- seed: int = 0,
- drop_last: bool = False,
- verbose: bool = False,
- num_bucket_build_workers: int = 1,
- ) -> None:
- super().__init__(
- dataset=dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle, seed=seed, drop_last=drop_last
- )
- self.dataset = dataset
- self.bucket = Bucket(bucket_config)
- self.verbose = verbose
- self.last_micro_batch_access_index = 0
- self.approximate_num_batch = None
-
- self._get_num_batch_cached_bucket_sample_dict = None
- self.num_bucket_build_workers = num_bucket_build_workers
-
- def group_by_bucket(self) -> dict:
- bucket_sample_dict = OrderedDict()
-
- pandarallel.initialize(nb_workers=self.num_bucket_build_workers, progress_bar=False)
- bucket_ids = self.dataset.data.parallel_apply(
- apply,
- axis=1,
- method=self.bucket.get_bucket_id,
- frame_interval=self.dataset.frame_interval,
- seed=self.seed + self.epoch,
- num_bucket=self.bucket.num_bucket,
- )
-
- # group by bucket
- # each data sample is put into a bucket with a similar image/video size
- for i in range(len(self.dataset)):
- bucket_id = bucket_ids[i]
- if bucket_id is None:
- continue
- if bucket_id not in bucket_sample_dict:
- bucket_sample_dict[bucket_id] = []
- bucket_sample_dict[bucket_id].append(i)
- return bucket_sample_dict
-
- def get_num_batch(self) -> int:
- bucket_sample_dict = self.group_by_bucket()
- self._get_num_batch_cached_bucket_sample_dict = bucket_sample_dict
-
- # calculate the number of batches
- if self.verbose:
- self._print_bucket_info(bucket_sample_dict)
- return self.approximate_num_batch
-
- def __iter__(self) -> Iterator[List[int]]:
- if self._get_num_batch_cached_bucket_sample_dict is not None:
- bucket_sample_dict = self._get_num_batch_cached_bucket_sample_dict
- self._get_num_batch_cached_bucket_sample_dict = None
- else:
- bucket_sample_dict = self.group_by_bucket()
- if self.verbose:
- self._print_bucket_info(bucket_sample_dict)
-
- g = torch.Generator()
- g.manual_seed(self.seed + self.epoch)
- bucket_micro_batch_count = OrderedDict()
- bucket_last_consumed = OrderedDict()
-
- # process the samples
- for bucket_id, data_list in bucket_sample_dict.items():
- # handle droplast
- bs_per_gpu = self.bucket.get_batch_size(bucket_id)
- remainder = len(data_list) % bs_per_gpu
-
- if remainder > 0:
- if not self.drop_last:
- # if there is remainder, we pad to make it divisible
- data_list += data_list[: bs_per_gpu - remainder]
- else:
- # we just drop the remainder to make it divisible
- data_list = data_list[:-remainder]
- bucket_sample_dict[bucket_id] = data_list
-
- # handle shuffle
- if self.shuffle:
- data_indices = torch.randperm(len(data_list), generator=g).tolist()
- data_list = [data_list[i] for i in data_indices]
- bucket_sample_dict[bucket_id] = data_list
-
- # compute how many micro-batches each bucket has
- num_micro_batches = len(data_list) // bs_per_gpu
- bucket_micro_batch_count[bucket_id] = num_micro_batches
-
- # compute the bucket access order
- # each bucket may have more than one batch of data
- # thus bucket_id may appear more than 1 time
- bucket_id_access_order = []
- for bucket_id, num_micro_batch in bucket_micro_batch_count.items():
- bucket_id_access_order.extend([bucket_id] * num_micro_batch)
-
- # randomize the access order
- if self.shuffle:
- bucket_id_access_order_indices = torch.randperm(len(bucket_id_access_order), generator=g).tolist()
- bucket_id_access_order = [bucket_id_access_order[i] for i in bucket_id_access_order_indices]
-
- # make the number of bucket accesses divisible by dp size
- remainder = len(bucket_id_access_order) % self.num_replicas
- if remainder > 0:
- if self.drop_last:
- bucket_id_access_order = bucket_id_access_order[: len(bucket_id_access_order) - remainder]
- else:
- bucket_id_access_order += bucket_id_access_order[: self.num_replicas - remainder]
-
- # prepare each batch from its bucket
- # according to the predefined bucket access order
- num_iters = len(bucket_id_access_order) // self.num_replicas
- start_iter_idx = self.last_micro_batch_access_index // self.num_replicas
-
- # re-compute the micro-batch consumption
- # this is useful when resuming from a state dict with a different number of GPUs
- self.last_micro_batch_access_index = start_iter_idx * self.num_replicas
- for i in range(self.last_micro_batch_access_index):
- bucket_id = bucket_id_access_order[i]
- bucket_bs = self.bucket.get_batch_size(bucket_id)
- if bucket_id in bucket_last_consumed:
- bucket_last_consumed[bucket_id] += bucket_bs
- else:
- bucket_last_consumed[bucket_id] = bucket_bs
-
- for i in range(start_iter_idx, num_iters):
- bucket_access_list = bucket_id_access_order[i * self.num_replicas : (i + 1) * self.num_replicas]
- self.last_micro_batch_access_index += self.num_replicas
-
- # compute the data samples consumed by each access
- bucket_access_boundaries = []
- for bucket_id in bucket_access_list:
- bucket_bs = self.bucket.get_batch_size(bucket_id)
- last_consumed_index = bucket_last_consumed.get(bucket_id, 0)
- bucket_access_boundaries.append([last_consumed_index, last_consumed_index + bucket_bs])
-
- # update consumption
- if bucket_id in bucket_last_consumed:
- bucket_last_consumed[bucket_id] += bucket_bs
- else:
- bucket_last_consumed[bucket_id] = bucket_bs
-
- # compute the range of data accessed by each GPU
- bucket_id = bucket_access_list[self.rank]
- boundary = bucket_access_boundaries[self.rank]
- cur_micro_batch = bucket_sample_dict[bucket_id][boundary[0] : boundary[1]]
-
- # encode t, h, w into the sample index
- real_t, real_h, real_w = self.bucket.get_thw(bucket_id)
- cur_micro_batch = [f"{idx}-{real_t}-{real_h}-{real_w}" for idx in cur_micro_batch]
- yield cur_micro_batch
-
- self._reset()
-
- def _reset(self):
- self.last_micro_batch_access_index = 0
-
- def state_dict(self, num_steps: int) -> dict:
- # the last_micro_batch_access_index in the __iter__ is often
- # not accurate during multi-workers and data prefetching
- # thus, we need the user to pass the actual steps which have been executed
- # to calculate the correct last_micro_batch_access_index
- return {"seed": self.seed, "epoch": self.epoch, "last_micro_batch_access_index": num_steps * self.num_replicas}
-
- def load_state_dict(self, state_dict: dict) -> None:
- self.__dict__.update(state_dict)
-
- def _print_bucket_info(self, bucket_sample_dict: dict, verbose=True) -> None:
- total_samples = 0
- num_batch = 0
- num_dict = {}
- num_aspect_dict = defaultdict(int)
- num_hwt_dict = defaultdict(int)
- for k, v in bucket_sample_dict.items():
- size = len(v)
- total_samples += size
- num_dict[k] = size
- num_aspect_dict[k[-1]] += size
- num_hwt_dict[k[:-1]] += size
- num_batch += size // self.bucket.get_batch_size(k[:-1])
- if dist.get_rank() == 0 and verbose:
- print(f"Total training samples: {total_samples}, num buckets: {len(num_dict)}")
- print("Bucket samples:")
- pprint(num_dict)
- print("Bucket samples by aspect ratio:")
- pprint(num_aspect_dict)
- print("Bucket samples by HxWxT:")
- pprint(num_hwt_dict)
- print(f"Number of batches: {num_batch}")
- self.approximate_num_batch = num_batch
-
- def set_epoch(self, epoch: int) -> None:
- super().set_epoch(epoch)
-
- def __len__(self) -> int:
- warnings.warn(
- "The length of VariableVideoBatchSampler is dynamic and may not be accurate. Return the max value."
- )
- min_batch_size = None
- for v in self.bucket.bucket_bs.values():
- for bs in v.values():
- if bs is not None and (min_batch_size is None or bs < min_batch_size):
- min_batch_size = bs
- if self.drop_last:
- return len(self.dataset) // min_batch_size
- else:
- return (len(self.dataset) + min_batch_size - 1) // min_batch_size
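
A hedged sketch of mid-epoch resume with this sampler: the trainer records how many steps actually ran and passes that back, since the index tracked inside __iter__ runs ahead under prefetching. `sampler` here is assumed to be an already-constructed VariableVideoBatchSampler.

# state = sampler.state_dict(num_steps=1200)   # last_micro_batch_access_index = 1200 * num_replicas
# torch.save(state, "sampler_state.pt")
# ...
# sampler.load_state_dict(torch.load("sampler_state.pt"))
# sampler.set_epoch(state["epoch"])            # keep the shuffle RNG in sync with the saved run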
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/utils.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/utils.py
deleted file mode 100644
index 9b0be4cde6e26fc9bdb121b2311fd30e6a6b32a1..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/utils.py
+++ /dev/null
@@ -1,205 +0,0 @@
-import os
-import re
-
-import numpy as np
-import pandas as pd
-import requests
-import torch
-import torchvision
-import torchvision.transforms as transforms
-from PIL import Image
-from torchvision.datasets.folder import IMG_EXTENSIONS, pil_loader
-from torchvision.io import write_video
-from torchvision.utils import save_image
-
-from . import video_transforms
-
-VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
-
-regex = re.compile(
- r"^(?:http|ftp)s?://" # http:// or https://
- r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain...
- r"localhost|" # localhost...
- r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip
- r"(?::\d+)?" # optional port
- r"(?:/?|[/?]\S+)$",
- re.IGNORECASE,
-)
-
-
-def is_url(url):
- return re.match(regex, url) is not None
-
-
-def read_file(input_path):
- if input_path.endswith(".csv"):
- return pd.read_csv(input_path)
- elif input_path.endswith(".parquet"):
- return pd.read_parquet(input_path)
- else:
- raise NotImplementedError(f"Unsupported file format: {input_path}")
-
-
-def download_url(input_path):
- output_dir = "cache"
- if not os.path.exists(output_dir):
- os.makedirs(output_dir)
- base_name = os.path.basename(input_path)
- output_path = os.path.join(output_dir, base_name)
- img_data = requests.get(input_path).content
- with open(output_path, "wb") as handler:
- handler.write(img_data)
- print(f"URL {input_path} downloaded to {output_path}")
- return output_path
-
-
-def temporal_random_crop(vframes, num_frames, frame_interval):
- temporal_sample = video_transforms.TemporalRandomCrop(num_frames * frame_interval)
- total_frames = len(vframes)
- start_frame_ind, end_frame_ind = temporal_sample(total_frames)
- assert end_frame_ind - start_frame_ind >= num_frames
- frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, num_frames, dtype=int)
- video = vframes[frame_indice]
- return video
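
An illustrative call: sampling 16 frames with stride 2 out of a 100-frame clip.

import torch

vframes = torch.zeros(100, 3, 64, 64, dtype=torch.uint8)   # T C H W, as returned by read_video
clip = temporal_random_crop(vframes, num_frames=16, frame_interval=2)
assert clip.shape == (16, 3, 64, 64)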
-
-
-def get_transforms_video(name="center", image_size=(256, 256)):
- if name is None:
- return None
- elif name == "center":
- assert image_size[0] == image_size[1], "image_size must be square for center crop"
- transform_video = transforms.Compose(
- [
- video_transforms.ToTensorVideo(), # TCHW
- # video_transforms.RandomHorizontalFlipVideo(),
- video_transforms.UCFCenterCropVideo(image_size[0]),
- transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
- ]
- )
- elif name == "resize_crop":
- transform_video = transforms.Compose(
- [
- video_transforms.ToTensorVideo(), # TCHW
- video_transforms.ResizeCrop(image_size),
- transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
- ]
- )
- else:
- raise NotImplementedError(f"Transform {name} not implemented")
- return transform_video
-
-
-def get_transforms_image(name="center", image_size=(256, 256)):
- if name is None:
- return None
- elif name == "center":
- assert image_size[0] == image_size[1], "Image size must be square for center crop"
- transform = transforms.Compose(
- [
- transforms.Lambda(lambda pil_image: center_crop_arr(pil_image, image_size[0])),
- # transforms.RandomHorizontalFlip(),
- transforms.ToTensor(),
- transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
- ]
- )
- elif name == "resize_crop":
- transform = transforms.Compose(
- [
- transforms.Lambda(lambda pil_image: resize_crop_to_fill(pil_image, image_size)),
- transforms.ToTensor(),
- transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
- ]
- )
- else:
- raise NotImplementedError(f"Transform {name} not implemented")
- return transform
-
-
-def read_image_from_path(path, transform=None, transform_name="center", num_frames=1, image_size=(256, 256)):
- image = pil_loader(path)
- if transform is None:
- transform = get_transforms_image(image_size=image_size, name=transform_name)
- image = transform(image)
- video = image.unsqueeze(0).repeat(num_frames, 1, 1, 1)
- video = video.permute(1, 0, 2, 3)
- return video
-
-
-def read_video_from_path(path, transform=None, transform_name="center", image_size=(256, 256)):
- vframes, aframes, info = torchvision.io.read_video(filename=path, pts_unit="sec", output_format="TCHW")
- if transform is None:
- transform = get_transforms_video(image_size=image_size, name=transform_name)
- video = transform(vframes) # T C H W
- video = video.permute(1, 0, 2, 3)
- return video
-
-
-def read_from_path(path, image_size, transform_name="center"):
- if is_url(path):
- path = download_url(path)
- ext = os.path.splitext(path)[-1].lower()
- if ext.lower() in VID_EXTENSIONS:
- return read_video_from_path(path, image_size=image_size, transform_name=transform_name)
- else:
- assert ext.lower() in IMG_EXTENSIONS, f"Unsupported file format: {ext}"
- return read_image_from_path(path, image_size=image_size, transform_name=transform_name)
-
-
-def save_sample(x, fps=8, save_path=None, normalize=True, value_range=(-1, 1), force_video=False):
- """
- Args:
- x (Tensor): shape [C, T, H, W]
- """
- assert x.ndim == 4
-
- if not force_video and x.shape[1] == 1: # T = 1: save as image
- save_path += ".png"
- x = x.squeeze(1)
- save_image([x], save_path, normalize=normalize, value_range=value_range)
- else:
- save_path += ".mp4"
- if normalize:
- low, high = value_range
- x.clamp_(min=low, max=high)
- x.sub_(low).div_(max(high - low, 1e-5))
-
- x = x.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 3, 0).to("cpu", torch.uint8)
- write_video(save_path, x, fps=fps, video_codec="h264")
- print(f"Saved to {save_path}")
- return save_path
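
Illustrative calls: a single-frame tensor is written as a PNG and a multi-frame one as an MP4 (write_video requires an ffmpeg/PyAV install); the extension is appended by save_sample itself.

import torch

frame = torch.rand(3, 1, 64, 64) * 2 - 1             # C T H W in (-1, 1), T = 1
save_sample(frame, save_path="sample_image")          # -> sample_image.png

clip = torch.rand(3, 16, 64, 64) * 2 - 1              # 16 frames
save_sample(clip, fps=8, save_path="sample_video")    # -> sample_video.mp4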
-
-
-def center_crop_arr(pil_image, image_size):
- """
- Center cropping implementation from ADM.
- https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
- """
- while min(*pil_image.size) >= 2 * image_size:
- pil_image = pil_image.resize(tuple(x // 2 for x in pil_image.size), resample=Image.BOX)
-
- scale = image_size / min(*pil_image.size)
- pil_image = pil_image.resize(tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC)
-
- arr = np.array(pil_image)
- crop_y = (arr.shape[0] - image_size) // 2
- crop_x = (arr.shape[1] - image_size) // 2
- return Image.fromarray(arr[crop_y : crop_y + image_size, crop_x : crop_x + image_size])
-
-
-def resize_crop_to_fill(pil_image, image_size):
- w, h = pil_image.size # PIL is (W, H)
- th, tw = image_size
- rh, rw = th / h, tw / w
- if rh > rw:
- sh, sw = th, round(w * rh)
- image = pil_image.resize((sw, sh), Image.BICUBIC)
- i = 0
- j = int(round((sw - tw) / 2.0))
- else:
- sh, sw = round(h * rw), tw
- image = pil_image.resize((sw, sh), Image.BICUBIC)
- i = int(round((sh - th) / 2.0))
- j = 0
- arr = np.array(image)
- assert i + th <= arr.shape[0] and j + tw <= arr.shape[1]
- return Image.fromarray(arr[i : i + th, j : j + tw])
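
A worked example of the scale-then-crop arithmetic above, filling a 240x426 target from a 720x1280 frame.

from PIL import Image

# rh = 240 / 720 = 0.3333 > rw = 426 / 1280 = 0.3328, so scale by rh to (427, 240),
# then crop the one extra column: j = int(round((427 - 426) / 2)) = 0
img = Image.new("RGB", (1280, 720))
out = resize_crop_to_fill(img, (240, 426))
assert out.size == (426, 240)   # PIL reports (width, height)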
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/video_transforms.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/video_transforms.py
deleted file mode 100644
index 8cf50468ee96e339c0dfb6401e83a8f34c29b900..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/datasets/video_transforms.py
+++ /dev/null
@@ -1,520 +0,0 @@
-# Copyright 2024 Vchitect/Latte
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-# http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Modified from Latte
-
-# - This file is adapted from https://github.com/Vchitect/Latte/blob/main/datasets/video_transforms.py
-
-
-import numbers
-import random
-
-import numpy as np
-import torch
-
-
-def _is_tensor_video_clip(clip):
- if not torch.is_tensor(clip):
- raise TypeError("clip should be Tensor. Got %s" % type(clip))
-
- if not clip.ndimension() == 4:
- raise ValueError("clip should be 4D. Got %dD" % clip.dim())
-
- return True
-
-
-def crop(clip, i, j, h, w):
- """
- Args:
- clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
- """
- if len(clip.size()) != 4:
- raise ValueError("clip should be a 4D tensor")
- return clip[..., i : i + h, j : j + w]
-
-
-def resize(clip, target_size, interpolation_mode):
- if len(target_size) != 2:
- raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
- return torch.nn.functional.interpolate(clip, size=target_size, mode=interpolation_mode, align_corners=False)
-
-
-def resize_scale(clip, target_size, interpolation_mode):
- if len(target_size) != 2:
- raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
- H, W = clip.size(-2), clip.size(-1)
- scale_ = target_size[0] / min(H, W)
- return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=False)
-
-
-def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
- """
- Do spatial cropping and resizing to the video clip
- Args:
- clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
- i (int): i in (i,j) i.e coordinates of the upper left corner.
- j (int): j in (i,j) i.e coordinates of the upper left corner.
- h (int): Height of the cropped region.
- w (int): Width of the cropped region.
- size (tuple(int, int)): height and width of resized clip
- Returns:
- clip (torch.tensor): Resized and cropped clip. Size is (T, C, H, W)
- """
- if not _is_tensor_video_clip(clip):
- raise ValueError("clip should be a 4D torch.tensor")
- clip = crop(clip, i, j, h, w)
- clip = resize(clip, size, interpolation_mode)
- return clip
-
-
-def center_crop(clip, crop_size):
- if not _is_tensor_video_clip(clip):
- raise ValueError("clip should be a 4D torch.tensor")
- h, w = clip.size(-2), clip.size(-1)
- th, tw = crop_size
- if h < th or w < tw:
- raise ValueError("height and width must be no smaller than crop_size")
-
- i = int(round((h - th) / 2.0))
- j = int(round((w - tw) / 2.0))
- return crop(clip, i, j, th, tw)
-
-
-def center_crop_using_short_edge(clip):
- if not _is_tensor_video_clip(clip):
- raise ValueError("clip should be a 4D torch.tensor")
- h, w = clip.size(-2), clip.size(-1)
- if h < w:
- th, tw = h, h
- i = 0
- j = int(round((w - tw) / 2.0))
- else:
- th, tw = w, w
- i = int(round((h - th) / 2.0))
- j = 0
- return crop(clip, i, j, th, tw)
-
-
-def resize_crop_to_fill(clip, target_size):
- if not _is_tensor_video_clip(clip):
- raise ValueError("clip should be a 4D torch.tensor")
- h, w = clip.size(-2), clip.size(-1)
- th, tw = target_size[0], target_size[1]
- rh, rw = th / h, tw / w
- if rh > rw:
- sh, sw = th, round(w * rh)
- clip = resize(clip, (sh, sw), "bilinear")
- i = 0
- j = int(round((sw - tw) / 2.0))
- else:
- sh, sw = round(h * rw), tw
- clip = resize(clip, (sh, sw), "bilinear")
- i = int(round((sh - th) / 2.0))
- j = 0
- assert i + th <= clip.size(-2) and j + tw <= clip.size(-1)
- return crop(clip, i, j, th, tw)
-
-
-def random_shift_crop(clip):
- """
- Slide along the long edge, with the short edge as crop size
- """
- if not _is_tensor_video_clip(clip):
- raise ValueError("clip should be a 4D torch.tensor")
- h, w = clip.size(-2), clip.size(-1)
-
- if h <= w:
- short_edge = h
- else:
- short_edge = w
-
- th, tw = short_edge, short_edge
-
- i = torch.randint(0, h - th + 1, size=(1,)).item()
- j = torch.randint(0, w - tw + 1, size=(1,)).item()
- return crop(clip, i, j, th, tw)
-
-
-def to_tensor(clip):
- """
- Convert tensor data type from uint8 to float and divide values by 255.0.
- The (T, C, H, W) layout of the clip is kept unchanged.
- Args:
- clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
- Return:
- clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
- """
- _is_tensor_video_clip(clip)
- if not clip.dtype == torch.uint8:
- raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
- # return clip.float().permute(3, 0, 1, 2) / 255.0
- return clip.float() / 255.0
-
-
-def normalize(clip, mean, std, inplace=False):
- """
- Args:
- clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
- mean (tuple): pixel RGB mean. Size is (3)
- std (tuple): pixel standard deviation. Size is (3)
- Returns:
- normalized clip (torch.tensor): Size is (T, C, H, W)
- """
- if not _is_tensor_video_clip(clip):
- raise ValueError("clip should be a 4D torch.tensor")
- if not inplace:
- clip = clip.clone()
- mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
- # print(mean)
- std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
- clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
- return clip
-
-
-def hflip(clip):
- """
- Args:
- clip (torch.tensor): Video clip to be flipped. Size is (T, C, H, W)
- Returns:
- flipped clip (torch.tensor): Size is (T, C, H, W)
- """
- if not _is_tensor_video_clip(clip):
- raise ValueError("clip should be a 4D torch.tensor")
- return clip.flip(-1)
-
-
-class ResizeCrop:
- def __init__(self, size):
- if isinstance(size, numbers.Number):
- self.size = (int(size), int(size))
- else:
- self.size = size
-
- def __call__(self, clip):
- clip = resize_crop_to_fill(clip, self.size)
- return clip
-
- def __repr__(self) -> str:
- return f"{self.__class__.__name__}(size={self.size})"
-
-
-class RandomCropVideo:
- def __init__(self, size):
- if isinstance(size, numbers.Number):
- self.size = (int(size), int(size))
- else:
- self.size = size
-
- def __call__(self, clip):
- """
- Args:
- clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
- Returns:
- torch.tensor: randomly cropped video clip.
- size is (T, C, OH, OW)
- """
- i, j, h, w = self.get_params(clip)
- return crop(clip, i, j, h, w)
-
- def get_params(self, clip):
- h, w = clip.shape[-2:]
- th, tw = self.size
-
- if h < th or w < tw:
- raise ValueError(f"Required crop size {(th, tw)} is larger than input image size {(h, w)}")
-
- if w == tw and h == th:
- return 0, 0, h, w
-
- i = torch.randint(0, h - th + 1, size=(1,)).item()
- j = torch.randint(0, w - tw + 1, size=(1,)).item()
-
- return i, j, th, tw
-
- def __repr__(self) -> str:
- return f"{self.__class__.__name__}(size={self.size})"
-
-
-class CenterCropResizeVideo:
- """
- Center-crop the video using the short side as the crop size,
- then resize it to the specified size
- """
-
- def __init__(
- self,
- size,
- interpolation_mode="bilinear",
- ):
- if isinstance(size, tuple):
- if len(size) != 2:
- raise ValueError(f"size should be tuple (height, width), instead got {size}")
- self.size = size
- else:
- self.size = (size, size)
-
- self.interpolation_mode = interpolation_mode
-
- def __call__(self, clip):
- """
- Args:
- clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
- Returns:
- torch.tensor: scale resized / center cropped video clip.
- size is (T, C, crop_size, crop_size)
- """
- clip_center_crop = center_crop_using_short_edge(clip)
- clip_center_crop_resize = resize(
- clip_center_crop, target_size=self.size, interpolation_mode=self.interpolation_mode
- )
- return clip_center_crop_resize
-
- def __repr__(self) -> str:
- return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"
-
-
-class UCFCenterCropVideo:
- """
- First scale the short edge to the specified size while keeping the aspect ratio,
- then center-crop
- """
-
- def __init__(
- self,
- size,
- interpolation_mode="bilinear",
- ):
- if isinstance(size, tuple):
- if len(size) != 2:
- raise ValueError(f"size should be tuple (height, width), instead got {size}")
- self.size = size
- else:
- self.size = (size, size)
-
- self.interpolation_mode = interpolation_mode
-
- def __call__(self, clip):
- """
- Args:
- clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
- Returns:
- torch.tensor: scale resized / center cropped video clip.
- size is (T, C, crop_size, crop_size)
- """
- clip_resize = resize_scale(clip=clip, target_size=self.size, interpolation_mode=self.interpolation_mode)
- clip_center_crop = center_crop(clip_resize, self.size)
- return clip_center_crop
-
- def __repr__(self) -> str:
- return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"
-
-
-class KineticsRandomCropResizeVideo:
- """
- Slide along the long edge, with the short edge as the crop size, then resize to the desired size.
- """
-
- def __init__(
- self,
- size,
- interpolation_mode="bilinear",
- ):
- if isinstance(size, tuple):
- if len(size) != 2:
- raise ValueError(f"size should be tuple (height, width), instead got {size}")
- self.size = size
- else:
- self.size = (size, size)
-
- self.interpolation_mode = interpolation_mode
-
- def __call__(self, clip):
- clip_random_crop = random_shift_crop(clip)
- clip_resize = resize(clip_random_crop, self.size, self.interpolation_mode)
- return clip_resize
-
-
-class CenterCropVideo:
- def __init__(
- self,
- size,
- interpolation_mode="bilinear",
- ):
- if isinstance(size, tuple):
- if len(size) != 2:
- raise ValueError(f"size should be tuple (height, width), instead got {size}")
- self.size = size
- else:
- self.size = (size, size)
-
- self.interpolation_mode = interpolation_mode
-
- def __call__(self, clip):
- """
- Args:
- clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
- Returns:
- torch.tensor: center cropped video clip.
- size is (T, C, crop_size, crop_size)
- """
- clip_center_crop = center_crop(clip, self.size)
- return clip_center_crop
-
- def __repr__(self) -> str:
- return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode})"
-
-
-class NormalizeVideo:
- """
- Normalize the video clip by mean subtraction and division by standard deviation
- Args:
- mean (3-tuple): pixel RGB mean
- std (3-tuple): pixel RGB standard deviation
- inplace (boolean): whether do in-place normalization
- """
-
- def __init__(self, mean, std, inplace=False):
- self.mean = mean
- self.std = std
- self.inplace = inplace
-
- def __call__(self, clip):
- """
- Args:
- clip (torch.tensor): video clip to be normalized. Size is (C, T, H, W)
- """
- return normalize(clip, self.mean, self.std, self.inplace)
-
- def __repr__(self) -> str:
- return f"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})"
-
-
-class ToTensorVideo:
- """
- Convert tensor data type from uint8 to float and divide values by 255.0,
- keeping the (T, C, H, W) layout of the clip unchanged
- """
-
- def __init__(self):
- pass
-
- def __call__(self, clip):
- """
- Args:
- clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
- Return:
- clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
- """
- return to_tensor(clip)
-
- def __repr__(self) -> str:
- return self.__class__.__name__
-
-
-class RandomHorizontalFlipVideo:
- """
- Flip the video clip along the horizontal direction with a given probability
- Args:
- p (float): probability of the clip being flipped. Default value is 0.5
- """
-
- def __init__(self, p=0.5):
- self.p = p
-
- def __call__(self, clip):
- """
- Args:
- clip (torch.tensor): Size is (T, C, H, W)
- Return:
- clip (torch.tensor): Size is (T, C, H, W)
- """
- if random.random() < self.p:
- clip = hflip(clip)
- return clip
-
- def __repr__(self) -> str:
- return f"{self.__class__.__name__}(p={self.p})"
-
-
-# ------------------------------------------------------------
-# --------------------- Sampling ---------------------------
-# ------------------------------------------------------------
-class TemporalRandomCrop(object):
- """Temporally crop the given frame indices at a random location.
-
- Args:
- size (int): Desired number of frames to be sampled for the model.
- """
-
- def __init__(self, size):
- self.size = size
-
- def __call__(self, total_frames):
- rand_end = max(0, total_frames - self.size - 1)
- begin_index = random.randint(0, rand_end)
- end_index = min(begin_index + self.size, total_frames)
- return begin_index, end_index
-
-
-if __name__ == "__main__":
- import os
-
- import numpy as np
- import torchvision.io as io
- from torchvision import transforms
- from torchvision.utils import save_image
-
- vframes, aframes, info = io.read_video(filename="./v_Archery_g01_c03.avi", pts_unit="sec", output_format="TCHW")
-
- trans = transforms.Compose(
- [
- ToTensorVideo(),
- RandomHorizontalFlipVideo(),
- UCFCenterCropVideo(512),
- # NormalizeVideo(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
- transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
- ]
- )
-
- target_video_len = 32
- frame_interval = 1
- total_frames = len(vframes)
- print(total_frames)
-
- temporal_sample = TemporalRandomCrop(target_video_len * frame_interval)
-
- # Sampling video frames
- start_frame_ind, end_frame_ind = temporal_sample(total_frames)
- # print(start_frame_ind)
- # print(end_frame_ind)
- assert end_frame_ind - start_frame_ind >= target_video_len
- frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, target_video_len, dtype=int)
- print(frame_indice)
-
- select_vframes = vframes[frame_indice]
- print(select_vframes.shape)
- print(select_vframes.dtype)
-
- select_vframes_trans = trans(select_vframes)
- print(select_vframes_trans.shape)
- print(select_vframes_trans.dtype)
-
- select_vframes_trans_int = ((select_vframes_trans * 0.5 + 0.5) * 255).to(dtype=torch.uint8)
- print(select_vframes_trans_int.dtype)
- print(select_vframes_trans_int.permute(0, 2, 3, 1).shape)
-
- io.write_video("./test.avi", select_vframes_trans_int.permute(0, 2, 3, 1), fps=8)
-
- for i in range(target_video_len):
- save_image(
- select_vframes_trans[i], os.path.join("./test000", "%04d.png" % i), normalize=True, value_range=(-1, 1)
- )
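
For orientation, a minimal sketch of how the transforms defined above compose on a raw clip. This assumes the OpenSora tree is on the Python path so that opensora.datasets.video_transforms is importable; the tensor sizes are made up purely for illustration.

import torch

from opensora.datasets.video_transforms import ResizeCrop, TemporalRandomCrop, ToTensorVideo

# Fake uint8 clip: 64 frames, 3 channels, 360x640 pixels, layout (T, C, H, W).
clip = torch.randint(0, 256, (64, 3, 360, 640), dtype=torch.uint8)

# Pick a 16-frame window at a random temporal location.
start, end = TemporalRandomCrop(16)(clip.shape[0])
frames = ToTensorVideo()(clip[start:end])      # uint8 -> float32 in [0, 1]
frames = ResizeCrop((256, 256))(frames)        # resize, then crop to fill 256x256
print(frames.shape)                            # torch.Size([16, 3, 256, 256])
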
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/__init__.py
deleted file mode 100644
index 60253499b07d5c9f4e0848d1b76b26fa5d2ea048..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from .dit import *
-from .latte import *
-from .pixart import *
-from .stdit import *
-from .text_encoder import *
-from .vae import *
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/dit/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/dit/__init__.py
deleted file mode 100644
index 94548a363f00ee5bbd7c5b38eaf53d26a4919b11..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/dit/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .dit import DiT, DiT_XL_2, DiT_XL_2x2
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/dit/dit.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/dit/dit.py
deleted file mode 100644
index f264f8e54c1a114a15b20df33b1e45fee37abdb7..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/dit/dit.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# Modified from Meta DiT
-
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# DiT: https://github.com/facebookresearch/DiT/tree/main
-# GLIDE: https://github.com/openai/glide-text2im
-# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
-# --------------------------------------------------------
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.utils.checkpoint
-from einops import rearrange
-from timm.models.vision_transformer import Mlp
-
-from opensora.acceleration.checkpoint import auto_grad_checkpoint
-from opensora.models.layers.blocks import (
- Attention,
- CaptionEmbedder,
- FinalLayer,
- LabelEmbedder,
- PatchEmbed3D,
- TimestepEmbedder,
- approx_gelu,
- get_1d_sincos_pos_embed,
- get_2d_sincos_pos_embed,
- get_layernorm,
- modulate,
-)
-from opensora.registry import MODELS
-from opensora.utils.ckpt_utils import load_checkpoint
-
-
-class DiTBlock(nn.Module):
- """
- A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
- """
-
- def __init__(
- self,
- hidden_size,
- num_heads,
- mlp_ratio=4.0,
- enable_flashattn=False,
- enable_layernorm_kernel=False,
- ):
- super().__init__()
- self.hidden_size = hidden_size
- self.num_heads = num_heads
- self.enable_flashattn = enable_flashattn
- mlp_hidden_dim = int(hidden_size * mlp_ratio)
-
- self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
- self.attn = Attention(
- hidden_size,
- num_heads=num_heads,
- qkv_bias=True,
- enable_flashattn=enable_flashattn,
- )
- self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
- self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
- self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
-
- def forward(self, x, c):
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
- x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1, x, shift_msa, scale_msa))
- x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2, x, shift_mlp, scale_mlp))
- return x
-
-
-@MODELS.register_module()
-class DiT(nn.Module):
- """
- Diffusion model with a Transformer backbone.
- """
-
- def __init__(
- self,
- input_size=(16, 32, 32),
- in_channels=4,
- patch_size=(1, 2, 2),
- hidden_size=1152,
- depth=28,
- num_heads=16,
- mlp_ratio=4.0,
- class_dropout_prob=0.1,
- learn_sigma=True,
- condition="text",
- no_temporal_pos_emb=False,
- caption_channels=512,
- model_max_length=77,
- dtype=torch.float32,
- enable_flashattn=False,
- enable_layernorm_kernel=False,
- enable_sequence_parallelism=False,
- ):
- super().__init__()
- self.learn_sigma = learn_sigma
- self.in_channels = in_channels
- self.out_channels = in_channels * 2 if learn_sigma else in_channels
- self.hidden_size = hidden_size
- self.patch_size = patch_size
- self.input_size = input_size
- num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)])
- self.num_patches = num_patches
- self.num_temporal = input_size[0] // patch_size[0]
- self.num_spatial = num_patches // self.num_temporal
- self.num_heads = num_heads
- self.dtype = dtype
- self.use_text_encoder = not condition.startswith("label")
- if enable_flashattn:
- assert dtype in [
- torch.float16,
- torch.bfloat16,
- ], f"Flash attention only supports float16 and bfloat16, but got {self.dtype}"
- self.no_temporal_pos_emb = no_temporal_pos_emb
- self.mlp_ratio = mlp_ratio
- self.depth = depth
- assert enable_sequence_parallelism is False, "Sequence parallelism is not supported in DiT"
-
- self.register_buffer("pos_embed_spatial", self.get_spatial_pos_embed())
- self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())
-
- self.x_embedder = PatchEmbed3D(patch_size, in_channels, embed_dim=hidden_size)
- if not self.use_text_encoder:
- num_classes = int(condition.split("_")[-1])
- self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
- else:
- self.y_embedder = CaptionEmbedder(
- in_channels=caption_channels,
- hidden_size=hidden_size,
- uncond_prob=class_dropout_prob,
- act_layer=approx_gelu,
- token_num=1, # pooled token
- )
- self.t_embedder = TimestepEmbedder(hidden_size)
- self.blocks = nn.ModuleList(
- [
- DiTBlock(
- hidden_size,
- num_heads,
- mlp_ratio=mlp_ratio,
- enable_flashattn=enable_flashattn,
- enable_layernorm_kernel=enable_layernorm_kernel,
- )
- for _ in range(depth)
- ]
- )
- self.final_layer = FinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels)
-
- self.initialize_weights()
- self.enable_flashattn = enable_flashattn
- self.enable_layernorm_kernel = enable_layernorm_kernel
-
- def get_spatial_pos_embed(self):
- pos_embed = get_2d_sincos_pos_embed(
- self.hidden_size,
- self.input_size[1] // self.patch_size[1],
- )
- pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
- return pos_embed
-
- def get_temporal_pos_embed(self):
- pos_embed = get_1d_sincos_pos_embed(
- self.hidden_size,
- self.input_size[0] // self.patch_size[0],
- )
- pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
- return pos_embed
-
- def unpatchify(self, x):
- c = self.out_channels
- t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
- pt, ph, pw = self.patch_size
-
- x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
- x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
- imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
- return imgs
-
- def forward(self, x, t, y):
- """
- Forward pass of DiT.
- x: (B, C, T, H, W) tensor of inputs
- t: (B,) tensor of diffusion timesteps
- y: list of text
- """
- # origin inputs should be float32, cast to specified dtype
- x = x.to(self.dtype)
- if self.use_text_encoder:
- y = y.to(self.dtype)
-
- # embedding
- x = self.x_embedder(x) # (B, N, D)
- x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
- x = x + self.pos_embed_spatial
- if not self.no_temporal_pos_emb:
- x = rearrange(x, "b t s d -> b s t d")
- x = x + self.pos_embed_temporal
- x = rearrange(x, "b s t d -> b (t s) d")
- else:
- x = rearrange(x, "b t s d -> b (t s) d")
-
- t = self.t_embedder(t, dtype=x.dtype) # (N, D)
- y = self.y_embedder(y, self.training) # (N, D)
- if self.use_text_encoder:
- y = y.squeeze(1).squeeze(1)
- condition = t + y
-
- # blocks
- for _, block in enumerate(self.blocks):
- c = condition
- x = auto_grad_checkpoint(block, x, c) # (B, N, D)
-
- # final process
- x = self.final_layer(x, condition) # (B, N, num_patches * out_channels)
- x = self.unpatchify(x) # (B, out_channels, T, H, W)
-
- # cast to float32 for better accuracy
- x = x.to(torch.float32)
- return x
-
- def initialize_weights(self):
- # Initialize transformer layers:
- def _basic_init(module):
- if isinstance(module, nn.Linear):
- if module.weight.requires_grad:
- torch.nn.init.xavier_uniform_(module.weight)
- if module.bias is not None:
- nn.init.constant_(module.bias, 0)
-
- self.apply(_basic_init)
-
- # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
- w = self.x_embedder.proj.weight.data
- nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
- nn.init.constant_(self.x_embedder.proj.bias, 0)
-
- # Initialize timestep embedding MLP:
- nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
- nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
-
- # Zero-out adaLN modulation layers in DiT blocks:
- for block in self.blocks:
- nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
- nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
-
- # Zero-out output layers:
- nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
- nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
- nn.init.constant_(self.final_layer.linear.weight, 0)
- nn.init.constant_(self.final_layer.linear.bias, 0)
-
- # Zero-out text embedding layers:
- if self.use_text_encoder:
- nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
- nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)
-
-
-@MODELS.register_module("DiT-XL/2")
-def DiT_XL_2(from_pretrained=None, **kwargs):
- model = DiT(
- depth=28,
- hidden_size=1152,
- patch_size=(1, 2, 2),
- num_heads=16,
- **kwargs,
- )
- if from_pretrained is not None:
- load_checkpoint(model, from_pretrained)
- return model
-
-
-@MODELS.register_module("DiT-XL/2x2")
-def DiT_XL_2x2(from_pretrained=None, **kwargs):
- model = DiT(
- depth=28,
- hidden_size=1152,
- patch_size=(2, 2, 2),
- num_heads=16,
- **kwargs,
- )
- if from_pretrained is not None:
- load_checkpoint(model, from_pretrained)
- return model
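
As a quick sanity check of the wrapper above, a minimal sketch of running the label-conditioned DiT on random latents. It assumes the opensora package from this tree is importable (blocks.py pulls in xformers or torch_npu at import time); the tiny hyperparameters are illustrative, not a released configuration.

import torch

from opensora.models.dit.dit import DiT

model = DiT(
    input_size=(4, 16, 16),   # (T, H, W) of the latent video
    in_channels=4,
    patch_size=(1, 2, 2),
    hidden_size=128,
    depth=2,
    num_heads=4,
    condition="label_10",     # 10-way label conditioning, no text encoder
).eval()

x = torch.randn(2, 4, 4, 16, 16)      # (B, C, T, H, W) latents
t = torch.randint(0, 1000, (2,))      # diffusion timesteps
y = torch.randint(0, 10, (2,))        # class labels
with torch.no_grad():
    out = model(x, t, y)
print(out.shape)  # torch.Size([2, 8, 4, 16, 16]): learn_sigma doubles the channels
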
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/latte/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/latte/__init__.py
deleted file mode 100644
index f9d918ad01c676a2c2c0dc25f68aa008101773d3..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/latte/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .latte import Latte, Latte_XL_2, Latte_XL_2x2
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/latte/latte.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/latte/latte.py
deleted file mode 100644
index 3f8f9685e00b72e601f662b49925d82a57f9e253..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/latte/latte.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright 2024 Vchitect/Latte
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Modified from Latte
-#
-#
-# This file is modified from https://github.com/Vchitect/Latte/blob/main/models/latte.py
-#
-# With references to:
-# Latte: https://github.com/Vchitect/Latte
-# DiT: https://github.com/facebookresearch/DiT/tree/main
-
-
-import torch
-from einops import rearrange, repeat
-
-from opensora.acceleration.checkpoint import auto_grad_checkpoint
-from opensora.models.dit import DiT
-from opensora.registry import MODELS
-from opensora.utils.ckpt_utils import load_checkpoint
-
-
-@MODELS.register_module()
-class Latte(DiT):
- def forward(self, x, t, y):
- """
- Forward pass of Latte.
- x: (B, C, T, H, W) tensor of inputs
- t: (B,) tensor of diffusion timesteps
- y: list of text
- """
- # origin inputs should be float32, cast to specified dtype
- x = x.to(self.dtype)
-
- # embedding
- x = self.x_embedder(x) # (B, N, D)
- x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
- x = x + self.pos_embed_spatial
- x = rearrange(x, "b t s d -> b (t s) d")
-
- t = self.t_embedder(t, dtype=x.dtype) # (N, D)
- y = self.y_embedder(y, self.training) # (N, D)
- if self.use_text_encoder:
- y = y.squeeze(1).squeeze(1)
- condition = t + y
- condition_spatial = repeat(condition, "b d -> (b t) d", t=self.num_temporal)
- condition_temporal = repeat(condition, "b d -> (b s) d", s=self.num_spatial)
-
- # blocks
- for i, block in enumerate(self.blocks):
- if i % 2 == 0:
- # spatial
- x = rearrange(x, "b (t s) d -> (b t) s d", t=self.num_temporal, s=self.num_spatial)
- c = condition_spatial
- else:
- # temporal
- x = rearrange(x, "b (t s) d -> (b s) t d", t=self.num_temporal, s=self.num_spatial)
- c = condition_temporal
- if i == 1:
- x = x + self.pos_embed_temporal
-
- x = auto_grad_checkpoint(block, x, c) # (B, N, D)
-
- if i % 2 == 0:
- x = rearrange(x, "(b t) s d -> b (t s) d", t=self.num_temporal, s=self.num_spatial)
- else:
- x = rearrange(x, "(b s) t d -> b (t s) d", t=self.num_temporal, s=self.num_spatial)
-
- # final process
- x = self.final_layer(x, condition) # (B, N, num_patches * out_channels)
- x = self.unpatchify(x) # (B, out_channels, T, H, W)
-
- # cast to float32 for better accuracy
- x = x.to(torch.float32)
- return x
-
-
-@MODELS.register_module("Latte-XL/2")
-def Latte_XL_2(from_pretrained=None, **kwargs):
- model = Latte(
- depth=28,
- hidden_size=1152,
- patch_size=(1, 2, 2),
- num_heads=16,
- **kwargs,
- )
- if from_pretrained is not None:
- load_checkpoint(model, from_pretrained)
- return model
-
-
-@MODELS.register_module("Latte-XL/2x2")
-def Latte_XL_2x2(from_pretrained=None, **kwargs):
- model = Latte(
- depth=28,
- hidden_size=1152,
- patch_size=(2, 2, 2),
- num_heads=16,
- **kwargs,
- )
- if from_pretrained is not None:
- load_checkpoint(model, from_pretrained)
- return model
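
The interleaving above is easiest to see in isolation: even blocks fold time into the batch and attend over spatial tokens, odd blocks fold space into the batch and attend over temporal tokens. A standalone sketch of that regrouping (only torch and einops needed; the sizes are arbitrary):

import torch
from einops import rearrange

B, T, S, D = 2, 4, 64, 128         # batch, temporal tokens, spatial tokens, hidden dim
x = torch.randn(B, T * S, D)       # token sequence laid out as (t s)

# Even block: spatial attention within each frame.
x_spatial = rearrange(x, "b (t s) d -> (b t) s d", t=T, s=S)    # (B*T, S, D)

# Odd block: temporal attention at each spatial location.
x_temporal = rearrange(x, "b (t s) d -> (b s) t d", t=T, s=S)   # (B*S, T, D)

print(x_spatial.shape, x_temporal.shape)
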
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/layers/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/layers/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/layers/blocks.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/layers/blocks.py
deleted file mode 100644
index cca035ac0842776fcce3c84f0776affe77541500..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/layers/blocks.py
+++ /dev/null
@@ -1,814 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# PixArt: https://github.com/PixArt-alpha/PixArt-alpha
-# Latte: https://github.com/Vchitect/Latte
-# DiT: https://github.com/facebookresearch/DiT/tree/main
-# GLIDE: https://github.com/openai/glide-text2im
-# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
-# --------------------------------------------------------
-
-import functools
-import math
-from typing import Optional
-
-import numpy as np
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from einops import rearrange
-from timm.models.vision_transformer import Mlp
-
-from opensora.acceleration.communications import all_to_all, split_forward_gather_backward
-from opensora.acceleration.parallel_states import get_sequence_parallel_group
-from opensora.utils.device_utils import is_npu_available
-if not is_npu_available():
- import xformers.ops
-else:
- import torch_npu
-
-
-approx_gelu = lambda: nn.GELU(approximate="tanh")
-
-
-class LlamaRMSNorm(nn.Module):
- def __init__(self, hidden_size, eps=1e-6):
- """
- LlamaRMSNorm is equivalent to T5LayerNorm
- """
- super().__init__()
- self.weight = nn.Parameter(torch.ones(hidden_size))
- self.variance_epsilon = eps
-
- def forward(self, hidden_states):
- input_dtype = hidden_states.dtype
- hidden_states = hidden_states.to(torch.float32)
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
- return self.weight * hidden_states.to(input_dtype)
-
-
-def get_layernorm(hidden_size: int, eps: float, affine: bool, use_kernel: bool):
- if use_kernel:
- try:
- from apex.normalization import FusedLayerNorm
-
- return FusedLayerNorm(hidden_size, elementwise_affine=affine, eps=eps)
- except ImportError:
- raise RuntimeError("FusedLayerNorm not available. Please install apex.")
- else:
- return nn.LayerNorm(hidden_size, eps, elementwise_affine=affine)
-
-
-def modulate(norm_func, x, shift, scale):
- # Suppose x is (B, N, D), shift is (B, D), scale is (B, D)
- dtype = x.dtype
- x = norm_func(x.to(torch.float32)).to(dtype)
- x = x * (scale.unsqueeze(1) + 1) + shift.unsqueeze(1)
- x = x.to(dtype)
- return x
-
-
-def t2i_modulate(x, shift, scale):
- return x * (1 + scale) + shift
-
-
-# ===============================================
-# General-purpose Layers
-# ===============================================
-
-
-class PatchEmbed3D(nn.Module):
- """Video to Patch Embedding.
-
- Args:
- patch_size (tuple[int]): Patch token size. Default: (2, 4, 4).
- in_chans (int): Number of input video channels. Default: 3.
- embed_dim (int): Number of linear projection output channels. Default: 96.
- norm_layer (nn.Module, optional): Normalization layer. Default: None
- """
-
- def __init__(
- self,
- patch_size=(2, 4, 4),
- in_chans=3,
- embed_dim=96,
- norm_layer=None,
- flatten=True,
- ):
- super().__init__()
- self.patch_size = patch_size
- self.flatten = flatten
-
- self.in_chans = in_chans
- self.embed_dim = embed_dim
-
- self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
- if norm_layer is not None:
- self.norm = norm_layer(embed_dim)
- else:
- self.norm = None
-
- def forward(self, x):
- """Forward function."""
- # padding
- _, _, D, H, W = x.size()
- if W % self.patch_size[2] != 0:
- x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2]))
- if H % self.patch_size[1] != 0:
- x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1]))
- if D % self.patch_size[0] != 0:
- x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]))
-
- x = self.proj(x) # (B C T H W)
- if self.norm is not None:
- D, Wh, Ww = x.size(2), x.size(3), x.size(4)
- x = x.flatten(2).transpose(1, 2)
- x = self.norm(x)
- x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww)
- if self.flatten:
- x = x.flatten(2).transpose(1, 2) # BCTHW -> BNC
- return x
-
-
-class Attention(nn.Module):
- def __init__(
- self,
- dim: int,
- num_heads: int = 8,
- qkv_bias: bool = False,
- qk_norm: bool = False,
- attn_drop: float = 0.0,
- proj_drop: float = 0.0,
- norm_layer: nn.Module = LlamaRMSNorm,
- enable_flashattn: bool = False,
- rope=None,
- ) -> None:
- super().__init__()
- assert dim % num_heads == 0, "dim should be divisible by num_heads"
- self.dim = dim
- self.num_heads = num_heads
- self.head_dim = dim // num_heads
- self.scale = self.head_dim**-0.5
- self.enable_flashattn = enable_flashattn
-
- self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
- self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
- self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
- self.attn_drop = nn.Dropout(attn_drop)
- self.proj = nn.Linear(dim, dim)
- self.proj_drop = nn.Dropout(proj_drop)
-
- self.rope = False
- if rope is not None:
- self.rope = True
- self.rotary_emb = rope
-
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- B, N, C = x.shape
- # flash attn is not memory efficient for small sequences, this is empirical
- enable_flashattn = self.enable_flashattn and (N > B)
- qkv = self.qkv(x)
- qkv_shape = (B, N, 3, self.num_heads, self.head_dim)
-
- qkv = qkv.view(qkv_shape).permute(2, 0, 3, 1, 4)
- q, k, v = qkv.unbind(0)
- # WARNING: this may be a bug
- if self.rope:
- q = self.rotary_emb(q)
- k = self.rotary_emb(k)
- q, k = self.q_norm(q), self.k_norm(k)
-
- if enable_flashattn:
- if is_npu_available() and q.dtype in [torch.float16, torch.bfloat16]:
- x = torch_npu.npu_fusion_attention(
- q, k, v, self.num_heads, input_layout="BNSD",
- pse=None,
- scale=self.scale,
- pre_tockens=65536,
- next_tockens=65536,
- keep_prob=1. - self.attn_drop.p if self.training else 1.,
- sync=False,
- inner_precise=0,
- )[0]
- x = x.transpose(1, 2)
- else:
- from flash_attn import flash_attn_func
-
- # (B, #heads, N, #dim) -> (B, N, #heads, #dim)
- q = q.permute(0, 2, 1, 3)
- k = k.permute(0, 2, 1, 3)
- v = v.permute(0, 2, 1, 3)
- x = flash_attn_func(
- q,
- k,
- v,
- dropout_p=self.attn_drop.p if self.training else 0.0,
- softmax_scale=self.scale,
- )
- else:
- dtype = q.dtype
- q = q * self.scale
- attn = q @ k.transpose(-2, -1) # translate attn to float32
- attn = attn.to(torch.float32)
- attn = attn.softmax(dim=-1)
- attn = attn.to(dtype) # cast back attn to original dtype
- attn = self.attn_drop(attn)
- x = attn @ v
-
- x_output_shape = (B, N, C)
- if not enable_flashattn:
- x = x.transpose(1, 2)
- x = x.reshape(x_output_shape)
- x = self.proj(x)
- x = self.proj_drop(x)
- return x
-
-
-class SeqParallelAttention(Attention):
- def __init__(
- self,
- dim: int,
- num_heads: int = 8,
- qkv_bias: bool = False,
- qk_norm: bool = False,
- attn_drop: float = 0.0,
- proj_drop: float = 0.0,
- norm_layer: nn.Module = LlamaRMSNorm,
- enable_flashattn: bool = False,
- rope=None,
- ) -> None:
- assert rope is None, "Rope is not supported in SeqParallelAttention"
- super().__init__(
- dim=dim,
- num_heads=num_heads,
- qkv_bias=qkv_bias,
- qk_norm=qk_norm,
- attn_drop=attn_drop,
- proj_drop=proj_drop,
- norm_layer=norm_layer,
- enable_flashattn=enable_flashattn,
- )
-
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- B, N, C = x.shape # for sequence parallel here, the N is a local sequence length
- qkv = self.qkv(x)
- qkv_shape = (B, N, 3, self.num_heads, self.head_dim)
-
- qkv = qkv.view(qkv_shape)
-
- sp_group = get_sequence_parallel_group()
-
- # apply all_to_all to gather sequence and split attention heads
- # [B, SUB_N, 3, NUM_HEAD, HEAD_DIM] -> [B, N, 3, NUM_HEAD_PER_DEVICE, HEAD_DIM]
- qkv = all_to_all(qkv, sp_group, scatter_dim=3, gather_dim=1)
-
- if self.enable_flashattn:
- qkv_permute_shape = (
- 2,
- 0,
- 1,
- 3,
- 4,
- ) # [3, B, N, NUM_HEAD_PER_DEVICE, HEAD_DIM]
- else:
- qkv_permute_shape = (
- 2,
- 0,
- 3,
- 1,
- 4,
- ) # [3, B, NUM_HEAD_PER_DEVICE, N, HEAD_DIM]
- qkv = qkv.permute(qkv_permute_shape)
-
- # ERROR: Should qk_norm first
- q, k, v = qkv.unbind(0)
- q, k = self.q_norm(q), self.k_norm(k)
- if self.enable_flashattn:
- if is_npu_available() and q.dtype in [torch.float16, torch.bfloat16]:
- x = torch_npu.npu_fusion_attention(
- q, k, v, self.num_heads, input_layout="BSND",
- pse=None,
- scale=self.scale,
- pre_tockens=65536,
- next_tockens=65536,
- keep_prob=1.-self.attn_drop.p if self.training else 1.,
- sync=False,
- inner_precise=0,
- )[0]
- else:
- from flash_attn import flash_attn_func
-
- x = flash_attn_func(
- q,
- k,
- v,
- dropout_p=self.attn_drop.p if self.training else 0.0,
- softmax_scale=self.scale,
- )
- else:
- dtype = q.dtype
- q = q * self.scale
- attn = q @ k.transpose(-2, -1) # translate attn to float32
- attn = attn.to(torch.float32)
- attn = attn.softmax(dim=-1)
- attn = attn.to(dtype) # cast back attn to original dtype
- attn = self.attn_drop(attn)
- x = attn @ v
-
- if not self.enable_flashattn:
- x = x.transpose(1, 2)
-
- # apply all to all to gather back attention heads and split sequence
- # [B, N, NUM_HEAD_PER_DEVICE, HEAD_DIM] -> [B, SUB_N, NUM_HEAD, HEAD_DIM]
- x = all_to_all(x, sp_group, scatter_dim=1, gather_dim=2)
-
- # reshape outputs back to [B, N, C]
- x_output_shape = (B, N, C)
- x = x.reshape(x_output_shape)
- x = self.proj(x)
- x = self.proj_drop(x)
- return x
-
-
-class MultiHeadCrossAttention(nn.Module):
- def __init__(self, d_model, num_heads, attn_drop=0.0, proj_drop=0.0):
- super(MultiHeadCrossAttention, self).__init__()
- assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
-
- self.d_model = d_model
- self.num_heads = num_heads
- self.head_dim = d_model // num_heads
-
- self.q_linear = nn.Linear(d_model, d_model)
- self.kv_linear = nn.Linear(d_model, d_model * 2)
- self.attn_drop = nn.Dropout(attn_drop)
- self.proj = nn.Linear(d_model, d_model)
- self.proj_drop = nn.Dropout(proj_drop)
-
- def forward(self, x, cond, mask=None):
- # query: image tokens; key/value: condition tokens; mask: padding mask for the condition
- B, N, C = x.shape
-
- if is_npu_available() and x.dtype in [torch.float16, torch.bfloat16]:
- q = self.q_linear(x).view(-1, self.num_heads, self.head_dim)
- kv = self.kv_linear(cond).view(-1, 2, self.num_heads, self.head_dim)
- k, v = kv.unbind(1)
-
- actual_seq_qlen = []
- actual_seq_kvlen = []
- if mask is not None:
- ans = 0
- for _ in range(B):
- ans += N
- actual_seq_qlen.append(ans)
- ans = 0
- for m in mask:
- ans += m
- actual_seq_kvlen.append(ans)
- x = torch_npu.npu_fusion_attention(
- q, k, v, self.num_heads, input_layout="TND",
- pse=None,
- scale=1.0 / math.sqrt(self.head_dim),
- pre_tockens=65536,
- next_tockens=65536,
- actual_seq_qlen=tuple(actual_seq_qlen),
- actual_seq_kvlen=tuple(actual_seq_kvlen),
- keep_prob=1. - self.attn_drop.p,
- sparse_mode=0,
- )[0]
- else:
- q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim)
- kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
- k, v = kv.unbind(2)
-
- attn_bias = None
- if mask is not None:
- attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, mask)
- x = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)
-
- x = x.view(B, -1, C)
- x = self.proj(x)
- x = self.proj_drop(x)
- return x
-
-
-class SeqParallelMultiHeadCrossAttention(MultiHeadCrossAttention):
- def __init__(
- self,
- d_model,
- num_heads,
- attn_drop=0.0,
- proj_drop=0.0,
- ):
- super().__init__(
- d_model=d_model,
- num_heads=num_heads,
- attn_drop=attn_drop,
- proj_drop=proj_drop,
- )
-
- def forward(self, x, cond, mask=None):
- # query: image tokens; key/value: condition tokens; mask: padding mask for the condition
- sp_group = get_sequence_parallel_group()
- sp_size = dist.get_world_size(sp_group)
- B, SUB_N, C = x.shape
- N = SUB_N * sp_size
-
- # shape:
- # q, k, v: [B, SUB_N, NUM_HEADS, HEAD_DIM]
- q = self.q_linear(x).view(B, -1, self.num_heads, self.head_dim)
- kv = self.kv_linear(cond).view(B, -1, 2, self.num_heads, self.head_dim)
- k, v = kv.unbind(2)
-
- # apply all_to_all to gather sequence and split attention heads
- q = all_to_all(q, sp_group, scatter_dim=2, gather_dim=1)
-
- k = split_forward_gather_backward(k, get_sequence_parallel_group(), dim=2, grad_scale="down")
- v = split_forward_gather_backward(v, get_sequence_parallel_group(), dim=2, grad_scale="down")
-
- # compute attention
- if is_npu_available() and q.dtype in [torch.float16, torch.bfloat16]:
- q = q.view(-1, self.num_heads // sp_size, self.head_dim)
- k = k.view(-1, self.num_heads // sp_size, self.head_dim)
- v = v.view(-1, self.num_heads // sp_size, self.head_dim)
-
- actual_seq_qlen = []
- actual_seq_kvlen = []
- if mask is not None:
- ans = 0
- for _ in range(B):
- ans += N
- actual_seq_qlen.append(ans)
- ans = 0
- for m in mask:
- ans += m
- actual_seq_kvlen.append(ans)
- x = torch_npu.npu_fusion_attention(
- q, k, v, self.num_heads, input_layout="TND",
- pse=None,
- scale=1.0 / math.sqrt(self.head_dim),
- pre_tockens=65536,
- next_tockens=65536,
- actual_seq_qlen=tuple(actual_seq_qlen),
- actual_seq_kvlen=tuple(actual_seq_kvlen),
- keep_prob=1. - self.attn_drop.p,
- sparse_mode=0,
- )[0]
- else:
- q = q.view(1, -1, self.num_heads // sp_size, self.head_dim)
- k = k.view(1, -1, self.num_heads // sp_size, self.head_dim)
- v = v.view(1, -1, self.num_heads // sp_size, self.head_dim)
- attn_bias = None
- if mask is not None:
- attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([N] * B, mask)
- x = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias)
-
- # apply all to all to gather back attention heads and scatter sequence
- x = x.view(B, -1, self.num_heads // sp_size, self.head_dim)
- x = all_to_all(x, sp_group, scatter_dim=1, gather_dim=2)
-
- # apply output projection
- x = x.view(B, -1, C)
- x = self.proj(x)
- x = self.proj_drop(x)
- return x
-
-
-class FinalLayer(nn.Module):
- """
- The final layer of DiT.
- """
-
- def __init__(self, hidden_size, num_patch, out_channels):
- super().__init__()
- self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
- self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True)
- self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
-
- def forward(self, x, c):
- shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
- x = modulate(self.norm_final, x, shift, scale)
- x = self.linear(x)
- return x
-
-
-class T2IFinalLayer(nn.Module):
- """
- The final layer of PixArt.
- """
-
- def __init__(self, hidden_size, num_patch, out_channels, d_t=None, d_s=None):
- super().__init__()
- self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
- self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True)
- self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size**0.5)
- self.out_channels = out_channels
- self.d_t = d_t
- self.d_s = d_s
-
- def t_mask_select(self, x_mask, x, masked_x, T, S):
- # x: [B, (T, S), C]
- # masked_x: [B, (T, S), C]
- # x_mask: [B, T]
- x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
- masked_x = rearrange(masked_x, "B (T S) C -> B T S C", T=T, S=S)
- x = torch.where(x_mask[:, :, None, None], x, masked_x)
- x = rearrange(x, "B T S C -> B (T S) C")
- return x
-
- def forward(self, x, t, x_mask=None, t0=None, T=None, S=None):
- if T is None:
- T = self.d_t
- if S is None:
- S = self.d_s
- shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1)
- x = t2i_modulate(self.norm_final(x), shift, scale)
- if x_mask is not None:
- shift_zero, scale_zero = (self.scale_shift_table[None] + t0[:, None]).chunk(2, dim=1)
- x_zero = t2i_modulate(self.norm_final(x), shift_zero, scale_zero)
- x = self.t_mask_select(x_mask, x, x_zero, T, S)
- x = self.linear(x)
- return x
-
-
-# ===============================================
-# Embedding Layers for Timesteps and Class Labels
-# ===============================================
-
-
-class TimestepEmbedder(nn.Module):
- """
- Embeds scalar timesteps into vector representations.
- """
-
- def __init__(self, hidden_size, frequency_embedding_size=256):
- super().__init__()
- self.mlp = nn.Sequential(
- nn.Linear(frequency_embedding_size, hidden_size, bias=True),
- nn.SiLU(),
- nn.Linear(hidden_size, hidden_size, bias=True),
- )
- self.frequency_embedding_size = frequency_embedding_size
-
- @staticmethod
- def timestep_embedding(t, dim, max_period=10000):
- """
- Create sinusoidal timestep embeddings.
- :param t: a 1-D Tensor of N indices, one per batch element.
- These may be fractional.
- :param dim: the dimension of the output.
- :param max_period: controls the minimum frequency of the embeddings.
- :return: an (N, D) Tensor of positional embeddings.
- """
- # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
- half = dim // 2
- freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half)
- freqs = freqs.to(device=t.device)
- args = t[:, None].float() * freqs[None]
- embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
- if dim % 2:
- embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
- return embedding
-
- def forward(self, t, dtype):
- t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
- if t_freq.dtype != dtype:
- t_freq = t_freq.to(dtype)
- t_emb = self.mlp(t_freq)
- return t_emb
-
-
-class LabelEmbedder(nn.Module):
- """
- Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
- """
-
- def __init__(self, num_classes, hidden_size, dropout_prob):
- super().__init__()
- use_cfg_embedding = dropout_prob > 0
- self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
- self.num_classes = num_classes
- self.dropout_prob = dropout_prob
-
- def token_drop(self, labels, force_drop_ids=None):
- """
- Drops labels to enable classifier-free guidance.
- """
- if force_drop_ids is None:
- drop_ids = torch.rand(labels.shape[0]).cuda() < self.dropout_prob
- else:
- drop_ids = force_drop_ids == 1
- labels = torch.where(drop_ids, self.num_classes, labels)
- return labels
-
- def forward(self, labels, train, force_drop_ids=None):
- use_dropout = self.dropout_prob > 0
- if (train and use_dropout) or (force_drop_ids is not None):
- labels = self.token_drop(labels, force_drop_ids)
- return self.embedding_table(labels)
-
-
-class SizeEmbedder(TimestepEmbedder):
- """
- Embeds scalar size values (e.g. height, width, aspect ratio) into vector representations.
- """
-
- def __init__(self, hidden_size, frequency_embedding_size=256):
- super().__init__(hidden_size=hidden_size, frequency_embedding_size=frequency_embedding_size)
- self.mlp = nn.Sequential(
- nn.Linear(frequency_embedding_size, hidden_size, bias=True),
- nn.SiLU(),
- nn.Linear(hidden_size, hidden_size, bias=True),
- )
- self.frequency_embedding_size = frequency_embedding_size
- self.outdim = hidden_size
-
- def forward(self, s, bs):
- if s.ndim == 1:
- s = s[:, None]
- assert s.ndim == 2
- if s.shape[0] != bs:
- s = s.repeat(bs // s.shape[0], 1)
- assert s.shape[0] == bs
- b, dims = s.shape[0], s.shape[1]
- s = rearrange(s, "b d -> (b d)")
- s_freq = self.timestep_embedding(s, self.frequency_embedding_size).to(self.dtype)
- s_emb = self.mlp(s_freq)
- s_emb = rearrange(s_emb, "(b d) d2 -> b (d d2)", b=b, d=dims, d2=self.outdim)
- return s_emb
-
- @property
- def dtype(self):
- return next(self.parameters()).dtype
-
-
-class CaptionEmbedder(nn.Module):
- """
- Embeds caption features into vector representations. Also handles caption dropout for classifier-free guidance.
- """
-
- def __init__(
- self,
- in_channels,
- hidden_size,
- uncond_prob,
- act_layer=nn.GELU(approximate="tanh"),
- token_num=120,
- ):
- super().__init__()
- self.y_proj = Mlp(
- in_features=in_channels,
- hidden_features=hidden_size,
- out_features=hidden_size,
- act_layer=act_layer,
- drop=0,
- )
- self.register_buffer(
- "y_embedding",
- torch.randn(token_num, in_channels) / in_channels**0.5,
- )
- self.uncond_prob = uncond_prob
-
- def token_drop(self, caption, force_drop_ids=None):
- """
- Drops labels to enable classifier-free guidance.
- """
- if force_drop_ids is None:
- drop_ids = torch.rand(caption.shape[0]).cuda() < self.uncond_prob
- else:
- drop_ids = force_drop_ids == 1
- caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
- return caption
-
- def forward(self, caption, train, force_drop_ids=None):
- if train:
- assert caption.shape[2:] == self.y_embedding.shape
- use_dropout = self.uncond_prob > 0
- if (train and use_dropout) or (force_drop_ids is not None):
- caption = self.token_drop(caption, force_drop_ids)
- caption = self.y_proj(caption)
- return caption
-
-
-class PositionEmbedding2D(nn.Module):
- def __init__(self, dim: int) -> None:
- super().__init__()
- self.dim = dim
- assert dim % 4 == 0, "dim must be divisible by 4"
- half_dim = dim // 2
- inv_freq = 1.0 / (10000 ** (torch.arange(0, half_dim, 2).float() / half_dim))
- self.register_buffer("inv_freq", inv_freq, persistent=False)
-
- def _get_sin_cos_emb(self, t: torch.Tensor):
- out = torch.einsum("i,d->id", t, self.inv_freq)
- emb_cos = torch.cos(out)
- emb_sin = torch.sin(out)
- return torch.cat((emb_sin, emb_cos), dim=-1)
-
- @functools.lru_cache(maxsize=512)
- def _get_cached_emb(
- self,
- device: torch.device,
- dtype: torch.dtype,
- h: int,
- w: int,
- scale: float = 1.0,
- base_size: Optional[int] = None,
- ):
- grid_h = torch.arange(h, device=device) / scale
- grid_w = torch.arange(w, device=device) / scale
- if base_size is not None:
- grid_h *= base_size / h
- grid_w *= base_size / w
- grid_h, grid_w = torch.meshgrid(
- grid_w,
- grid_h,
- indexing="ij",
- ) # here w goes first
- grid_h = grid_h.t().reshape(-1)
- grid_w = grid_w.t().reshape(-1)
- emb_h = self._get_sin_cos_emb(grid_h)
- emb_w = self._get_sin_cos_emb(grid_w)
- return torch.concat([emb_h, emb_w], dim=-1).unsqueeze(0).to(dtype)
-
- def forward(
- self,
- x: torch.Tensor,
- h: int,
- w: int,
- scale: Optional[float] = 1.0,
- base_size: Optional[int] = None,
- ) -> torch.Tensor:
- return self._get_cached_emb(x.device, x.dtype, h, w, scale, base_size)
-
-
-# ===============================================
-# Sine/Cosine Positional Embedding Functions
-# ===============================================
-# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
-
-
-def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, scale=1.0, base_size=None):
- """
- grid_size: int of the grid height and width
- return:
- pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
- """
- if not isinstance(grid_size, tuple):
- grid_size = (grid_size, grid_size)
-
- grid_h = np.arange(grid_size[0], dtype=np.float32) / scale
- grid_w = np.arange(grid_size[1], dtype=np.float32) / scale
- if base_size is not None:
- grid_h *= base_size / grid_size[0]
- grid_w *= base_size / grid_size[1]
- grid = np.meshgrid(grid_w, grid_h) # here w goes first
- grid = np.stack(grid, axis=0)
-
- grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
- pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
- if cls_token and extra_tokens > 0:
- pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
- return pos_embed
-
-
-def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
- assert embed_dim % 2 == 0
-
- # use half of dimensions to encode grid_h
- emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
- emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
-
- emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
- return emb
-
-
-def get_1d_sincos_pos_embed(embed_dim, length, scale=1.0):
- pos = np.arange(0, length)[..., None] / scale
- return get_1d_sincos_pos_embed_from_grid(embed_dim, pos)
-
-
-def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
- """
- embed_dim: output dimension for each position
- pos: a list of positions to be encoded: size (M,)
- out: (M, D)
- """
- assert embed_dim % 2 == 0
- omega = np.arange(embed_dim // 2, dtype=np.float64)
- omega /= embed_dim / 2.0
- omega = 1.0 / 10000**omega # (D/2,)
-
- pos = pos.reshape(-1) # (M,)
- out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
-
- emb_sin = np.sin(out) # (M, D/2)
- emb_cos = np.cos(out) # (M, D/2)
-
- emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
- return emb
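
The sincos helpers above are plain NumPy and can be inspected on their own. A small sketch of the spatial and temporal tables as DiT/PixArt register them via register_buffer (assumes the module above is importable, which in turn requires xformers or torch_npu to be installed):

import torch

from opensora.models.layers.blocks import get_1d_sincos_pos_embed, get_2d_sincos_pos_embed

hidden_size, t, h, w = 128, 4, 8, 8   # token grid after patchifying a latent video

spatial = torch.from_numpy(get_2d_sincos_pos_embed(hidden_size, (h, w))).float()
temporal = torch.from_numpy(get_1d_sincos_pos_embed(hidden_size, t)).float()
print(spatial.shape)    # torch.Size([64, 128]): one embedding per spatial token
print(temporal.shape)   # torch.Size([4, 128]): one embedding per frame index
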
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/pixart/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/pixart/__init__.py
deleted file mode 100644
index cf8320211a82bd0a4689b2afa9b600adeee6cfeb..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/pixart/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .pixart import PixArt, PixArt_XL_2
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/pixart/pixart.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/pixart/pixart.py
deleted file mode 100644
index 421f836d4f6be49085b0b26194e3ed5ee543a25f..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/pixart/pixart.py
+++ /dev/null
@@ -1,391 +0,0 @@
-# Adapted from PixArt
-#
-# Copyright (C) 2023 PixArt-alpha/PixArt-alpha
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published
-# by the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# PixArt: https://github.com/PixArt-alpha/PixArt-alpha
-# DiT: https://github.com/facebookresearch/DiT/tree/main
-# --------------------------------------------------------
-
-import numpy as np
-import torch
-import torch.nn as nn
-from einops import rearrange
-from timm.models.layers import DropPath
-from timm.models.vision_transformer import Mlp
-
-# from .builder import MODELS
-from opensora.acceleration.checkpoint import auto_grad_checkpoint
-from opensora.models.layers.blocks import (
- Attention,
- CaptionEmbedder,
- MultiHeadCrossAttention,
- PatchEmbed3D,
- SeqParallelAttention,
- SeqParallelMultiHeadCrossAttention,
- SizeEmbedder,
- T2IFinalLayer,
- TimestepEmbedder,
- approx_gelu,
- get_1d_sincos_pos_embed,
- get_2d_sincos_pos_embed,
- get_layernorm,
- t2i_modulate,
-)
-from opensora.registry import MODELS
-from opensora.utils.ckpt_utils import load_checkpoint
-
-
-class PixArtBlock(nn.Module):
- """
- A PixArt block with adaptive layer norm (adaLN-single) conditioning.
- """
-
- def __init__(
- self,
- hidden_size,
- num_heads,
- mlp_ratio=4.0,
- drop_path=0.0,
- enable_flashattn=False,
- enable_layernorm_kernel=False,
- enable_sequence_parallelism=False,
- ):
- super().__init__()
- self.hidden_size = hidden_size
- self.enable_flashattn = enable_flashattn
- self._enable_sequence_parallelism = enable_sequence_parallelism
-
- if enable_sequence_parallelism:
- self.attn_cls = SeqParallelAttention
- self.mha_cls = SeqParallelMultiHeadCrossAttention
- else:
- self.attn_cls = Attention
- self.mha_cls = MultiHeadCrossAttention
-
- self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
- self.attn = self.attn_cls(
- hidden_size,
- num_heads=num_heads,
- qkv_bias=True,
- enable_flashattn=enable_flashattn,
- )
- self.cross_attn = self.mha_cls(hidden_size, num_heads)
- self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
- self.mlp = Mlp(
- in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
- )
- self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
- self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)
-
- def forward(self, x, y, t, mask=None):
- B, N, C = x.shape
-
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
- self.scale_shift_table[None] + t.reshape(B, 6, -1)
- ).chunk(6, dim=1)
- x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C))
- x = x + self.cross_attn(x, y, mask)
- x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))
-
- return x
-
-
-@MODELS.register_module()
-class PixArt(nn.Module):
- """
- Diffusion model with a Transformer backbone.
- """
-
- def __init__(
- self,
- input_size=(1, 32, 32),
- in_channels=4,
- patch_size=(1, 2, 2),
- hidden_size=1152,
- depth=28,
- num_heads=16,
- mlp_ratio=4.0,
- class_dropout_prob=0.1,
- pred_sigma=True,
- drop_path: float = 0.0,
- no_temporal_pos_emb=False,
- caption_channels=4096,
- model_max_length=120,
- dtype=torch.float32,
- freeze=None,
- space_scale=1.0,
- time_scale=1.0,
- enable_flashattn=False,
- enable_layernorm_kernel=False,
- enable_sequence_parallelism=False,
- ):
- super().__init__()
- assert enable_sequence_parallelism is False, "Sequence parallelism is not supported in this version."
- self.pred_sigma = pred_sigma
- self.in_channels = in_channels
- self.out_channels = in_channels * 2 if pred_sigma else in_channels
- self.hidden_size = hidden_size
- self.patch_size = patch_size
- self.input_size = input_size
- num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)])
- self.num_patches = num_patches
- self.num_temporal = input_size[0] // patch_size[0]
- self.num_spatial = num_patches // self.num_temporal
- self.base_size = int(np.sqrt(self.num_spatial))
- self.num_heads = num_heads
- self.dtype = dtype
- self.no_temporal_pos_emb = no_temporal_pos_emb
- self.depth = depth
- self.mlp_ratio = mlp_ratio
- self.enable_flashattn = enable_flashattn
- self.enable_layernorm_kernel = enable_layernorm_kernel
- self.space_scale = space_scale
- self.time_scale = time_scale
-
- self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size)
- self.t_embedder = TimestepEmbedder(hidden_size)
- self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
- self.y_embedder = CaptionEmbedder(
- in_channels=caption_channels,
- hidden_size=hidden_size,
- uncond_prob=class_dropout_prob,
- act_layer=approx_gelu,
- token_num=model_max_length,
- )
-
- self.register_buffer("pos_embed", self.get_spatial_pos_embed())
- self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())
-
- drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)] # stochastic depth decay rule
- self.blocks = nn.ModuleList(
- [
- PixArtBlock(
- hidden_size,
- num_heads,
- mlp_ratio=mlp_ratio,
- drop_path=drop_path[i],
- enable_flashattn=enable_flashattn,
- enable_layernorm_kernel=enable_layernorm_kernel,
- )
- for i in range(depth)
- ]
- )
- self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels)
-
- self.initialize_weights()
- if freeze is not None:
- assert freeze in ["text"]
- if freeze == "text":
- self.freeze_text()
-
- def forward(self, x, timestep, y, mask=None):
- """
- Forward pass of PixArt.
- x: (N, C, T, H, W) tensor of latent video inputs
- timestep: (N,) tensor of diffusion timesteps
- y: (N, 1, model_max_length, C) tensor of caption embeddings
- mask: (N, model_max_length) mask selecting valid caption tokens
- """
- x = x.to(self.dtype)
- timestep = timestep.to(self.dtype)
- y = y.to(self.dtype)
-
- # embedding
- x = self.x_embedder(x) # (B, N, D)
- x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
- x = x + self.pos_embed
- if not self.no_temporal_pos_emb:
- x = rearrange(x, "b t s d -> b s t d")
- x = x + self.pos_embed_temporal
- x = rearrange(x, "b s t d -> b (t s) d")
- else:
- x = rearrange(x, "b t s d -> b (t s) d")
-
- t = self.t_embedder(timestep, dtype=x.dtype) # (N, D)
- t0 = self.t_block(t)
- y = self.y_embedder(y, self.training) # (N, 1, L, D)
- if mask is not None:
- if mask.shape[0] != y.shape[0]:
- mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
- mask = mask.squeeze(1).squeeze(1)
- y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
- y_lens = mask.sum(dim=1).tolist()
- else:
- y_lens = [y.shape[2]] * y.shape[0]
- y = y.squeeze(1).view(1, -1, x.shape[-1])
-
- # blocks
- for block in self.blocks:
- x = auto_grad_checkpoint(block, x, y, t0, y_lens)
-
- # final process
- x = self.final_layer(x, t) # (N, N_patches, prod(patch_size) * out_channels)
- x = self.unpatchify(x) # (N, out_channels, T, H, W)
-
- # cast to float32 for better accuracy
- x = x.to(torch.float32)
- return x
-
- def unpatchify(self, x):
- c = self.out_channels
- t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
- pt, ph, pw = self.patch_size
-
- x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
- x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
- imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
- return imgs
-
- def get_spatial_pos_embed(self, grid_size=None):
- if grid_size is None:
- grid_size = self.input_size[1:]
- pos_embed = get_2d_sincos_pos_embed(
- self.hidden_size,
- (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]),
- scale=self.space_scale,
- base_size=self.base_size,
- )
- pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
- return pos_embed
-
- def get_temporal_pos_embed(self):
- pos_embed = get_1d_sincos_pos_embed(
- self.hidden_size,
- self.input_size[0] // self.patch_size[0],
- scale=self.time_scale,
- )
- pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
- return pos_embed
-
- def freeze_text(self):
- for n, p in self.named_parameters():
- if "cross_attn" in n:
- p.requires_grad = False
-
- def initialize_weights(self):
- # Initialize transformer layers:
- def _basic_init(module):
- if isinstance(module, nn.Linear):
- torch.nn.init.xavier_uniform_(module.weight)
- if module.bias is not None:
- nn.init.constant_(module.bias, 0)
-
- self.apply(_basic_init)
-
- # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
- w = self.x_embedder.proj.weight.data
- nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
-
- # Initialize timestep embedding MLP:
- nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
- nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
- nn.init.normal_(self.t_block[1].weight, std=0.02)
-
- # Initialize caption embedding MLP:
- nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
- nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)
-
- # Zero-out the output projection of cross-attention in PixArt blocks:
- for block in self.blocks:
- nn.init.constant_(block.cross_attn.proj.weight, 0)
- nn.init.constant_(block.cross_attn.proj.bias, 0)
-
- # Zero-out output layers:
- nn.init.constant_(self.final_layer.linear.weight, 0)
- nn.init.constant_(self.final_layer.linear.bias, 0)
-
-
-@MODELS.register_module()
-class PixArtMS(PixArt):
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
-
- assert self.hidden_size % 3 == 0, "hidden_size must be divisible by 3"
- self.csize_embedder = SizeEmbedder(self.hidden_size // 3)
- self.ar_embedder = SizeEmbedder(self.hidden_size // 3)
-
- def forward(self, x, timestep, y, mask=None, data_info=None):
- """
- Forward pass of PixArt.
- x: (N, C, T, H, W) tensor of latent video inputs
- timestep: (N,) tensor of diffusion timesteps
- y: (N, 1, model_max_length, C) tensor of caption embeddings
- data_info: dict with "hw" and "ar" tensors used for multi-resolution size conditioning
- """
- x = x.to(self.dtype)
- timestep = timestep.to(self.dtype)
- y = y.to(self.dtype)
-
- c_size = data_info["hw"]
- ar = data_info["ar"]
- pos_embed = self.get_spatial_pos_embed((x.shape[-2], x.shape[-1])).to(x.dtype)
-
- # embedding
- x = self.x_embedder(x) # (B, N, D)
- x = rearrange(x, "b (t s) d -> b t s d", t=self.num_temporal, s=self.num_spatial)
- x = x + pos_embed.to(x.device)
- if not self.no_temporal_pos_emb:
- x = rearrange(x, "b t s d -> b s t d")
- x = x + self.pos_embed_temporal
- x = rearrange(x, "b s t d -> b (t s) d")
- else:
- x = rearrange(x, "b t s d -> b (t s) d")
-
- t = self.t_embedder(timestep, dtype=x.dtype) # (N, D)
- B = x.shape[0]
- csize = self.csize_embedder(c_size, B)
- ar = self.ar_embedder(ar, B)
- t = t + torch.cat([csize, ar], dim=1)
-
- t0 = self.t_block(t)
- y = self.y_embedder(y, self.training) # (N, 1, L, D)
- if mask is not None:
- if mask.shape[0] != y.shape[0]:
- mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
- mask = mask.squeeze(1).squeeze(1)
- y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
- y_lens = mask.sum(dim=1).tolist()
- else:
- y_lens = [y.shape[2]] * y.shape[0]
- y = y.squeeze(1).view(1, -1, x.shape[-1])
-
- # blocks
- for block in self.blocks:
- x = block(x, y, t0, y_lens)
-
- # final process
- x = self.final_layer(x, t) # (N, N_patches, prod(patch_size) * out_channels)
- x = self.unpatchify(x) # (N, out_channels, T, H, W)
-
- # cast to float32 for better accuracy
- x = x.to(torch.float32)
- return x
-
-
-@MODELS.register_module("PixArt-XL/2")
-def PixArt_XL_2(from_pretrained=None, **kwargs):
- model = PixArt(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
- if from_pretrained is not None:
- load_checkpoint(model, from_pretrained)
- return model
-
-
-@MODELS.register_module("PixArtMS-XL/2")
-def PixArtMS_XL_2(from_pretrained=None, **kwargs):
- model = PixArtMS(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
- if from_pretrained is not None:
- load_checkpoint(model, from_pretrained)
- return model
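For reference, a minimal usage sketch of the factory deleted above (not part of the original file; it assumes the now-removed opensora package is importable and runs with randomly initialized weights):

```python
# Hypothetical sketch: build the registered PixArt-XL/2 model through the MODELS
# registry and run one forward pass on dummy latents. Shapes follow the class
# defaults above (input_size=(1, 32, 32), in_channels=4, caption_channels=4096,
# model_max_length=120); nothing here is pretrained.
import torch

from opensora.registry import MODELS, build_module

model = build_module(dict(type="PixArt-XL/2"), MODELS).eval()

x = torch.randn(2, 4, 1, 32, 32)             # (N, C, T, H, W) latent input
timestep = torch.randint(0, 1000, (2,))      # (N,) diffusion timesteps
y = torch.randn(2, 1, 120, 4096)             # (N, 1, model_max_length, C) caption embeddings
mask = torch.ones(2, 120, dtype=torch.long)  # valid-token mask

with torch.no_grad():
    out = model(x, timestep, y, mask=mask)

print(out.shape)  # torch.Size([2, 8, 1, 32, 32]) -- 2*C channels because pred_sigma=True
```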
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/stdit/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/stdit/__init__.py
deleted file mode 100644
index 605159ebd16d8d368a0903765148c23a9997a6b2..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/stdit/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .stdit import STDiT
-from .stdit2 import STDiT2
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/stdit/stdit.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/stdit/stdit.py
deleted file mode 100644
index 6e1605864bb78686d0178b49f2e66d77cde0d42c..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/stdit/stdit.py
+++ /dev/null
@@ -1,438 +0,0 @@
-import numpy as np
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-from einops import rearrange
-from timm.models.layers import DropPath
-from timm.models.vision_transformer import Mlp
-
-from opensora.acceleration.checkpoint import auto_grad_checkpoint
-from opensora.acceleration.communications import gather_forward_split_backward, split_forward_gather_backward
-from opensora.acceleration.parallel_states import get_sequence_parallel_group
-from opensora.models.layers.blocks import (
- Attention,
- CaptionEmbedder,
- MultiHeadCrossAttention,
- PatchEmbed3D,
- SeqParallelAttention,
- SeqParallelMultiHeadCrossAttention,
- T2IFinalLayer,
- TimestepEmbedder,
- approx_gelu,
- get_1d_sincos_pos_embed,
- get_2d_sincos_pos_embed,
- get_layernorm,
- t2i_modulate,
-)
-from opensora.registry import MODELS
-from opensora.utils.ckpt_utils import load_checkpoint
-
-
-class STDiTBlock(nn.Module):
- def __init__(
- self,
- hidden_size,
- num_heads,
- d_s=None,
- d_t=None,
- mlp_ratio=4.0,
- drop_path=0.0,
- enable_flashattn=False,
- enable_layernorm_kernel=False,
- enable_sequence_parallelism=False,
- ):
- super().__init__()
- self.hidden_size = hidden_size
- self.enable_flashattn = enable_flashattn
- self._enable_sequence_parallelism = enable_sequence_parallelism
-
- if enable_sequence_parallelism:
- self.attn_cls = SeqParallelAttention
- self.mha_cls = SeqParallelMultiHeadCrossAttention
- else:
- self.attn_cls = Attention
- self.mha_cls = MultiHeadCrossAttention
-
- self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
- self.attn = self.attn_cls(
- hidden_size,
- num_heads=num_heads,
- qkv_bias=True,
- enable_flashattn=enable_flashattn,
- )
- self.cross_attn = self.mha_cls(hidden_size, num_heads)
- self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
- self.mlp = Mlp(
- in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
- )
- self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
- self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)
-
- # temporal attention
- self.d_s = d_s
- self.d_t = d_t
-
- if self._enable_sequence_parallelism:
- sp_size = dist.get_world_size(get_sequence_parallel_group())
- # make sure d_t is divisible by sp_size
- assert d_t % sp_size == 0
- self.d_t = d_t // sp_size
-
- self.attn_temp = self.attn_cls(
- hidden_size,
- num_heads=num_heads,
- qkv_bias=True,
- enable_flashattn=self.enable_flashattn,
- )
-
- def t_mask_select(self, x, masked_x, x_mask):
- # x: [B, (T, S), C]
- # masked_x: [B, (T, S), C]
- # x_mask: [B, T]
- x = rearrange(x, "B (T S) C -> B T S C", T=self.d_t, S=self.d_s)
- masked_x = rearrange(masked_x, "B (T S) C -> B T S C", T=self.d_t, S=self.d_s)
- x = torch.where(x_mask[:, :, None, None], x, masked_x)
- x = rearrange(x, "B T S C -> B (T S) C")
- return x
-
- def forward(self, x, y, t, mask=None, tpe=None, x_mask=None, t0=None):
- B, N, C = x.shape
-
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
- self.scale_shift_table[None] + t.reshape(B, 6, -1)
- ).chunk(6, dim=1)
- x_m = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
- if x_mask is not None:
- shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = (
- self.scale_shift_table[None] + t0.reshape(B, 6, -1)
- ).chunk(6, dim=1)
- x_m_zero = t2i_modulate(self.norm1(x), shift_msa_zero, scale_msa_zero)
- x_m = self.t_mask_select(x_m, x_m_zero, x_mask)
-
- # spatial branch
- x_s = rearrange(x_m, "B (T S) C -> (B T) S C", T=self.d_t, S=self.d_s)
- x_s = self.attn(x_s)
- x_s = rearrange(x_s, "(B T) S C -> B (T S) C", T=self.d_t, S=self.d_s)
-
- if x_mask is not None:
- x_s_zero = gate_msa_zero * x_s
- x_s = gate_msa * x_s
- x_s = self.t_mask_select(x_s, x_s_zero, x_mask)
- else:
- x_s = gate_msa * x_s
-
- x = x + self.drop_path(x_s)
-
- # temporal branch
- x_t = rearrange(x, "B (T S) C -> (B S) T C", T=self.d_t, S=self.d_s)
- if tpe is not None:
- x_t = x_t + tpe
- x_t = self.attn_temp(x_t)
- x_t = rearrange(x_t, "(B S) T C -> B (T S) C", T=self.d_t, S=self.d_s)
- x = x + self.drop_path(gate_msa * x_t)
-
- # cross attn
- x = x + self.cross_attn(x, y, mask)
-
- # mlp
- x_m = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
- if x_mask is not None:
- x_m_zero = t2i_modulate(self.norm2(x), shift_mlp_zero, scale_mlp_zero)
- x_m = self.t_mask_select(x_m, x_m_zero, x_mask)
-
- x_mlp = self.mlp(x_m)
- if x_mask is not None:
- x_mlp_zero = gate_mlp_zero * x_mlp
- x_mlp = gate_mlp * x_mlp
- x_mlp = self.t_mask_select(x_mlp, x_mlp_zero, x_mask)
- else:
- x_mlp = gate_mlp * x_mlp
-
- x = x + self.drop_path(x_mlp)
-
- return x
-
-
-@MODELS.register_module()
-class STDiT(nn.Module):
- def __init__(
- self,
- input_size=(1, 32, 32),
- in_channels=4,
- patch_size=(1, 2, 2),
- hidden_size=1152,
- depth=28,
- num_heads=16,
- mlp_ratio=4.0,
- class_dropout_prob=0.1,
- pred_sigma=True,
- drop_path=0.0,
- no_temporal_pos_emb=False,
- caption_channels=4096,
- model_max_length=120,
- dtype=torch.float32,
- space_scale=1.0,
- time_scale=1.0,
- freeze=None,
- enable_flashattn=False,
- enable_layernorm_kernel=False,
- enable_sequence_parallelism=False,
- ):
- super().__init__()
- self.pred_sigma = pred_sigma
- self.in_channels = in_channels
- self.out_channels = in_channels * 2 if pred_sigma else in_channels
- self.hidden_size = hidden_size
- self.patch_size = patch_size
- self.input_size = input_size
- num_patches = np.prod([input_size[i] // patch_size[i] for i in range(3)])
- self.num_patches = num_patches
- self.num_temporal = input_size[0] // patch_size[0]
- self.num_spatial = num_patches // self.num_temporal
- self.num_heads = num_heads
- self.dtype = dtype
- self.no_temporal_pos_emb = no_temporal_pos_emb
- self.depth = depth
- self.mlp_ratio = mlp_ratio
- self.enable_flashattn = enable_flashattn
- self.enable_layernorm_kernel = enable_layernorm_kernel
- self.space_scale = space_scale
- self.time_scale = time_scale
-
- self.register_buffer("pos_embed", self.get_spatial_pos_embed())
- self.register_buffer("pos_embed_temporal", self.get_temporal_pos_embed())
-
- self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size)
- self.t_embedder = TimestepEmbedder(hidden_size)
- self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
- self.y_embedder = CaptionEmbedder(
- in_channels=caption_channels,
- hidden_size=hidden_size,
- uncond_prob=class_dropout_prob,
- act_layer=approx_gelu,
- token_num=model_max_length,
- )
-
- drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]
- self.blocks = nn.ModuleList(
- [
- STDiTBlock(
- self.hidden_size,
- self.num_heads,
- mlp_ratio=self.mlp_ratio,
- drop_path=drop_path[i],
- enable_flashattn=self.enable_flashattn,
- enable_layernorm_kernel=self.enable_layernorm_kernel,
- enable_sequence_parallelism=enable_sequence_parallelism,
- d_t=self.num_temporal,
- d_s=self.num_spatial,
- )
- for i in range(self.depth)
- ]
- )
- self.final_layer = T2IFinalLayer(
- hidden_size,
- np.prod(self.patch_size),
- self.out_channels,
- d_t=self.num_temporal,
- d_s=self.num_spatial,
- )
-
- # init model
- self.initialize_weights()
- self.initialize_temporal()
- if freeze is not None:
- assert freeze in ["not_temporal", "text"]
- if freeze == "not_temporal":
- self.freeze_not_temporal()
- elif freeze == "text":
- self.freeze_text()
-
- # sequence parallel related configs
- self.enable_sequence_parallelism = enable_sequence_parallelism
- if enable_sequence_parallelism:
- self.sp_rank = dist.get_rank(get_sequence_parallel_group())
- else:
- self.sp_rank = None
-
- def forward(self, x, timestep, y, mask=None, x_mask=None):
- """
- Forward pass of STDiT.
- Args:
- x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W]
- timestep (torch.Tensor): diffusion time steps; of shape [B]
- y (torch.Tensor): representation of prompts; of shape [B, 1, N_token, C]
- mask (torch.Tensor): mask for selecting prompt tokens; of shape [B, N_token]
- x_mask (torch.Tensor, optional): mask marking valid frames for masked training; of shape [B, T]
-
- Returns:
- x (torch.Tensor): output latent representation; of shape [B, C, T, H, W]
- """
-
- x = x.to(self.dtype)
- timestep = timestep.to(self.dtype)
- y = y.to(self.dtype)
-
- # embedding
- x = self.x_embedder(x) # [B, N, C]
- x = rearrange(x, "B (T S) C -> B T S C", T=self.num_temporal, S=self.num_spatial)
- x = x + self.pos_embed
- x = rearrange(x, "B T S C -> B (T S) C")
-
- # shard over the sequence dim if sp is enabled
- if self.enable_sequence_parallelism:
- x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="down")
-
- t = self.t_embedder(timestep, dtype=x.dtype) # [B, C]
- t_mlp = self.t_block(t) # [B, C]
- if x_mask is not None:
- t0_timestep = torch.zeros_like(timestep)
- t0 = self.t_embedder(t0_timestep, dtype=x.dtype)
- t0_mlp = self.t_block(t0)
- else:
- t0 = None
- t0_mlp = None
- y = self.y_embedder(y, self.training) # [B, 1, N_token, C]
-
- if mask is not None:
- if mask.shape[0] != y.shape[0]:
- mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
- mask = mask.squeeze(1).squeeze(1)
- y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
- y_lens = mask.sum(dim=1).tolist()
- else:
- y_lens = [y.shape[2]] * y.shape[0]
- y = y.squeeze(1).view(1, -1, x.shape[-1])
-
- # blocks
- for i, block in enumerate(self.blocks):
- if i == 0:
- if self.enable_sequence_parallelism:
- tpe = torch.chunk(
- self.pos_embed_temporal, dist.get_world_size(get_sequence_parallel_group()), dim=1
- )[self.sp_rank].contiguous()
- else:
- tpe = self.pos_embed_temporal
- else:
- tpe = None
- x = auto_grad_checkpoint(block, x, y, t_mlp, y_lens, tpe, x_mask, t0_mlp)
-
- if self.enable_sequence_parallelism:
- x = gather_forward_split_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="up")
- # x.shape: [B, N, C]
-
- # final process
- x = self.final_layer(x, t, x_mask, t0) # [B, N, C=T_p * H_p * W_p * C_out]
- x = self.unpatchify(x) # [B, C_out, T, H, W]
-
- # cast to float32 for better accuracy
- x = x.to(torch.float32)
- return x
-
- def unpatchify(self, x):
- """
- Args:
- x (torch.Tensor): of shape [B, N, C]
-
- Return:
- x (torch.Tensor): of shape [B, C_out, T, H, W]
- """
-
- N_t, N_h, N_w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
- T_p, H_p, W_p = self.patch_size
- x = rearrange(
- x,
- "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)",
- N_t=N_t,
- N_h=N_h,
- N_w=N_w,
- T_p=T_p,
- H_p=H_p,
- W_p=W_p,
- C_out=self.out_channels,
- )
- return x
-
- def unpatchify_old(self, x):
- c = self.out_channels
- t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
- pt, ph, pw = self.patch_size
-
- x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
- x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
- imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
- return imgs
-
- def get_spatial_pos_embed(self, grid_size=None):
- if grid_size is None:
- grid_size = self.input_size[1:]
- pos_embed = get_2d_sincos_pos_embed(
- self.hidden_size,
- (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]),
- scale=self.space_scale,
- )
- pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
- return pos_embed
-
- def get_temporal_pos_embed(self):
- pos_embed = get_1d_sincos_pos_embed(
- self.hidden_size,
- self.input_size[0] // self.patch_size[0],
- scale=self.time_scale,
- )
- pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
- return pos_embed
-
- def freeze_not_temporal(self):
- for n, p in self.named_parameters():
- if "attn_temp" not in n:
- p.requires_grad = False
-
- def freeze_text(self):
- for n, p in self.named_parameters():
- if "cross_attn" in n:
- p.requires_grad = False
-
- def initialize_temporal(self):
- for block in self.blocks:
- nn.init.constant_(block.attn_temp.proj.weight, 0)
- nn.init.constant_(block.attn_temp.proj.bias, 0)
-
- def initialize_weights(self):
- # Initialize transformer layers:
- def _basic_init(module):
- if isinstance(module, nn.Linear):
- torch.nn.init.xavier_uniform_(module.weight)
- if module.bias is not None:
- nn.init.constant_(module.bias, 0)
-
- self.apply(_basic_init)
-
- # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
- w = self.x_embedder.proj.weight.data
- nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
-
- # Initialize timestep embedding MLP:
- nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
- nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
- nn.init.normal_(self.t_block[1].weight, std=0.02)
-
- # Initialize caption embedding MLP:
- nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
- nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)
-
- # Zero-out the output projection of cross-attention in STDiT blocks:
- for block in self.blocks:
- nn.init.constant_(block.cross_attn.proj.weight, 0)
- nn.init.constant_(block.cross_attn.proj.bias, 0)
-
- # Zero-out output layers:
- nn.init.constant_(self.final_layer.linear.weight, 0)
- nn.init.constant_(self.final_layer.linear.bias, 0)
-
-
-@MODELS.register_module("STDiT-XL/2")
-def STDiT_XL_2(from_pretrained=None, **kwargs):
- model = STDiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
- if from_pretrained is not None:
- load_checkpoint(model, from_pretrained)
- return model
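As with PixArt above, a minimal sketch of driving the deleted STDiT through its registered factory (assumes the opensora package is importable; weights are random):

```python
# Hypothetical sketch: STDiT factors attention into a spatial branch over S = H*W
# tokens and a temporal branch over T frames, so input_size must be fixed at build
# time. Shapes below follow the forward() docstring.
import torch

from opensora.registry import MODELS, build_module

model = build_module(dict(type="STDiT-XL/2", input_size=(16, 32, 32)), MODELS).eval()

x = torch.randn(1, 4, 16, 32, 32)            # [B, C, T, H, W] video latent
timestep = torch.randint(0, 1000, (1,))      # [B]
y = torch.randn(1, 1, 120, 4096)             # [B, 1, N_token, C_caption]
mask = torch.ones(1, 120, dtype=torch.long)  # prompt-token mask

with torch.no_grad():
    out = model(x, timestep, y, mask=mask)   # [B, 2*C, T, H, W] since pred_sigma=True
```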
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/stdit/stdit2.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/stdit/stdit2.py
deleted file mode 100644
index 7db306f61e05ed79401ad161015854270e49b48d..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/stdit/stdit2.py
+++ /dev/null
@@ -1,512 +0,0 @@
-import numpy as np
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-from einops import rearrange
-from rotary_embedding_torch import RotaryEmbedding
-from timm.models.layers import DropPath
-from timm.models.vision_transformer import Mlp
-
-from opensora.acceleration.checkpoint import auto_grad_checkpoint
-from opensora.acceleration.communications import gather_forward_split_backward, split_forward_gather_backward
-from opensora.acceleration.parallel_states import get_sequence_parallel_group
-from opensora.models.layers.blocks import (
- Attention,
- CaptionEmbedder,
- MultiHeadCrossAttention,
- PatchEmbed3D,
- PositionEmbedding2D,
- SeqParallelAttention,
- SeqParallelMultiHeadCrossAttention,
- SizeEmbedder,
- T2IFinalLayer,
- TimestepEmbedder,
- approx_gelu,
- get_2d_sincos_pos_embed,
- get_layernorm,
- t2i_modulate,
-)
-from opensora.registry import MODELS
-from opensora.utils.ckpt_utils import load_checkpoint
-from opensora.utils.device_utils import is_npu_available
-from opensora.utils.train_utils import NpuRotaryEmbedding
-
-
-class STDiT2Block(nn.Module):
- def __init__(
- self,
- hidden_size,
- num_heads,
- mlp_ratio=4.0,
- drop_path=0.0,
- enable_flashattn=False,
- enable_layernorm_kernel=False,
- enable_sequence_parallelism=False,
- rope=None,
- qk_norm=False,
- ):
- super().__init__()
- self.hidden_size = hidden_size
- self.enable_flashattn = enable_flashattn
- self._enable_sequence_parallelism = enable_sequence_parallelism
-
- assert not self._enable_sequence_parallelism, "Sequence parallelism is not supported."
- if enable_sequence_parallelism:
- self.attn_cls = SeqParallelAttention
- self.mha_cls = SeqParallelMultiHeadCrossAttention
- else:
- self.attn_cls = Attention
- self.mha_cls = MultiHeadCrossAttention
-
- # spatial branch
- self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
- self.attn = self.attn_cls(
- hidden_size,
- num_heads=num_heads,
- qkv_bias=True,
- enable_flashattn=enable_flashattn,
- qk_norm=qk_norm,
- )
- self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5)
-
- # cross attn
- self.cross_attn = self.mha_cls(hidden_size, num_heads)
-
- # mlp branch
- self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
- self.mlp = Mlp(
- in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0
- )
- self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
-
- # temporal branch
- self.norm_temp = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) # new
- self.attn_temp = self.attn_cls(
- hidden_size,
- num_heads=num_heads,
- qkv_bias=True,
- enable_flashattn=self.enable_flashattn,
- rope=rope,
- qk_norm=qk_norm,
- )
- self.scale_shift_table_temporal = nn.Parameter(torch.randn(3, hidden_size) / hidden_size**0.5) # new
-
- def t_mask_select(self, x_mask, x, masked_x, T, S):
- # x: [B, (T, S), C]
- # masked_x: [B, (T, S), C]
- # x_mask: [B, T]
- x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
- masked_x = rearrange(masked_x, "B (T S) C -> B T S C", T=T, S=S)
- x = torch.where(torch.tile(x_mask[:, :, None, None], (1, 1, x.shape[-2], x.shape[-1])), x, masked_x)
- x = rearrange(x, "B T S C -> B (T S) C")
- return x
-
- def forward(self, x, y, t, t_tmp, mask=None, x_mask=None, t0=None, t0_tmp=None, T=None, S=None):
- B, N, C = x.shape
-
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
- self.scale_shift_table[None] + t.reshape(B, 6, -1)
- ).chunk(6, dim=1)
- shift_tmp, scale_tmp, gate_tmp = (self.scale_shift_table_temporal[None] + t_tmp.reshape(B, 3, -1)).chunk(
- 3, dim=1
- )
- if x_mask is not None:
- shift_msa_zero, scale_msa_zero, gate_msa_zero, shift_mlp_zero, scale_mlp_zero, gate_mlp_zero = (
- self.scale_shift_table[None] + t0.reshape(B, 6, -1)
- ).chunk(6, dim=1)
- shift_tmp_zero, scale_tmp_zero, gate_tmp_zero = (
- self.scale_shift_table_temporal[None] + t0_tmp.reshape(B, 3, -1)
- ).chunk(3, dim=1)
-
- # modulate
- x_norm1 = self.norm1(x)
- x_m = t2i_modulate(x_norm1, shift_msa, scale_msa)
- if x_mask is not None:
- x_m_zero = t2i_modulate(x_norm1, shift_msa_zero, scale_msa_zero)
- x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)
-
- # spatial branch
- x_s = rearrange(x_m, "B (T S) C -> (B T) S C", T=T, S=S)
- x_s = self.attn(x_s)
- x_s = rearrange(x_s, "(B T) S C -> B (T S) C", T=T, S=S)
- if x_mask is not None:
- x_s_zero = gate_msa_zero * x_s
- x_s = gate_msa * x_s
- x_s = self.t_mask_select(x_mask, x_s, x_s_zero, T, S)
- else:
- x_s = gate_msa * x_s
- x = x + self.drop_path(x_s)
-
- # modulate
- x_norm_temp = self.norm_temp(x)
- x_m = t2i_modulate(x_norm_temp, shift_tmp, scale_tmp)
- if x_mask is not None:
- x_m_zero = t2i_modulate(x_norm_temp, shift_tmp_zero, scale_tmp_zero)
- x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)
-
- # temporal branch
- x_t = rearrange(x_m, "B (T S) C -> (B S) T C", T=T, S=S)
- x_t = self.attn_temp(x_t)
- x_t = rearrange(x_t, "(B S) T C -> B (T S) C", T=T, S=S)
- if x_mask is not None:
- x_t_zero = gate_tmp_zero * x_t
- x_t = gate_tmp * x_t
- x_t = self.t_mask_select(x_mask, x_t, x_t_zero, T, S)
- else:
- x_t = gate_tmp * x_t
- x = x + self.drop_path(x_t)
-
- # cross attn
- x = x + self.cross_attn(x, y, mask)
-
- # modulate
- x_norm2 = self.norm2(x)
- x_m = t2i_modulate(x_norm2, shift_mlp, scale_mlp)
- if x_mask is not None:
- x_m_zero = t2i_modulate(x_norm2, shift_mlp_zero, scale_mlp_zero)
- x_m = self.t_mask_select(x_mask, x_m, x_m_zero, T, S)
-
- # mlp
- x_mlp = self.mlp(x_m)
- if x_mask is not None:
- x_mlp_zero = gate_mlp_zero * x_mlp
- x_mlp = gate_mlp * x_mlp
- x_mlp = self.t_mask_select(x_mask, x_mlp, x_mlp_zero, T, S)
- else:
- x_mlp = gate_mlp * x_mlp
- x = x + self.drop_path(x_mlp)
-
- return x
-
-
-@MODELS.register_module()
-class STDiT2(nn.Module):
- def __init__(
- self,
- input_size=(None, None, None),
- input_sq_size=32,
- in_channels=4,
- patch_size=(1, 2, 2),
- hidden_size=1152,
- depth=28,
- num_heads=16,
- mlp_ratio=4.0,
- class_dropout_prob=0.1,
- pred_sigma=True,
- drop_path=0.0,
- no_temporal_pos_emb=False,
- caption_channels=4096,
- model_max_length=120,
- dtype=torch.float32,
- freeze=None,
- qk_norm=False,
- enable_flashattn=False,
- enable_layernorm_kernel=False,
- enable_sequence_parallelism=False,
- ):
- super().__init__()
- self.pred_sigma = pred_sigma
- self.in_channels = in_channels
- self.out_channels = in_channels * 2 if pred_sigma else in_channels
- self.hidden_size = hidden_size
- self.num_heads = num_heads
- self.dtype = dtype
- self.no_temporal_pos_emb = no_temporal_pos_emb
- self.depth = depth
- self.mlp_ratio = mlp_ratio
- self.enable_flashattn = enable_flashattn
- self.enable_layernorm_kernel = enable_layernorm_kernel
-
- # support dynamic input
- self.patch_size = patch_size
- self.input_size = input_size
- self.input_sq_size = input_sq_size
- self.pos_embed = PositionEmbedding2D(hidden_size)
-
- self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size)
- self.t_embedder = TimestepEmbedder(hidden_size)
- self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
- self.t_block_temp = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 3 * hidden_size, bias=True)) # new
- self.y_embedder = CaptionEmbedder(
- in_channels=caption_channels,
- hidden_size=hidden_size,
- uncond_prob=class_dropout_prob,
- act_layer=approx_gelu,
- token_num=model_max_length,
- )
-
- drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]
- if is_npu_available():
- self.rope = NpuRotaryEmbedding(dim=self.hidden_size // self.num_heads)
- else:
- self.rope = RotaryEmbedding(dim=self.hidden_size // self.num_heads) # new
- self.blocks = nn.ModuleList(
- [
- STDiT2Block(
- self.hidden_size,
- self.num_heads,
- mlp_ratio=self.mlp_ratio,
- drop_path=drop_path[i],
- enable_flashattn=self.enable_flashattn,
- enable_layernorm_kernel=self.enable_layernorm_kernel,
- enable_sequence_parallelism=enable_sequence_parallelism,
- rope=self.rope.rotate_queries_or_keys,
- qk_norm=qk_norm,
- )
- for i in range(self.depth)
- ]
- )
- self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels)
-
- # multi_res
- assert self.hidden_size % 3 == 0, "hidden_size must be divisible by 3"
- self.csize_embedder = SizeEmbedder(self.hidden_size // 3)
- self.ar_embedder = SizeEmbedder(self.hidden_size // 3)
- self.fl_embedder = SizeEmbedder(self.hidden_size) # new
- self.fps_embedder = SizeEmbedder(self.hidden_size) # new
-
- # init model
- self.initialize_weights()
- self.initialize_temporal()
- if freeze is not None:
- assert freeze in ["not_temporal", "text"]
- if freeze == "not_temporal":
- self.freeze_not_temporal()
- elif freeze == "text":
- self.freeze_text()
-
- # sequence parallel related configs
- self.enable_sequence_parallelism = enable_sequence_parallelism
- if enable_sequence_parallelism:
- self.sp_rank = dist.get_rank(get_sequence_parallel_group())
- else:
- self.sp_rank = None
-
- def get_dynamic_size(self, x):
- _, _, T, H, W = x.size()
- if T % self.patch_size[0] != 0:
- T += self.patch_size[0] - T % self.patch_size[0]
- if H % self.patch_size[1] != 0:
- H += self.patch_size[1] - H % self.patch_size[1]
- if W % self.patch_size[2] != 0:
- W += self.patch_size[2] - W % self.patch_size[2]
- T = T // self.patch_size[0]
- H = H // self.patch_size[1]
- W = W // self.patch_size[2]
- return (T, H, W)
-
- def forward(
- self, x, timestep, y, mask=None, x_mask=None, num_frames=None, height=None, width=None, ar=None, fps=None
- ):
- """
- Forward pass of STDiT2.
- Args:
- x (torch.Tensor): latent representation of video; of shape [B, C, T, H, W]
- timestep (torch.Tensor): diffusion time steps; of shape [B]
- y (torch.Tensor): representation of prompts; of shape [B, 1, N_token, C]
- mask (torch.Tensor): mask for selecting prompt tokens; of shape [B, N_token]
- x_mask (torch.Tensor, optional): mask marking valid frames; of shape [B, T]
- num_frames, height, width, ar, fps (torch.Tensor): per-sample video metadata; each of shape [B]
-
- Returns:
- x (torch.Tensor): output latent representation; of shape [B, C, T, H, W]
- """
- B = x.shape[0]
- x = x.to(self.dtype)
- timestep = timestep.to(self.dtype)
- y = y.to(self.dtype)
-
- # === process data info ===
- # 1. get dynamic size
- hw = torch.cat([height[:, None], width[:, None]], dim=1)
- rs = (height[0].item() * width[0].item()) ** 0.5
- csize = self.csize_embedder(hw, B)
-
- # 2. get aspect ratio
- ar = ar.unsqueeze(1)
- ar = self.ar_embedder(ar, B)
- data_info = torch.cat([csize, ar], dim=1)
-
- # 3. get number of frames
- fl = num_frames.unsqueeze(1)
- fps = fps.unsqueeze(1)
- fl = self.fl_embedder(fl, B)
- fl = fl + self.fps_embedder(fps, B)
-
- # === get dynamic shape size ===
- _, _, Tx, Hx, Wx = x.size()
- T, H, W = self.get_dynamic_size(x)
- S = H * W
- scale = rs / self.input_sq_size
- base_size = round(S**0.5)
- pos_emb = self.pos_embed(x, H, W, scale=scale, base_size=base_size)
-
- # embedding
- x = self.x_embedder(x) # [B, N, C]
- x = rearrange(x, "B (T S) C -> B T S C", T=T, S=S)
- x = x + pos_emb
- x = rearrange(x, "B T S C -> B (T S) C")
-
- # shard over the sequence dim if sp is enabled
- if self.enable_sequence_parallelism:
- x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="down")
-
- # prepare adaLN (t2i) modulation inputs
- t = self.t_embedder(timestep, dtype=x.dtype) # [B, C]
- t_spc = t + data_info # [B, C]
- t_tmp = t + fl # [B, C]
- t_spc_mlp = self.t_block(t_spc) # [B, 6*C]
- t_tmp_mlp = self.t_block_temp(t_tmp) # [B, 3*C]
- if x_mask is not None:
- t0_timestep = torch.zeros_like(timestep)
- t0 = self.t_embedder(t0_timestep, dtype=x.dtype)
- t0_spc = t0 + data_info
- t0_tmp = t0 + fl
- t0_spc_mlp = self.t_block(t0_spc)
- t0_tmp_mlp = self.t_block_temp(t0_tmp)
- else:
- t0_spc = None
- t0_tmp = None
- t0_spc_mlp = None
- t0_tmp_mlp = None
-
- # prepare y
- y = self.y_embedder(y, self.training) # [B, 1, N_token, C]
-
- if mask is not None:
- if mask.shape[0] != y.shape[0]:
- mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
- mask = mask.squeeze(1).squeeze(1)
- y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
- y_lens = mask.sum(dim=1).tolist()
- else:
- y_lens = [y.shape[2]] * y.shape[0]
- y = y.squeeze(1).view(1, -1, x.shape[-1])
-
- # blocks
- for _, block in enumerate(self.blocks):
- x = auto_grad_checkpoint(
- block,
- x,
- y,
- t_spc_mlp,
- t_tmp_mlp,
- y_lens,
- x_mask,
- t0_spc_mlp,
- t0_tmp_mlp,
- T,
- S,
- )
-
- if self.enable_sequence_parallelism:
- x = gather_forward_split_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="up")
- # x.shape: [B, N, C]
-
- # final process
- x = self.final_layer(x, t, x_mask, t0_spc, T, S) # [B, N, C=T_p * H_p * W_p * C_out]
- x = self.unpatchify(x, T, H, W, Tx, Hx, Wx) # [B, C_out, T, H, W]
-
- # cast to float32 for better accuracy
- x = x.to(torch.float32)
- return x
-
- def unpatchify(self, x, N_t, N_h, N_w, R_t, R_h, R_w):
- """
- Args:
- x (torch.Tensor): of shape [B, N, C]
-
- Return:
- x (torch.Tensor): of shape [B, C_out, T, H, W]
- """
-
- # N_t, N_h, N_w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
- T_p, H_p, W_p = self.patch_size
- x = rearrange(
- x,
- "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)",
- N_t=N_t,
- N_h=N_h,
- N_w=N_w,
- T_p=T_p,
- H_p=H_p,
- W_p=W_p,
- C_out=self.out_channels,
- )
- # unpad
- x = x[:, :, :R_t, :R_h, :R_w]
- return x
-
- def unpatchify_old(self, x):
- c = self.out_channels
- t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
- pt, ph, pw = self.patch_size
-
- x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
- x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
- imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
- return imgs
-
- def get_spatial_pos_embed(self, H, W, scale=1.0, base_size=None):
- pos_embed = get_2d_sincos_pos_embed(
- self.hidden_size,
- (H, W),
- scale=scale,
- base_size=base_size,
- )
- pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
- return pos_embed
-
- def freeze_not_temporal(self):
- for n, p in self.named_parameters():
- if "attn_temp" not in n:
- p.requires_grad = False
-
- def freeze_text(self):
- for n, p in self.named_parameters():
- if "cross_attn" in n:
- p.requires_grad = False
-
- def initialize_temporal(self):
- for block in self.blocks:
- nn.init.constant_(block.attn_temp.proj.weight, 0)
- nn.init.constant_(block.attn_temp.proj.bias, 0)
-
- def initialize_weights(self):
- # Initialize transformer layers:
- def _basic_init(module):
- if isinstance(module, nn.Linear):
- torch.nn.init.xavier_uniform_(module.weight)
- if module.bias is not None:
- nn.init.constant_(module.bias, 0)
-
- self.apply(_basic_init)
-
- # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
- w = self.x_embedder.proj.weight.data
- nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
-
- # Initialize timestep embedding MLP:
- nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
- nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
- nn.init.normal_(self.t_block[1].weight, std=0.02)
- nn.init.normal_(self.t_block_temp[1].weight, std=0.02)
-
- # Initialize caption embedding MLP:
- nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
- nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)
-
- # Zero-out the output projection of cross-attention in STDiT2 blocks:
- for block in self.blocks:
- nn.init.constant_(block.cross_attn.proj.weight, 0)
- nn.init.constant_(block.cross_attn.proj.bias, 0)
-
- # Zero-out output layers:
- nn.init.constant_(self.final_layer.linear.weight, 0)
- nn.init.constant_(self.final_layer.linear.bias, 0)
-
-
-@MODELS.register_module("STDiT2-XL/2")
-def STDiT2_XL_2(from_pretrained=None, **kwargs):
- model = STDiT2(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
- if from_pretrained is not None:
- load_checkpoint(model, from_pretrained)
- return model
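A minimal sketch of the extra metadata STDiT2 expects (again assuming the deleted opensora package; the height/width/fps values are illustrative):

```python
# Hypothetical sketch: unlike STDiT, STDiT2 supports dynamic input sizes and
# conditions on per-sample resolution, aspect ratio, frame count and fps via the
# SizeEmbedder modules above.
import torch

from opensora.registry import MODELS, build_module

model = build_module(dict(type="STDiT2-XL/2"), MODELS).eval()

B = 1
x = torch.randn(B, 4, 16, 32, 32)                 # [B, C, T, H, W]
timestep = torch.randint(0, 1000, (B,))
y = torch.randn(B, 1, 120, 4096)
mask = torch.ones(B, 120, dtype=torch.long)

with torch.no_grad():
    out = model(
        x, timestep, y, mask=mask,
        x_mask=torch.ones(B, 16, dtype=torch.bool),  # all 16 frames are valid
        num_frames=torch.tensor([16.0]),
        height=torch.tensor([256.0]),                # pixel-space size before VAE downsampling
        width=torch.tensor([256.0]),
        ar=torch.tensor([1.0]),
        fps=torch.tensor([24.0]),
    )                                                # [B, 2*C, T, H, W]
```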
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/text_encoder/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/text_encoder/__init__.py
deleted file mode 100644
index 9fc6a9995d9652099a51159907eb1ebb7cc219c2..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/text_encoder/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .classes import ClassEncoder
-from .clip import ClipEncoder
-from .t5 import T5Encoder
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/text_encoder/classes.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/text_encoder/classes.py
deleted file mode 100644
index f02c9f299f9a611f62141d063a80f38cd1b34b45..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/text_encoder/classes.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import torch
-
-from opensora.registry import MODELS
-
-
-@MODELS.register_module("classes")
-class ClassEncoder:
- def __init__(self, num_classes, model_max_length=None, device="cuda", dtype=torch.float):
- self.num_classes = num_classes
- self.y_embedder = None
-
- self.model_max_length = model_max_length
- self.output_dim = None
- self.device = device
-
- def encode(self, text):
- return dict(y=torch.tensor([int(t) for t in text]).to(self.device))
-
- def null(self, n):
- return torch.tensor([self.num_classes] * n).to(self.device)
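A short sketch of how the class-label encoder above is used (values are illustrative):

```python
# Hypothetical sketch: ClassEncoder maps label strings to integer ids, and null()
# returns the extra "unconditional" id used for classifier-free guidance.
enc = ClassEncoder(num_classes=1000, device="cpu")
cond = enc.encode(["3", "7"])   # {"y": tensor([3, 7])}
uncond = enc.null(2)            # tensor([1000, 1000])
```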
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/text_encoder/clip.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/text_encoder/clip.py
deleted file mode 100644
index c628d02bb1aab2c6ee74be1daa0ec824dda160ff..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/text_encoder/clip.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright 2024 Vchitect/Latte
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
- # limitations under the License.
- #
- # Modified from Latte
-#
-# This file is adapted from the Latte project.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# Latte: https://github.com/Vchitect/Latte
-# DiT: https://github.com/facebookresearch/DiT/tree/main
-# --------------------------------------------------------
-
-
-import torch
-import torch.nn as nn
-import transformers
-from transformers import CLIPTextModel, CLIPTokenizer
-
-from opensora.registry import MODELS
-
-transformers.logging.set_verbosity_error()
-
-
-class AbstractEncoder(nn.Module):
- def __init__(self):
- super().__init__()
-
- def encode(self, *args, **kwargs):
- raise NotImplementedError
-
-
-class FrozenCLIPEmbedder(AbstractEncoder):
- """Uses the CLIP transformer encoder for text (from Hugging Face)"""
-
- def __init__(self, path="openai/clip-vit-huge-patch14", device="cuda", max_length=77):
- super().__init__()
- self.tokenizer = CLIPTokenizer.from_pretrained(path)
- self.transformer = CLIPTextModel.from_pretrained(path)
- self.device = device
- self.max_length = max_length
- self._freeze()
-
- def _freeze(self):
- self.transformer = self.transformer.eval()
- for param in self.parameters():
- param.requires_grad = False
-
- def forward(self, text):
- batch_encoding = self.tokenizer(
- text,
- truncation=True,
- max_length=self.max_length,
- return_length=True,
- return_overflowing_tokens=False,
- padding="max_length",
- return_tensors="pt",
- )
- tokens = batch_encoding["input_ids"].to(self.device)
- outputs = self.transformer(input_ids=tokens)
-
- z = outputs.last_hidden_state
- pooled_z = outputs.pooler_output
- return z, pooled_z
-
- def encode(self, text):
- return self(text)
-
-
-@MODELS.register_module("clip")
-class ClipEncoder:
- """
- Embeds text prompt into vector representations. Also handles text dropout for classifier-free guidance.
- """
-
- def __init__(
- self,
- from_pretrained,
- model_max_length=77,
- device="cuda",
- dtype=torch.float,
- ):
- super().__init__()
- assert from_pretrained is not None, "Please specify the path to the CLIP model"
-
- self.text_encoder = FrozenCLIPEmbedder(path=from_pretrained, max_length=model_max_length).to(device, dtype)
- self.y_embedder = None
-
- self.model_max_length = model_max_length
- self.output_dim = self.text_encoder.transformer.config.hidden_size
-
- def encode(self, text):
- _, pooled_embeddings = self.text_encoder.encode(text)
- y = pooled_embeddings.unsqueeze(1).unsqueeze(1)
- return dict(y=y)
-
- def null(self, n):
- null_y = self.y_embedder.y_embedding[None].repeat(n, 1, 1)[:, None]
- return null_y
-
- def to(self, dtype):
- self.text_encoder = self.text_encoder.to(dtype)
- return self
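A minimal sketch of the CLIP encoder above (the checkpoint name is illustrative and must exist locally or on the Hugging Face Hub):

```python
# Hypothetical sketch: ClipEncoder returns the pooled CLIP text embedding reshaped
# to [B, 1, 1, hidden] so it can stand in for the per-token caption tensor.
enc = ClipEncoder(from_pretrained="openai/clip-vit-base-patch32", model_max_length=77, device="cpu")
out = enc.encode(["a cat playing piano"])
print(out["y"].shape)  # torch.Size([1, 1, 1, 512]) for the base CLIP text model
```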
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/text_encoder/t5.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/text_encoder/t5.py
deleted file mode 100644
index aaf2ecf7f701e8fe4e8d606e5409ba1641260dd0..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/text_encoder/t5.py
+++ /dev/null
@@ -1,337 +0,0 @@
-# Adapted from PixArt
-#
-# Copyright (C) 2023 PixArt-alpha/PixArt-alpha
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published
-# by the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# PixArt: https://github.com/PixArt-alpha/PixArt-alpha
-# T5: https://github.com/google-research/text-to-text-transfer-transformer
-# --------------------------------------------------------
-
-import html
-import re
-
-import ftfy
-import torch
-from transformers import AutoTokenizer, T5EncoderModel
-
-from opensora.registry import MODELS
-
-
-class T5Embedder:
- available_models = ["DeepFloyd/t5-v1_1-xxl"]
-
- def __init__(
- self,
- device,
- from_pretrained=None,
- *,
- cache_dir=None,
- hf_token=None,
- use_text_preprocessing=True,
- t5_model_kwargs=None,
- torch_dtype=None,
- use_offload_folder=None,
- model_max_length=120,
- local_files_only=False,
- ):
- self.device = torch.device(device)
- self.torch_dtype = torch_dtype or torch.bfloat16
- self.cache_dir = cache_dir
-
- if t5_model_kwargs is None:
- t5_model_kwargs = {
- "low_cpu_mem_usage": True,
- "torch_dtype": self.torch_dtype,
- }
-
- if use_offload_folder is not None:
- t5_model_kwargs["offload_folder"] = use_offload_folder
- t5_model_kwargs["device_map"] = {
- "shared": self.device,
- "encoder.embed_tokens": self.device,
- "encoder.block.0": self.device,
- "encoder.block.1": self.device,
- "encoder.block.2": self.device,
- "encoder.block.3": self.device,
- "encoder.block.4": self.device,
- "encoder.block.5": self.device,
- "encoder.block.6": self.device,
- "encoder.block.7": self.device,
- "encoder.block.8": self.device,
- "encoder.block.9": self.device,
- "encoder.block.10": self.device,
- "encoder.block.11": self.device,
- "encoder.block.12": "disk",
- "encoder.block.13": "disk",
- "encoder.block.14": "disk",
- "encoder.block.15": "disk",
- "encoder.block.16": "disk",
- "encoder.block.17": "disk",
- "encoder.block.18": "disk",
- "encoder.block.19": "disk",
- "encoder.block.20": "disk",
- "encoder.block.21": "disk",
- "encoder.block.22": "disk",
- "encoder.block.23": "disk",
- "encoder.final_layer_norm": "disk",
- "encoder.dropout": "disk",
- }
- else:
- t5_model_kwargs["device_map"] = {
- "shared": self.device,
- "encoder": self.device,
- }
-
- self.use_text_preprocessing = use_text_preprocessing
- self.hf_token = hf_token
-
- assert from_pretrained in self.available_models
- self.tokenizer = AutoTokenizer.from_pretrained(
- from_pretrained,
- cache_dir=cache_dir,
- local_files_only=local_files_only,
- )
- self.model = T5EncoderModel.from_pretrained(
- from_pretrained,
- cache_dir=cache_dir,
- local_files_only=local_files_only,
- **t5_model_kwargs,
- ).eval()
- self.model_max_length = model_max_length
-
- def get_text_embeddings(self, texts):
- text_tokens_and_mask = self.tokenizer(
- texts,
- max_length=self.model_max_length,
- padding="max_length",
- truncation=True,
- return_attention_mask=True,
- add_special_tokens=True,
- return_tensors="pt",
- )
-
- input_ids = text_tokens_and_mask["input_ids"].to(self.device)
- attention_mask = text_tokens_and_mask["attention_mask"].to(self.device)
- with torch.no_grad():
- text_encoder_embs = self.model(
- input_ids=input_ids,
- attention_mask=attention_mask,
- )["last_hidden_state"].detach()
- return text_encoder_embs, attention_mask
-
-
-@MODELS.register_module("t5")
-class T5Encoder:
- def __init__(
- self,
- from_pretrained=None,
- model_max_length=120,
- device="cuda",
- dtype=torch.float,
- cache_dir=None,
- shardformer=False,
- local_files_only=False,
- ):
- assert from_pretrained is not None, "Please specify the path to the T5 model"
-
- self.t5 = T5Embedder(
- device=device,
- torch_dtype=dtype,
- from_pretrained=from_pretrained,
- cache_dir=cache_dir,
- model_max_length=model_max_length,
- local_files_only=local_files_only,
- )
- self.t5.model.to(dtype=dtype)
- self.y_embedder = None
-
- self.model_max_length = model_max_length
- self.output_dim = self.t5.model.config.d_model
-
- if shardformer:
- self.shardformer_t5()
-
- def shardformer_t5(self):
- from colossalai.shardformer import ShardConfig, ShardFormer
-
- from opensora.acceleration.shardformer.policy.t5_encoder import T5EncoderPolicy
- from opensora.utils.misc import requires_grad
-
- shard_config = ShardConfig(
- tensor_parallel_process_group=None,
- pipeline_stage_manager=None,
- enable_tensor_parallelism=False,
- enable_fused_normalization=False,
- enable_flash_attention=False,
- enable_jit_fused=True,
- enable_sequence_parallelism=False,
- enable_sequence_overlap=False,
- )
- shard_former = ShardFormer(shard_config=shard_config)
- optim_model, _ = shard_former.optimize(self.t5.model, policy=T5EncoderPolicy())
- self.t5.model = optim_model.half()
-
- # ensure the weights are frozen
- requires_grad(self.t5.model, False)
-
- def encode(self, text):
- caption_embs, emb_masks = self.t5.get_text_embeddings(text)
- caption_embs = caption_embs[:, None]
- return dict(y=caption_embs, mask=emb_masks)
-
- def null(self, n):
- null_y = self.y_embedder.y_embedding[None].repeat(n, 1, 1)[:, None]
- return null_y
-
-
-def basic_clean(text):
- text = ftfy.fix_text(text)
- text = html.unescape(html.unescape(text))
- return text.strip()
-
-
-BAD_PUNCT_REGEX = re.compile(
- r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
-) # noqa
-
-
-def clean_caption(caption):
- import urllib.parse as ul
-
- from bs4 import BeautifulSoup
-
- caption = str(caption)
- caption = ul.unquote_plus(caption)
- caption = caption.strip().lower()
- caption = re.sub("<person>", "person", caption)
- # urls:
- caption = re.sub(
- r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
- "",
- caption,
- ) # regex for urls
- caption = re.sub(
- r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
- "",
- caption,
- ) # regex for urls
- # html:
- caption = BeautifulSoup(caption, features="html.parser").text
-
- # @
- caption = re.sub(r"@[\w\d]+\b", "", caption)
-
- # 31C0—31EF CJK Strokes
- # 31F0—31FF Katakana Phonetic Extensions
- # 3200—32FF Enclosed CJK Letters and Months
- # 3300—33FF CJK Compatibility
- # 3400—4DBF CJK Unified Ideographs Extension A
- # 4DC0—4DFF Yijing Hexagram Symbols
- # 4E00—9FFF CJK Unified Ideographs
- caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
- caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
- caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
- caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
- caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
- caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
- caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
- #######################################################
-
- # все виды тире / all types of dash --> "-"
- caption = re.sub(
- r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
- "-",
- caption,
- )
-
- # normalize all quotation marks to one standard
- caption = re.sub(r"[`´«»“”¨]", '"', caption)
- caption = re.sub(r"[‘’]", "'", caption)
-
- # &quot;
- caption = re.sub(r"&quot;?", "", caption)
- # &amp
- caption = re.sub(r"&amp", "", caption)
-
- # ip adresses:
- caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
-
- # article ids:
- caption = re.sub(r"\d:\d\d\s+$", "", caption)
-
- # \n
- caption = re.sub(r"\\n", " ", caption)
-
- # "#123"
- caption = re.sub(r"#\d{1,3}\b", "", caption)
- # "#12345.."
- caption = re.sub(r"#\d{5,}\b", "", caption)
- # "123456.."
- caption = re.sub(r"\b\d{6,}\b", "", caption)
- # filenames:
- caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
-
- #
- caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
- caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
-
- caption = re.sub(BAD_PUNCT_REGEX, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
- caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
-
- # this-is-my-cute-cat / this_is_my_cute_cat
- regex2 = re.compile(r"(?:\-|\_)")
- if len(re.findall(regex2, caption)) > 3:
- caption = re.sub(regex2, " ", caption)
-
- caption = basic_clean(caption)
-
- caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640
- caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc
- caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231
-
- caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
- caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
- caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
- caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
- caption = re.sub(r"\bpage\s+\d+\b", "", caption)
-
- caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
-
- caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
-
- caption = re.sub(r"\b\s+\:\s+", r": ", caption)
- caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
- caption = re.sub(r"\s+", " ", caption)
-
- caption = caption.strip()
-
- caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
- caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
- caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
- caption = re.sub(r"^\.\S+$", "", caption)
-
- return caption.strip()
-
-
-def text_preprocessing(text, use_text_preprocessing: bool = True):
- if use_text_preprocessing:
- # The exact text cleaning as was in the training stage:
- text = clean_caption(text)
- text = clean_caption(text)
- return text
- else:
- return text.lower().strip()
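A small sketch of the caption-cleaning entry point above (requires ftfy and beautifulsoup4; the sample string is illustrative):

```python
# Hypothetical sketch: text_preprocessing lowercases the caption and strips URLs,
# HTML, @handles, file names and stray punctuation before tokenization.
raw = "Check https://example.com for a CUTE cat photo.jpg"
print(text_preprocessing(raw))
# roughly "check for a cute cat" (the exact output depends on the regex rules above)
```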
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/vae/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/vae/__init__.py
deleted file mode 100644
index 63510b08b2036160c01d38b0ad3484757f6bcff7..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/vae/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .vae import VideoAutoencoderKL, VideoAutoencoderKLTemporalDecoder
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/vae/vae.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/models/vae/vae.py
deleted file mode 100644
index 273bcfdb424d0200f1c8229030a17b0fc1d18395..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/models/vae/vae.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import torch
-import torch.nn as nn
-from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder
-from einops import rearrange
-
-from opensora.registry import MODELS
-
-
-@MODELS.register_module()
-class VideoAutoencoderKL(nn.Module):
- def __init__(self, from_pretrained=None, micro_batch_size=None, cache_dir=None, local_files_only=False):
- super().__init__()
- self.module = AutoencoderKL.from_pretrained(
- from_pretrained, cache_dir=cache_dir, local_files_only=local_files_only
- )
- self.out_channels = self.module.config.latent_channels
- self.patch_size = (1, 8, 8)
- self.micro_batch_size = micro_batch_size
-
- def encode(self, x):
- # x: (B, C, T, H, W)
- B = x.shape[0]
- x = rearrange(x, "B C T H W -> (B T) C H W")
-
- if self.micro_batch_size is None:
- x = self.module.encode(x).latent_dist.sample().mul_(0.18215)
- else:
- bs = self.micro_batch_size
- x_out = []
- for i in range(0, x.shape[0], bs):
- x_bs = x[i : i + bs]
- x_bs = self.module.encode(x_bs).latent_dist.sample().mul_(0.18215)
- x_out.append(x_bs)
- x = torch.cat(x_out, dim=0)
- x = rearrange(x, "(B T) C H W -> B C T H W", B=B)
- return x
-
- def decode(self, x):
- # x: (B, C, T, H, W)
- B = x.shape[0]
- x = rearrange(x, "B C T H W -> (B T) C H W")
- if self.micro_batch_size is None:
- x = self.module.decode(x / 0.18215).sample
- else:
- bs = self.micro_batch_size
- x_out = []
- for i in range(0, x.shape[0], bs):
- x_bs = x[i : i + bs]
- x_bs = self.module.decode(x_bs / 0.18215).sample
- x_out.append(x_bs)
- x = torch.cat(x_out, dim=0)
- x = rearrange(x, "(B T) C H W -> B C T H W", B=B)
- return x
-
- def get_latent_size(self, input_size):
- latent_size = []
- for i in range(3):
- # assert (
- # input_size[i] is None or input_size[i] % self.patch_size[i] == 0
- # ), "Input size must be divisible by patch size"
- latent_size.append(input_size[i] // self.patch_size[i] if input_size[i] is not None else None)
- return latent_size
-
- @property
- def device(self):
- return next(self.parameters()).device
-
- @property
- def dtype(self):
- return next(self.parameters()).dtype
-
-
-@MODELS.register_module()
-class VideoAutoencoderKLTemporalDecoder(nn.Module):
- def __init__(self, from_pretrained=None, cache_dir=None, local_files_only=False):
- super().__init__()
- self.module = AutoencoderKLTemporalDecoder.from_pretrained(
- from_pretrained, cache_dir=cache_dir, local_files_only=local_files_only
- )
- self.out_channels = self.module.config.latent_channels
- self.patch_size = (1, 8, 8)
-
- def encode(self, x):
- raise NotImplementedError
-
- def decode(self, x):
- B, _, T = x.shape[:3]
- x = rearrange(x, "B C T H W -> (B T) C H W")
- x = self.module.decode(x / 0.18215, num_frames=T).sample
- x = rearrange(x, "(B T) C H W -> B C T H W", B=B)
- return x
-
- def get_latent_size(self, input_size):
- latent_size = []
- for i in range(3):
- # assert (
- # input_size[i] is None or input_size[i] % self.patch_size[i] == 0
- # ), "Input size must be divisible by patch size"
- latent_size.append(input_size[i] // self.patch_size[i] if input_size[i] is not None else None)
- return latent_size
-
- @property
- def device(self):
- return next(self.parameters()).device
-
- @property
- def dtype(self):
- return next(self.parameters()).dtype
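Editor's note (not part of the deleted file): a hedged sketch of driving the VAE wrapper above; the checkpoint id is a placeholder and a 4-channel latent space is assumed.

    import torch
    from opensora.models.vae import VideoAutoencoderKL

    # assumed checkpoint; any diffusers AutoencoderKL with 4 latent channels behaves analogously
    vae = VideoAutoencoderKL(from_pretrained="stabilityai/sd-vae-ft-ema", micro_batch_size=4)
    x = torch.randn(1, 3, 16, 256, 256)          # (B, C, T, H, W)
    z = vae.encode(x)                            # frames encoded micro-batch-wise -> (1, 4, 16, 32, 32)
    x_rec = vae.decode(z)                        # -> (1, 3, 16, 256, 256)
    print(vae.get_latent_size((16, 256, 256)))   # [16, 32, 32], i.e. input // patch_size (1, 8, 8)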
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/registry.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/registry.py
deleted file mode 100644
index 4335d386f2a662d9e97655d439a6cc02ebd1c6f6..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/registry.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from copy import deepcopy
-
-import torch.nn as nn
-from mmengine.registry import Registry
-
-
-def build_module(module, builder, **kwargs):
- """Build module from config or return the module itself.
-
- Args:
- module (Union[dict, nn.Module]): The module to build.
- builder (Registry): The registry to build module.
- *args, **kwargs: Arguments passed to build function.
-
- Returns:
- Any: The built module.
- """
- if isinstance(module, dict):
- cfg = deepcopy(module)
- for k, v in kwargs.items():
- cfg[k] = v
- return builder.build(cfg)
- elif isinstance(module, nn.Module):
- return module
- elif module is None:
- return None
- else:
- raise TypeError(f"Only support dict and nn.Module, but got {type(module)}.")
-
-
-MODELS = Registry(
- "model",
- locations=["opensora.models"],
-)
-
-SCHEDULERS = Registry(
- "scheduler",
- locations=["opensora.schedulers"],
-)
-
-DATASETS = Registry(
- "dataset",
- locations=["opensora.datasets"],
-)
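Editor's note (illustrative, not part of the deleted file): build_module accepts a config dict, an already-built nn.Module, or None; the config values below are placeholders.

    from opensora.registry import MODELS, build_module

    cfg = dict(type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema")  # placeholder path
    vae = build_module(cfg, MODELS, micro_batch_size=4)   # dict -> built via the registry; kwargs are merged into cfg
    same = build_module(vae, MODELS)                      # nn.Module -> returned unchanged
    none = build_module(None, MODELS)                     # None -> None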
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/__init__.py
deleted file mode 100644
index 97ea76f92f8b99664e35c51172e35d66d704edc4..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .dpms import DPMS
-from .iddpm import IDDPM
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/dpms/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/dpms/__init__.py
deleted file mode 100644
index ba2f6489651390329a97f0eb952a81a756a37d9b..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/dpms/__init__.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from functools import partial
-
-import torch
-
-from opensora.registry import SCHEDULERS
-
-from .dpm_solver import DPMS
-
-
-@SCHEDULERS.register_module("dpm-solver")
-class DPM_SOLVER:
- def __init__(self, num_sampling_steps=None, cfg_scale=4.0):
- self.num_sampling_steps = num_sampling_steps
- self.cfg_scale = cfg_scale
-
- def sample(
- self,
- model,
- text_encoder,
- z,
- prompts,
- device,
- additional_args=None,
- mask=None,
- ):
- assert mask is None, "mask is not supported in dpm-solver"
- n = len(prompts)
- model_args = text_encoder.encode(prompts)
- y = model_args.pop("y")
- null_y = text_encoder.null(n)
- if additional_args is not None:
- model_args.update(additional_args)
-
- dpms = DPMS(
- partial(forward_with_dpmsolver, model),
- condition=y,
- uncondition=null_y,
- cfg_scale=self.cfg_scale,
- model_kwargs=model_args,
- )
- samples = dpms.sample(z, steps=self.num_sampling_steps, order=2, skip_type="time_uniform", method="multistep")
- return samples
-
-
-def forward_with_dpmsolver(self, x, timestep, y, **kwargs):
- """
-    dpm solver does not need variance prediction
- """
- # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
- model_out = self.forward(x, timestep, y, **kwargs)
- return model_out.chunk(2, dim=1)[0]
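Editor's note (illustrative, not part of the deleted file): how the scheduler registered above as "dpm-solver" might be built and invoked; `model` and `text_encoder` are placeholders that follow the interfaces used in sample() (a noise-prediction network plus a text encoder exposing encode(prompts) and null(n)).

    import torch
    from opensora.registry import SCHEDULERS, build_module

    scheduler = build_module(dict(type="dpm-solver", num_sampling_steps=20, cfg_scale=4.0), SCHEDULERS)
    z = torch.randn(1, 4, 16, 32, 32)   # latent noise; the shape is an example only
    samples = scheduler.sample(model, text_encoder, z=z, prompts=["a red panda"], device="cuda")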
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/dpms/dpm_solver.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/dpms/dpm_solver.py
deleted file mode 100644
index 106e59ec9c2a22de935210ecfd8153bcf7ebb551..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/dpms/dpm_solver.py
+++ /dev/null
@@ -1,1570 +0,0 @@
-# MIT License
-#
-# Copyright (c) 2022 Cheng Lu
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-#
-# This file is adapted from the dpm-solver project
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# PixArt: https://github.com/PixArt-alpha/PixArt-alpha
-# dpm-solver: https://github.com/LuChengTHU/dpm-solver
-# --------------------------------------------------------
-
-import math
-
-import numpy as np
-import torch
-from tqdm import tqdm
-
-
-def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
- betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
- warmup_time = int(num_diffusion_timesteps * warmup_frac)
- betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
- return betas
-
-
-def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
- """
- This is the deprecated API for creating beta schedules.
- See get_named_beta_schedule() for the new library of schedules.
- """
- if beta_schedule == "quad":
- betas = (
- np.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_diffusion_timesteps,
- dtype=np.float64,
- )
- ** 2
- )
- elif beta_schedule == "linear":
- betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
- elif beta_schedule == "warmup10":
- betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
- elif beta_schedule == "warmup50":
- betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
- elif beta_schedule == "const":
- betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
- elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1
- betas = 1.0 / np.linspace(num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64)
- else:
- raise NotImplementedError(beta_schedule)
- assert betas.shape == (num_diffusion_timesteps,)
- return betas
-
-
-def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
- """
- Get a pre-defined beta schedule for the given name.
- The beta schedule library consists of beta schedules which remain similar
- in the limit of num_diffusion_timesteps.
- Beta schedules may be added, but should not be removed or changed once
- they are committed to maintain backwards compatibility.
- """
- if schedule_name == "linear":
- # Linear schedule from Ho et al, extended to work for any number of
- # diffusion steps.
- scale = 1000 / num_diffusion_timesteps
- return get_beta_schedule(
- "linear",
- beta_start=scale * 0.0001,
- beta_end=scale * 0.02,
- num_diffusion_timesteps=num_diffusion_timesteps,
- )
- elif schedule_name == "squaredcos_cap_v2":
- return betas_for_alpha_bar(
- num_diffusion_timesteps,
- lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
- )
- else:
- raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
-
-
-def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
- """
- Create a beta schedule that discretizes the given alpha_t_bar function,
- which defines the cumulative product of (1-beta) over time from t = [0,1].
- :param num_diffusion_timesteps: the number of betas to produce.
- :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
- produces the cumulative product of (1-beta) up to that
- part of the diffusion process.
- :param max_beta: the maximum beta to use; use values lower than 1 to
- prevent singularities.
- """
- betas = []
- for i in range(num_diffusion_timesteps):
- t1 = i / num_diffusion_timesteps
- t2 = (i + 1) / num_diffusion_timesteps
- betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
- return np.array(betas)
-
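Editor's note (illustrative): the two named schedules produced by the helpers above, using the default DDPM range for the linear case.

    from opensora.schedulers.dpms.dpm_solver import get_named_beta_schedule

    betas_lin = get_named_beta_schedule("linear", 1000)             # ~1e-4 ... 0.02, shape (1000,)
    betas_cos = get_named_beta_schedule("squaredcos_cap_v2", 1000)  # cosine alpha_bar, each beta capped at 0.999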
-
-class NoiseScheduleVP:
- def __init__(
- self,
- schedule="discrete",
- betas=None,
- alphas_cumprod=None,
- continuous_beta_0=0.1,
- continuous_beta_1=20.0,
- dtype=torch.float32,
- ):
- """Create a wrapper class for the forward SDE (VP type).
-
- ***
-        Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
-        We recommend using schedule='discrete' for discrete-time diffusion models, especially for high-resolution images.
- ***
-
-        The forward SDE ensures that the conditional distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
- We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
- Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
-
- log_alpha_t = self.marginal_log_mean_coeff(t)
- sigma_t = self.marginal_std(t)
- lambda_t = self.marginal_lambda(t)
-
- Moreover, as lambda(t) is an invertible function, we also support its inverse function:
-
- t = self.inverse_lambda(lambda_t)
-
- ===============================================================
-
- We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).
-
- 1. For discrete-time DPMs:
-
- For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
- t_i = (i + 1) / N
- e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
- We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.
-
- Args:
- betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
- alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
-
- Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
-
-            **Important**: Please pay special attention to the args for `alphas_cumprod`:
- The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
- q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
- Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
- alpha_{t_n} = \sqrt{\hat{alpha_n}},
- and
- log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).
-
-
- 2. For continuous-time DPMs:
-
- We support the linear VPSDE for the continuous time setting. The hyperparameters for the noise
- schedule are the default settings in Yang Song's ScoreSDE:
-
- Args:
- beta_min: A `float` number. The smallest beta for the linear schedule.
- beta_max: A `float` number. The largest beta for the linear schedule.
- T: A `float` number. The ending time of the forward process.
-
- ===============================================================
-
- Args:
- schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
- 'linear' for continuous-time DPMs.
- Returns:
- A wrapper object of the forward SDE (VP type).
-
- ===============================================================
-
- Example:
-
- # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
- >>> ns = NoiseScheduleVP('discrete', betas=betas)
-
- # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
- >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
-
- # For continuous-time DPMs (VPSDE), linear schedule:
- >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
-
- """
-
- if schedule not in ["discrete", "linear"]:
- raise ValueError(f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear'")
-
- self.schedule = schedule
- if schedule == "discrete":
- if betas is not None:
- log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
- else:
- assert alphas_cumprod is not None
- log_alphas = 0.5 * torch.log(alphas_cumprod)
- self.T = 1.0
- self.log_alpha_array = (
- self.numerical_clip_alpha(log_alphas)
- .reshape(
- (
- 1,
- -1,
- )
- )
- .to(dtype=dtype)
- )
- self.total_N = self.log_alpha_array.shape[1]
- self.t_array = torch.linspace(0.0, 1.0, self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype)
- else:
- self.T = 1.0
- self.total_N = 1000
- self.beta_0 = continuous_beta_0
- self.beta_1 = continuous_beta_1
-
- def numerical_clip_alpha(self, log_alphas, clipped_lambda=-5.1):
- """
-        For some beta schedules such as the cosine schedule, the log-SNR has numerical issues.
-        We clip the log-SNR near t=T within -5.1 to ensure stability.
- Such a trick is very useful for diffusion models with the cosine schedule, such as i-DDPM, guided-diffusion and GLIDE.
- """
- log_sigmas = 0.5 * torch.log(1.0 - torch.exp(2.0 * log_alphas))
- lambs = log_alphas - log_sigmas
- idx = torch.searchsorted(torch.flip(lambs, [0]), clipped_lambda)
- if idx > 0:
- log_alphas = log_alphas[:-idx]
- return log_alphas
-
- def marginal_log_mean_coeff(self, t):
- """
- Compute log(alpha_t) of a given continuous-time label t in [0, T].
- """
- if self.schedule == "discrete":
- return interpolate_fn(
- t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device)
- ).reshape((-1))
- elif self.schedule == "linear":
- return -0.25 * t**2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
-
- def marginal_alpha(self, t):
- """
- Compute alpha_t of a given continuous-time label t in [0, T].
- """
- return torch.exp(self.marginal_log_mean_coeff(t))
-
- def marginal_std(self, t):
- """
- Compute sigma_t of a given continuous-time label t in [0, T].
- """
- return torch.sqrt(1.0 - torch.exp(2.0 * self.marginal_log_mean_coeff(t)))
-
- def marginal_lambda(self, t):
- """
- Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
- """
- log_mean_coeff = self.marginal_log_mean_coeff(t)
- log_std = 0.5 * torch.log(1.0 - torch.exp(2.0 * log_mean_coeff))
- return log_mean_coeff - log_std
-
- def inverse_lambda(self, lamb):
- """
- Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
- """
- if self.schedule == "linear":
- tmp = 2.0 * (self.beta_1 - self.beta_0) * torch.logaddexp(-2.0 * lamb, torch.zeros((1,)).to(lamb))
- Delta = self.beta_0**2 + tmp
- return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
- elif self.schedule == "discrete":
- log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2.0 * lamb)
- t = interpolate_fn(
- log_alpha.reshape((-1, 1)),
- torch.flip(self.log_alpha_array.to(lamb.device), [1]),
- torch.flip(self.t_array.to(lamb.device), [1]),
- )
- return t.reshape((-1,))
-
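Editor's note (illustrative, not part of the deleted file): a small sketch of the quantities defined by NoiseScheduleVP, using a discrete linear schedule; values are for intuition only.

    import torch
    from opensora.schedulers.dpms.dpm_solver import NoiseScheduleVP, get_named_beta_schedule

    betas = torch.tensor(get_named_beta_schedule("linear", 1000), dtype=torch.float32)
    ns = NoiseScheduleVP("discrete", betas=betas)
    t = torch.tensor([0.5])
    alpha_t, sigma_t = ns.marginal_alpha(t), ns.marginal_std(t)   # alpha_t**2 + sigma_t**2 == 1 (VP SDE)
    lam = ns.marginal_lambda(t)                                   # half-logSNR
    t_back = ns.inverse_lambda(lam)                               # ~0.5, up to interpolation error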
-
-def model_wrapper(
- model,
- noise_schedule,
- model_type="noise",
- model_kwargs={},
- guidance_type="uncond",
- condition=None,
- unconditional_condition=None,
- guidance_scale=1.0,
- classifier_fn=None,
- classifier_kwargs={},
-):
- """Create a wrapper function for the noise prediction model.
-
-    DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to
-    first wrap the model function into a noise prediction model that accepts the continuous time as the input.
-
- We support four types of the diffusion model by setting `model_type`:
-
- 1. "noise": noise prediction model. (Trained by predicting noise).
-
- 2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).
-
- 3. "v": velocity prediction model. (Trained by predicting the velocity).
-        The "v" prediction is derived in detail in Appendix D of [1], and is used in Imagen-Video [2].
-
- [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
- arXiv preprint arXiv:2202.00512 (2022).
- [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
- arXiv preprint arXiv:2210.02303 (2022).
-
- 4. "score": marginal score function. (Trained by denoising score matching).
-        Note that the score function and the noise prediction model follow a simple relationship:
- ```
- noise(x_t, t) = -sigma_t * score(x_t, t)
- ```
-
- We support three types of guided sampling by DPMs by setting `guidance_type`:
- 1. "uncond": unconditional sampling by DPMs.
- The input `model` has the following format:
- ``
- model(x, t_input, **model_kwargs) -> noise | x_start | v | score
- ``
-
- 2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
- The input `model` has the following format:
- ``
- model(x, t_input, **model_kwargs) -> noise | x_start | v | score
- ``
-
- The input `classifier_fn` has the following format:
- ``
- classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
- ``
-
- [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
- in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.
-
- 3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
- The input `model` has the following format:
- ``
- model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
- ``
- And if cond == `unconditional_condition`, the model output is the unconditional DPM output.
-
- [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
- arXiv preprint arXiv:2207.12598 (2022).
-
-
- The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
- or continuous-time labels (i.e. epsilon to T).
-
- We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise:
- ``
- def model_fn(x, t_continuous) -> noise:
- t_input = get_model_input_time(t_continuous)
- return noise_pred(model, x, t_input, **model_kwargs)
- ``
- where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for DPM-Solver.
-
- ===============================================================
-
- Args:
- model: A diffusion model with the corresponding format described above.
- noise_schedule: A noise schedule object, such as NoiseScheduleVP.
- model_type: A `str`. The parameterization type of the diffusion model.
- "noise" or "x_start" or "v" or "score".
- model_kwargs: A `dict`. A dict for the other inputs of the model function.
- guidance_type: A `str`. The type of the guidance for sampling.
- "uncond" or "classifier" or "classifier-free".
- condition: A pytorch tensor. The condition for the guided sampling.
- Only used for "classifier" or "classifier-free" guidance type.
- unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
- Only used for "classifier-free" guidance type.
- guidance_scale: A `float`. The scale for the guided sampling.
- classifier_fn: A classifier function. Only used for the classifier guidance.
- classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function.
- Returns:
- A noise prediction model that accepts the noised data and the continuous time as the inputs.
- """
-
- def get_model_input_time(t_continuous):
- """
- Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
- For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
- For continuous-time DPMs, we just use `t_continuous`.
- """
- if noise_schedule.schedule == "discrete":
- return (t_continuous - 1.0 / noise_schedule.total_N) * 1000.0
- else:
- return t_continuous
-
- def noise_pred_fn(x, t_continuous, cond=None):
- t_input = get_model_input_time(t_continuous)
- if cond is None:
- output = model(x, t_input, **model_kwargs)
- else:
- output = model(x, t_input, cond, **model_kwargs)
- if model_type == "noise":
- return output
- elif model_type == "x_start":
- alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
- return (x - expand_dims(alpha_t, x.dim()) * output) / expand_dims(sigma_t, x.dim())
- elif model_type == "v":
- alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
- return expand_dims(alpha_t, x.dim()) * output + expand_dims(sigma_t, x.dim()) * x
- elif model_type == "score":
- sigma_t = noise_schedule.marginal_std(t_continuous)
- return -expand_dims(sigma_t, x.dim()) * output
-
- def cond_grad_fn(x, t_input):
- """
- Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
- """
- with torch.enable_grad():
- x_in = x.detach().requires_grad_(True)
- log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
- return torch.autograd.grad(log_prob.sum(), x_in)[0]
-
- def model_fn(x, t_continuous):
- """
-        The noise prediction model function that is used for DPM-Solver.
- """
- if guidance_type == "uncond":
- return noise_pred_fn(x, t_continuous)
- elif guidance_type == "classifier":
- assert classifier_fn is not None
- t_input = get_model_input_time(t_continuous)
- cond_grad = cond_grad_fn(x, t_input)
- sigma_t = noise_schedule.marginal_std(t_continuous)
- noise = noise_pred_fn(x, t_continuous)
- return noise - guidance_scale * expand_dims(sigma_t, x.dim()) * cond_grad
- elif guidance_type == "classifier-free":
- if guidance_scale == 1.0 or unconditional_condition is None:
- return noise_pred_fn(x, t_continuous, cond=condition)
- x_in = torch.cat([x] * 2)
- t_in = torch.cat([t_continuous] * 2)
- c_in = torch.cat([unconditional_condition, condition])
- noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
- return noise_uncond + guidance_scale * (noise - noise_uncond)
-
- assert model_type in ["noise", "x_start", "v", "score"]
- assert guidance_type in ["uncond", "classifier", "classifier-free"]
- return model_fn
-
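Editor's note (illustrative, not part of the deleted file): a sketch of the classifier-free guidance path described above; raw_model, cond_emb, null_emb, x_t and t_cont are placeholders, not names from this repository.

    from opensora.schedulers.dpms.dpm_solver import model_wrapper

    model_fn = model_wrapper(
        raw_model,                         # placeholder: the underlying noise-prediction network
        ns,                                # a NoiseScheduleVP instance
        model_type="noise",
        guidance_type="classifier-free",
        condition=cond_emb,                # placeholder conditioning tensor
        unconditional_condition=null_emb,  # placeholder null conditioning
        guidance_scale=4.5,
    )
    eps = model_fn(x_t, t_cont)            # guided noise prediction at continuous time t_cont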
-
-class DPM_Solver:
- def __init__(
- self,
- model_fn,
- noise_schedule,
- algorithm_type="dpmsolver++",
- correcting_x0_fn=None,
- correcting_xt_fn=None,
- thresholding_max_val=1.0,
- dynamic_thresholding_ratio=0.995,
- ):
- """Construct a DPM-Solver.
-
- We support both DPM-Solver (`algorithm_type="dpmsolver"`) and DPM-Solver++ (`algorithm_type="dpmsolver++"`).
-
- We also support the "dynamic thresholding" method in Imagen[1]. For pixel-space diffusion models, you
- can set both `algorithm_type="dpmsolver++"` and `correcting_x0_fn="dynamic_thresholding"` to use the
- dynamic thresholding. The "dynamic thresholding" can greatly improve the sample quality for pixel-space
- DPMs with large guidance scales. Note that the thresholding method is **unsuitable** for latent-space
- DPMs (such as stable-diffusion).
-
- To support advanced algorithms in image-to-image applications, we also support corrector functions for
- both x0 and xt.
-
- Args:
- model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]):
- ``
- def model_fn(x, t_continuous):
- return noise
- ``
- The shape of `x` is `(batch_size, **shape)`, and the shape of `t_continuous` is `(batch_size,)`.
- noise_schedule: A noise schedule object, such as NoiseScheduleVP.
- algorithm_type: A `str`. Either "dpmsolver" or "dpmsolver++".
- correcting_x0_fn: A `str` or a function with the following format:
- ```
- def correcting_x0_fn(x0, t):
- x0_new = ...
- return x0_new
- ```
- This function is to correct the outputs of the data prediction model at each sampling step. e.g.,
- ```
- x0_pred = data_pred_model(xt, t)
- if correcting_x0_fn is not None:
- x0_pred = correcting_x0_fn(x0_pred, t)
- xt_1 = update(x0_pred, xt, t)
- ```
- If `correcting_x0_fn="dynamic_thresholding"`, we use the dynamic thresholding proposed in Imagen[1].
- correcting_xt_fn: A function with the following format:
- ```
- def correcting_xt_fn(xt, t, step):
- x_new = ...
- return x_new
- ```
- This function is to correct the intermediate samples xt at each sampling step. e.g.,
- ```
- xt = ...
- xt = correcting_xt_fn(xt, t, step)
- ```
- thresholding_max_val: A `float`. The max value for thresholding.
-                Valid only when using `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.
-            dynamic_thresholding_ratio: A `float`. The ratio for dynamic thresholding (see Imagen[1] for details).
-                Valid only when using `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.
-
- [1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour,
- Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models
- with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b.
- """
- self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
- self.noise_schedule = noise_schedule
- assert algorithm_type in ["dpmsolver", "dpmsolver++"]
- self.algorithm_type = algorithm_type
- if correcting_x0_fn == "dynamic_thresholding":
- self.correcting_x0_fn = self.dynamic_thresholding_fn
- else:
- self.correcting_x0_fn = correcting_x0_fn
- self.correcting_xt_fn = correcting_xt_fn
- self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
- self.thresholding_max_val = thresholding_max_val
-
- def dynamic_thresholding_fn(self, x0, t):
- """
- The dynamic thresholding method.
- """
- dims = x0.dim()
- p = self.dynamic_thresholding_ratio
- s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
- s = expand_dims(torch.maximum(s, self.thresholding_max_val * torch.ones_like(s).to(s.device)), dims)
- x0 = torch.clamp(x0, -s, s) / s
- return x0
-
- def noise_prediction_fn(self, x, t):
- """
- Return the noise prediction model.
- """
- return self.model(x, t)
-
- def data_prediction_fn(self, x, t):
- """
- Return the data prediction model (with corrector).
- """
- noise = self.noise_prediction_fn(x, t)
- alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
- x0 = (x - sigma_t * noise) / alpha_t
- if self.correcting_x0_fn is not None:
- x0 = self.correcting_x0_fn(x0, t)
- return x0
-
- def model_fn(self, x, t):
- """
- Convert the model to the noise prediction model or the data prediction model.
- """
- if self.algorithm_type == "dpmsolver++":
- return self.data_prediction_fn(x, t)
- else:
- return self.noise_prediction_fn(x, t)
-
- def get_time_steps(self, skip_type, t_T, t_0, N, device):
- """Compute the intermediate time steps for sampling.
-
- Args:
- skip_type: A `str`. The type for the spacing of the time steps. We support three types:
- - 'logSNR': uniform logSNR for the time steps.
- - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
- - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
- t_T: A `float`. The starting time of the sampling (default is T).
- t_0: A `float`. The ending time of the sampling (default is epsilon).
- N: A `int`. The total number of the spacing of the time steps.
- device: A torch device.
- Returns:
- A pytorch tensor of the time steps, with the shape (N + 1,).
- """
- if skip_type == "logSNR":
- lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
- lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
- logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
- return self.noise_schedule.inverse_lambda(logSNR_steps)
- elif skip_type == "time_uniform":
- return torch.linspace(t_T, t_0, N + 1).to(device)
- elif skip_type == "time_quadratic":
- t_order = 2
- return torch.linspace(t_T ** (1.0 / t_order), t_0 ** (1.0 / t_order), N + 1).pow(t_order).to(device)
- else:
- raise ValueError(
- f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'"
- )
-
- def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
- """
- Get the order of each step for sampling by the singlestep DPM-Solver.
-
- We combine both DPM-Solver-1,2,3 to use all the function evaluations, which is named as "DPM-Solver-fast".
- Given a fixed number of function evaluations by `steps`, the sampling procedure by DPM-Solver-fast is:
- - If order == 1:
- We take `steps` of DPM-Solver-1 (i.e. DDIM).
- - If order == 2:
- - Denote K = (steps // 2). We take K or (K + 1) intermediate time steps for sampling.
- - If steps % 2 == 0, we use K steps of DPM-Solver-2.
- - If steps % 2 == 1, we use K steps of DPM-Solver-2 and 1 step of DPM-Solver-1.
- - If order == 3:
- - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
- - If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, and 1 step of DPM-Solver-2 and 1 step of DPM-Solver-1.
- - If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1.
- - If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2.
-
- ============================================
- Args:
- order: A `int`. The max order for the solver (2 or 3).
- steps: A `int`. The total number of function evaluations (NFE).
- skip_type: A `str`. The type for the spacing of the time steps. We support three types:
- - 'logSNR': uniform logSNR for the time steps.
- - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
- - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
- t_T: A `float`. The starting time of the sampling (default is T).
- t_0: A `float`. The ending time of the sampling (default is epsilon).
- device: A torch device.
- Returns:
- orders: A list of the solver order of each step.
- """
- if order == 3:
- K = steps // 3 + 1
- if steps % 3 == 0:
- orders = [
- 3,
- ] * (
- K - 2
- ) + [2, 1]
- elif steps % 3 == 1:
- orders = [
- 3,
- ] * (
- K - 1
- ) + [1]
- else:
- orders = [
- 3,
- ] * (
- K - 1
- ) + [2]
- elif order == 2:
- if steps % 2 == 0:
- K = steps // 2
- orders = [
- 2,
- ] * K
- else:
- K = steps // 2 + 1
- orders = [
- 2,
- ] * (
- K - 1
- ) + [1]
- elif order == 1:
- K = 1
- orders = [
- 1,
- ] * steps
- else:
- raise ValueError("'order' must be '1' or '2' or '3'.")
- if skip_type == "logSNR":
- # To reproduce the results in DPM-Solver paper
- timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
- else:
- timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[
- torch.cumsum(
- torch.tensor(
- [
- 0,
- ]
- + orders
- ),
- 0,
- ).to(device)
- ]
- return timesteps_outer, orders
-
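Editor's note: a worked instance of the order-splitting rule described in the docstring above (arithmetic only, no new behavior).

    # steps=20, order=3: K = 20 // 3 + 1 = 7 and 20 % 3 == 2,
    # so orders = [3] * (K - 1) + [2] = [3, 3, 3, 3, 3, 3, 2]   (6 * 3 + 2 = 20 NFE)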
- def denoise_to_zero_fn(self, x, s):
- """
-        Denoise at the final step, which is equivalent to solving the ODE from lambda_s to infinity by first-order discretization.
- """
- return self.data_prediction_fn(x, s)
-
- def dpm_solver_first_update(self, x, s, t, model_s=None, return_intermediate=False):
- """
- DPM-Solver-1 (equivalent to DDIM) from time `s` to time `t`.
-
- Args:
- x: A pytorch tensor. The initial value at time `s`.
- s: A pytorch tensor. The starting time, with the shape (1,).
- t: A pytorch tensor. The ending time, with the shape (1,).
- model_s: A pytorch tensor. The model function evaluated at time `s`.
- If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
- return_intermediate: A `bool`. If true, also return the model value at time `s`.
- Returns:
- x_t: A pytorch tensor. The approximated solution at time `t`.
- """
- ns = self.noise_schedule
- x.dim()
- lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
- h = lambda_t - lambda_s
- log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
- sigma_s, sigma_t = ns.marginal_std(s), ns.marginal_std(t)
- alpha_t = torch.exp(log_alpha_t)
-
- if self.algorithm_type == "dpmsolver++":
- phi_1 = torch.expm1(-h)
- if model_s is None:
- model_s = self.model_fn(x, s)
- x_t = sigma_t / sigma_s * x - alpha_t * phi_1 * model_s
- else:
- phi_1 = torch.expm1(h)
- if model_s is None:
- model_s = self.model_fn(x, s)
- x_t = torch.exp(log_alpha_t - log_alpha_s) * x - (sigma_t * phi_1) * model_s
- return (x_t, {"model_s": model_s}) if return_intermediate else x_t
-
- def singlestep_dpm_solver_second_update(
- self, x, s, t, r1=0.5, model_s=None, return_intermediate=False, solver_type="dpmsolver"
- ):
- """
- Singlestep solver DPM-Solver-2 from time `s` to time `t`.
-
- Args:
- x: A pytorch tensor. The initial value at time `s`.
- s: A pytorch tensor. The starting time, with the shape (1,).
- t: A pytorch tensor. The ending time, with the shape (1,).
- r1: A `float`. The hyperparameter of the second-order solver.
- model_s: A pytorch tensor. The model function evaluated at time `s`.
- If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
- return_intermediate: A `bool`. If true, also return the model value at time `s` and `s1` (the intermediate time).
- solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
- The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
- Returns:
- x_t: A pytorch tensor. The approximated solution at time `t`.
- """
- if solver_type not in ["dpmsolver", "taylor"]:
- raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}")
- if r1 is None:
- r1 = 0.5
- ns = self.noise_schedule
- lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
- h = lambda_t - lambda_s
- lambda_s1 = lambda_s + r1 * h
- s1 = ns.inverse_lambda(lambda_s1)
- log_alpha_s, log_alpha_s1, log_alpha_t = (
- ns.marginal_log_mean_coeff(s),
- ns.marginal_log_mean_coeff(s1),
- ns.marginal_log_mean_coeff(t),
- )
- sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t)
- alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t)
-
- if self.algorithm_type == "dpmsolver++":
- phi_11 = torch.expm1(-r1 * h)
- phi_1 = torch.expm1(-h)
-
- if model_s is None:
- model_s = self.model_fn(x, s)
- x_s1 = (sigma_s1 / sigma_s) * x - (alpha_s1 * phi_11) * model_s
- model_s1 = self.model_fn(x_s1, s1)
- if solver_type == "dpmsolver":
- x_t = (
- (sigma_t / sigma_s) * x
- - (alpha_t * phi_1) * model_s
- - (0.5 / r1) * (alpha_t * phi_1) * (model_s1 - model_s)
- )
- elif solver_type == "taylor":
- x_t = (
- (sigma_t / sigma_s) * x
- - (alpha_t * phi_1) * model_s
- + (1.0 / r1) * (alpha_t * (phi_1 / h + 1.0)) * (model_s1 - model_s)
- )
- else:
- phi_11 = torch.expm1(r1 * h)
- phi_1 = torch.expm1(h)
-
- if model_s is None:
- model_s = self.model_fn(x, s)
- x_s1 = torch.exp(log_alpha_s1 - log_alpha_s) * x - (sigma_s1 * phi_11) * model_s
- model_s1 = self.model_fn(x_s1, s1)
- if solver_type == "dpmsolver":
- x_t = (
- torch.exp(log_alpha_t - log_alpha_s) * x
- - (sigma_t * phi_1) * model_s
- - (0.5 / r1) * (sigma_t * phi_1) * (model_s1 - model_s)
- )
- elif solver_type == "taylor":
- x_t = (
- torch.exp(log_alpha_t - log_alpha_s) * x
- - (sigma_t * phi_1) * model_s
- - (1.0 / r1) * (sigma_t * (phi_1 / h - 1.0)) * (model_s1 - model_s)
- )
- if return_intermediate:
- return x_t, {"model_s": model_s, "model_s1": model_s1}
- else:
- return x_t
-
- def singlestep_dpm_solver_third_update(
- self,
- x,
- s,
- t,
- r1=1.0 / 3.0,
- r2=2.0 / 3.0,
- model_s=None,
- model_s1=None,
- return_intermediate=False,
- solver_type="dpmsolver",
- ):
- """
- Singlestep solver DPM-Solver-3 from time `s` to time `t`.
-
- Args:
- x: A pytorch tensor. The initial value at time `s`.
- s: A pytorch tensor. The starting time, with the shape (1,).
- t: A pytorch tensor. The ending time, with the shape (1,).
- r1: A `float`. The hyperparameter of the third-order solver.
- r2: A `float`. The hyperparameter of the third-order solver.
- model_s: A pytorch tensor. The model function evaluated at time `s`.
- If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
- model_s1: A pytorch tensor. The model function evaluated at time `s1` (the intermediate time given by `r1`).
- If `model_s1` is None, we evaluate the model at `s1`; otherwise we directly use it.
- return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
- solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
- The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
- Returns:
- x_t: A pytorch tensor. The approximated solution at time `t`.
- """
- if solver_type not in ["dpmsolver", "taylor"]:
- raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}")
- if r1 is None:
- r1 = 1.0 / 3.0
- if r2 is None:
- r2 = 2.0 / 3.0
- ns = self.noise_schedule
- lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
- h = lambda_t - lambda_s
- lambda_s1 = lambda_s + r1 * h
- lambda_s2 = lambda_s + r2 * h
- s1 = ns.inverse_lambda(lambda_s1)
- s2 = ns.inverse_lambda(lambda_s2)
- log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = (
- ns.marginal_log_mean_coeff(s),
- ns.marginal_log_mean_coeff(s1),
- ns.marginal_log_mean_coeff(s2),
- ns.marginal_log_mean_coeff(t),
- )
- sigma_s, sigma_s1, sigma_s2, sigma_t = (
- ns.marginal_std(s),
- ns.marginal_std(s1),
- ns.marginal_std(s2),
- ns.marginal_std(t),
- )
- alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t)
-
- if self.algorithm_type == "dpmsolver++":
- phi_11 = torch.expm1(-r1 * h)
- phi_12 = torch.expm1(-r2 * h)
- phi_1 = torch.expm1(-h)
- phi_22 = torch.expm1(-r2 * h) / (r2 * h) + 1.0
- phi_2 = phi_1 / h + 1.0
- phi_3 = phi_2 / h - 0.5
-
- if model_s is None:
- model_s = self.model_fn(x, s)
- if model_s1 is None:
- x_s1 = (sigma_s1 / sigma_s) * x - (alpha_s1 * phi_11) * model_s
- model_s1 = self.model_fn(x_s1, s1)
- x_s2 = (
- (sigma_s2 / sigma_s) * x
- - (alpha_s2 * phi_12) * model_s
- + r2 / r1 * (alpha_s2 * phi_22) * (model_s1 - model_s)
- )
- model_s2 = self.model_fn(x_s2, s2)
- if solver_type == "dpmsolver":
- x_t = (
- (sigma_t / sigma_s) * x
- - (alpha_t * phi_1) * model_s
- + (1.0 / r2) * (alpha_t * phi_2) * (model_s2 - model_s)
- )
- elif solver_type == "taylor":
- D1_0 = (1.0 / r1) * (model_s1 - model_s)
- D1_1 = (1.0 / r2) * (model_s2 - model_s)
- D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
- D2 = 2.0 * (D1_1 - D1_0) / (r2 - r1)
- x_t = (
- (sigma_t / sigma_s) * x
- - (alpha_t * phi_1) * model_s
- + (alpha_t * phi_2) * D1
- - (alpha_t * phi_3) * D2
- )
- else:
- phi_11 = torch.expm1(r1 * h)
- phi_12 = torch.expm1(r2 * h)
- phi_1 = torch.expm1(h)
- phi_22 = torch.expm1(r2 * h) / (r2 * h) - 1.0
- phi_2 = phi_1 / h - 1.0
- phi_3 = phi_2 / h - 0.5
-
- if model_s is None:
- model_s = self.model_fn(x, s)
- if model_s1 is None:
- x_s1 = (torch.exp(log_alpha_s1 - log_alpha_s)) * x - (sigma_s1 * phi_11) * model_s
- model_s1 = self.model_fn(x_s1, s1)
- x_s2 = (
- (torch.exp(log_alpha_s2 - log_alpha_s)) * x
- - (sigma_s2 * phi_12) * model_s
- - r2 / r1 * (sigma_s2 * phi_22) * (model_s1 - model_s)
- )
- model_s2 = self.model_fn(x_s2, s2)
- if solver_type == "dpmsolver":
- x_t = (
- (torch.exp(log_alpha_t - log_alpha_s)) * x
- - (sigma_t * phi_1) * model_s
- - (1.0 / r2) * (sigma_t * phi_2) * (model_s2 - model_s)
- )
- elif solver_type == "taylor":
- D1_0 = (1.0 / r1) * (model_s1 - model_s)
- D1_1 = (1.0 / r2) * (model_s2 - model_s)
- D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
- D2 = 2.0 * (D1_1 - D1_0) / (r2 - r1)
- x_t = (
- (torch.exp(log_alpha_t - log_alpha_s)) * x
- - (sigma_t * phi_1) * model_s
- - (sigma_t * phi_2) * D1
- - (sigma_t * phi_3) * D2
- )
-
- if return_intermediate:
- return x_t, {"model_s": model_s, "model_s1": model_s1, "model_s2": model_s2}
- else:
- return x_t
-
- def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpmsolver"):
- """
- Multistep solver DPM-Solver-2 from time `t_prev_list[-1]` to time `t`.
-
- Args:
- x: A pytorch tensor. The initial value at time `s`.
- model_prev_list: A list of pytorch tensor. The previous computed model values.
- t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
- t: A pytorch tensor. The ending time, with the shape (1,).
- solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
- The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
- Returns:
- x_t: A pytorch tensor. The approximated solution at time `t`.
- """
- if solver_type not in ["dpmsolver", "taylor"]:
- raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}")
- ns = self.noise_schedule
- model_prev_1, model_prev_0 = model_prev_list[-2], model_prev_list[-1]
- t_prev_1, t_prev_0 = t_prev_list[-2], t_prev_list[-1]
- lambda_prev_1, lambda_prev_0, lambda_t = (
- ns.marginal_lambda(t_prev_1),
- ns.marginal_lambda(t_prev_0),
- ns.marginal_lambda(t),
- )
- log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
- sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
- alpha_t = torch.exp(log_alpha_t)
-
- h_0 = lambda_prev_0 - lambda_prev_1
- h = lambda_t - lambda_prev_0
- r0 = h_0 / h
- D1_0 = (1.0 / r0) * (model_prev_0 - model_prev_1)
- if self.algorithm_type == "dpmsolver++":
- phi_1 = torch.expm1(-h)
- if solver_type == "dpmsolver":
- x_t = (sigma_t / sigma_prev_0) * x - (alpha_t * phi_1) * model_prev_0 - 0.5 * (alpha_t * phi_1) * D1_0
- elif solver_type == "taylor":
- x_t = (
- (sigma_t / sigma_prev_0) * x
- - (alpha_t * phi_1) * model_prev_0
- + (alpha_t * (phi_1 / h + 1.0)) * D1_0
- )
- else:
- phi_1 = torch.expm1(h)
- if solver_type == "dpmsolver":
- x_t = (
- (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
- - (sigma_t * phi_1) * model_prev_0
- - 0.5 * (sigma_t * phi_1) * D1_0
- )
- elif solver_type == "taylor":
- x_t = (
- (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
- - (sigma_t * phi_1) * model_prev_0
- - (sigma_t * (phi_1 / h - 1.0)) * D1_0
- )
- return x_t
-
- def multistep_dpm_solver_third_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpmsolver"):
- """
- Multistep solver DPM-Solver-3 from time `t_prev_list[-1]` to time `t`.
-
- Args:
- x: A pytorch tensor. The initial value at time `s`.
- model_prev_list: A list of pytorch tensor. The previous computed model values.
- t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
- t: A pytorch tensor. The ending time, with the shape (1,).
- solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
- The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
- Returns:
- x_t: A pytorch tensor. The approximated solution at time `t`.
- """
- ns = self.noise_schedule
- model_prev_2, model_prev_1, model_prev_0 = model_prev_list
- t_prev_2, t_prev_1, t_prev_0 = t_prev_list
- lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = (
- ns.marginal_lambda(t_prev_2),
- ns.marginal_lambda(t_prev_1),
- ns.marginal_lambda(t_prev_0),
- ns.marginal_lambda(t),
- )
- log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
- sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
- alpha_t = torch.exp(log_alpha_t)
-
- h_1 = lambda_prev_1 - lambda_prev_2
- h_0 = lambda_prev_0 - lambda_prev_1
- h = lambda_t - lambda_prev_0
- r0, r1 = h_0 / h, h_1 / h
- D1_0 = (1.0 / r0) * (model_prev_0 - model_prev_1)
- D1_1 = (1.0 / r1) * (model_prev_1 - model_prev_2)
- D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
- D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
- if self.algorithm_type == "dpmsolver++":
- phi_1 = torch.expm1(-h)
- phi_2 = phi_1 / h + 1.0
- phi_3 = phi_2 / h - 0.5
- return (
- (sigma_t / sigma_prev_0) * x
- - (alpha_t * phi_1) * model_prev_0
- + (alpha_t * phi_2) * D1
- - (alpha_t * phi_3) * D2
- )
- else:
- phi_1 = torch.expm1(h)
- phi_2 = phi_1 / h - 1.0
- phi_3 = phi_2 / h - 0.5
- return (
- (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
- - (sigma_t * phi_1) * model_prev_0
- - (sigma_t * phi_2) * D1
- - (sigma_t * phi_3) * D2
- )
-
- def singlestep_dpm_solver_update(
- self, x, s, t, order, return_intermediate=False, solver_type="dpmsolver", r1=None, r2=None
- ):
- """
- Singlestep DPM-Solver with the order `order` from time `s` to time `t`.
-
- Args:
- x: A pytorch tensor. The initial value at time `s`.
- s: A pytorch tensor. The starting time, with the shape (1,).
- t: A pytorch tensor. The ending time, with the shape (1,).
- order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
- return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
- solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
- The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
- r1: A `float`. The hyperparameter of the second-order or third-order solver.
- r2: A `float`. The hyperparameter of the third-order solver.
- Returns:
- x_t: A pytorch tensor. The approximated solution at time `t`.
- """
- if order == 1:
- return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate)
- elif order == 2:
- return self.singlestep_dpm_solver_second_update(
- x, s, t, return_intermediate=return_intermediate, solver_type=solver_type, r1=r1
- )
- elif order == 3:
- return self.singlestep_dpm_solver_third_update(
- x, s, t, return_intermediate=return_intermediate, solver_type=solver_type, r1=r1, r2=r2
- )
- else:
- raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}")
-
- def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type="dpmsolver"):
- """
- Multistep DPM-Solver with the order `order` from time `t_prev_list[-1]` to time `t`.
-
- Args:
- x: A pytorch tensor. The initial value at time `s`.
- model_prev_list: A list of pytorch tensor. The previous computed model values.
- t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
- t: A pytorch tensor. The ending time, with the shape (1,).
- order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
- solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
- The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
- Returns:
- x_t: A pytorch tensor. The approximated solution at time `t`.
- """
- if order == 1:
- return self.dpm_solver_first_update(x, t_prev_list[-1], t, model_s=model_prev_list[-1])
- elif order == 2:
- return self.multistep_dpm_solver_second_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
- elif order == 3:
- return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
- else:
- raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}")
-
- def dpm_solver_adaptive(
- self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5, solver_type="dpmsolver"
- ):
- """
- The adaptive step size solver based on singlestep DPM-Solver.
-
- Args:
- x: A pytorch tensor. The initial value at time `t_T`.
- order: A `int`. The (higher) order of the solver. We only support order == 2 or 3.
- t_T: A `float`. The starting time of the sampling (default is T).
- t_0: A `float`. The ending time of the sampling (default is epsilon).
- h_init: A `float`. The initial step size (for logSNR).
- atol: A `float`. The absolute tolerance of the solver. For image data, the default setting is 0.0078, followed [1].
- rtol: A `float`. The relative tolerance of the solver. The default setting is 0.05.
- theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, followed [1].
- t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the
- current time and `t_0` is less than `t_err`. The default setting is 1e-5.
- solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
- The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
- Returns:
- x_0: A pytorch tensor. The approximated solution at time `t_0`.
-
- [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021.
- """
- ns = self.noise_schedule
- s = t_T * torch.ones((1,)).to(x)
- lambda_s = ns.marginal_lambda(s)
- lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x))
- h = h_init * torch.ones_like(s).to(x)
- x_prev = x
- nfe = 0
- if order == 2:
- r1 = 0.5
- lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
- higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(
- x, s, t, r1=r1, solver_type=solver_type, **kwargs
- )
- elif order == 3:
- r1, r2 = 1.0 / 3.0, 2.0 / 3.0
- lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(
- x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type
- )
- higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(
- x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs
- )
- else:
- raise ValueError(f"For adaptive step size solver, order must be 2 or 3, got {order}")
- while torch.abs((s - t_0)).mean() > t_err:
- t = ns.inverse_lambda(lambda_s + h)
- x_lower, lower_noise_kwargs = lower_update(x, s, t)
- x_higher = higher_update(x, s, t, **lower_noise_kwargs)
- delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
- norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
- E = norm_fn((x_higher - x_lower) / delta).max()
- if torch.all(E <= 1.0):
- x = x_higher
- s = t
- x_prev = x_lower
- lambda_s = ns.marginal_lambda(s)
- h = torch.min(theta * h * torch.float_power(E, -1.0 / order).float(), lambda_0 - lambda_s)
- nfe += order
- print("adaptive solver nfe", nfe)
- return x
-
- def add_noise(self, x, t, noise=None):
- """
- Compute the noised input xt = alpha_t * x + sigma_t * noise.
-
- Args:
- x: A `torch.Tensor` with shape `(batch_size, *shape)`.
- t: A `torch.Tensor` with shape `(t_size,)`.
- Returns:
- xt with shape `(t_size, batch_size, *shape)`.
- """
- alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
- if noise is None:
- noise = torch.randn((t.shape[0], *x.shape), device=x.device)
- x = x.reshape((-1, *x.shape))
- xt = expand_dims(alpha_t, x.dim()) * x + expand_dims(sigma_t, x.dim()) * noise
- return xt.squeeze(0) if t.shape[0] == 1 else xt
-
- def inverse(
- self,
- x,
- steps=20,
- t_start=None,
- t_end=None,
- order=2,
- skip_type="time_uniform",
- method="multistep",
- lower_order_final=True,
- denoise_to_zero=False,
- solver_type="dpmsolver",
- atol=0.0078,
- rtol=0.05,
- return_intermediate=False,
- ):
- """
- Inverse the sample `x` from time `t_start` to `t_end` by DPM-Solver.
- For discrete-time DPMs, we use `t_start=1/N`, where `N` is the total time steps during training.
- """
- t_0 = 1.0 / self.noise_schedule.total_N if t_start is None else t_start
- t_T = self.noise_schedule.T if t_end is None else t_end
- assert (
- t_0 > 0 and t_T > 0
- ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
- return self.sample(
- x,
- steps=steps,
- t_start=t_0,
- t_end=t_T,
- order=order,
- skip_type=skip_type,
- method=method,
- lower_order_final=lower_order_final,
- denoise_to_zero=denoise_to_zero,
- solver_type=solver_type,
- atol=atol,
- rtol=rtol,
- return_intermediate=return_intermediate,
- )
-
- def sample(
- self,
- x,
- steps=20,
- t_start=None,
- t_end=None,
- order=2,
- skip_type="time_uniform",
- method="multistep",
- lower_order_final=True,
- denoise_to_zero=False,
- solver_type="dpmsolver",
- atol=0.0078,
- rtol=0.05,
- return_intermediate=False,
- ):
- """
- Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.
-
- =====================================================
-
- We support the following algorithms for both noise prediction model and data prediction model:
- - 'singlestep':
- Singlestep DPM-Solver (i.e. "DPM-Solver-fast" in the paper), which combines different orders of singlestep DPM-Solver.
- We combine all the singlestep solvers with order <= `order` to use up all the function evaluations (steps).
- The total number of function evaluations (NFE) == `steps`.
- Given a fixed NFE == `steps`, the sampling procedure is:
- - If `order` == 1:
- - Denote K = steps. We use K steps of DPM-Solver-1 (i.e. DDIM).
- - If `order` == 2:
- - Denote K = (steps // 2) + (steps % 2). We take K intermediate time steps for sampling.
- - If steps % 2 == 0, we use K steps of singlestep DPM-Solver-2.
- - If steps % 2 == 1, we use (K - 1) steps of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
- - If `order` == 3:
- - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
- - If steps % 3 == 0, we use (K - 2) steps of singlestep DPM-Solver-3, and 1 step of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
- - If steps % 3 == 1, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of DPM-Solver-1.
- - If steps % 3 == 2, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of singlestep DPM-Solver-2.
- - 'multistep':
- Multistep DPM-Solver with the order of `order`. The total number of function evaluations (NFE) == `steps`.
- We initialize the first `order` values by lower order multistep solvers.
- Given a fixed NFE == `steps`, the sampling procedure is:
- Denote K = steps.
- - If `order` == 1:
- - We use K steps of DPM-Solver-1 (i.e. DDIM).
- - If `order` == 2:
- - We firstly use 1 step of DPM-Solver-1, then use (K - 1) step of multistep DPM-Solver-2.
- - If `order` == 3:
- - We firstly use 1 step of DPM-Solver-1, then 1 step of multistep DPM-Solver-2, then (K - 2) step of multistep DPM-Solver-3.
- - 'singlestep_fixed':
- Fixed order singlestep DPM-Solver (i.e. DPM-Solver-1 or singlestep DPM-Solver-2 or singlestep DPM-Solver-3).
- We use singlestep DPM-Solver-`order` for `order`=1 or 2 or 3, with total [`steps` // `order`] * `order` NFE.
- - 'adaptive':
- Adaptive step size DPM-Solver (i.e. "DPM-Solver-12" and "DPM-Solver-23" in the paper).
- We ignore `steps` and use adaptive step size DPM-Solver with a higher order of `order`.
-                You can adjust the absolute tolerance `atol` and the relative tolerance `rtol` to balance the computation costs
- (NFE) and the sample quality.
- - If `order` == 2, we use DPM-Solver-12 which combines DPM-Solver-1 and singlestep DPM-Solver-2.
- - If `order` == 3, we use DPM-Solver-23 which combines singlestep DPM-Solver-2 and singlestep DPM-Solver-3.
-
- =====================================================
-
-        Some advice for choosing the algorithm:
- - For **unconditional sampling** or **guided sampling with small guidance scale** by DPMs:
- Use singlestep DPM-Solver or DPM-Solver++ ("DPM-Solver-fast" in the paper) with `order = 3`.
- e.g., DPM-Solver:
- >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver")
- >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
- skip_type='time_uniform', method='singlestep')
- e.g., DPM-Solver++:
- >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
- >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
- skip_type='time_uniform', method='singlestep')
- - For **guided sampling with large guidance scale** by DPMs:
- Use multistep DPM-Solver with `algorithm_type="dpmsolver++"` and `order = 2`.
- e.g.
- >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
- >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=2,
- skip_type='time_uniform', method='multistep')
-
- We support three types of `skip_type`:
- - 'logSNR': uniform logSNR for the time steps. **Recommended for low-resolutional images**
- - 'time_uniform': uniform time for the time steps. **Recommended for high-resolutional images**.
- - 'time_quadratic': quadratic time for the time steps.
-
- =====================================================
- Args:
- x: A pytorch tensor. The initial value at time `t_start`
- e.g. if `t_start` == T, then `x` is a sample from the standard normal distribution.
- steps: An `int`. The total number of function evaluations (NFE).
- t_start: A `float`. The starting time of the sampling.
- If `t_start` is None, we use self.noise_schedule.T (default is 1.0).
- t_end: A `float`. The ending time of the sampling.
- If `t_end` is None, we use 1. / self.noise_schedule.total_N.
- e.g. if total_N == 1000, we have `t_end` == 1e-3.
- For discrete-time DPMs:
- - We recommend `t_end` == 1. / self.noise_schedule.total_N.
- For continuous-time DPMs:
- - We recommend `t_end` == 1e-3 when `steps` <= 15; and `t_end` == 1e-4 when `steps` > 15.
- order: An `int`. The order of DPM-Solver.
- skip_type: A `str`. The type for the spacing of the time steps. 'time_uniform' or 'logSNR' or 'time_quadratic'.
- method: A `str`. The method for sampling. 'singlestep' or 'multistep' or 'singlestep_fixed' or 'adaptive'.
- denoise_to_zero: A `bool`. Whether to denoise to time 0 at the final step.
- Default is `False`. If `denoise_to_zero` is `True`, the total NFE is (`steps` + 1).
-
- This trick was first proposed by DDPM (https://arxiv.org/abs/2006.11239) and
- score_sde (https://arxiv.org/abs/2011.13456). It can improve the FID when sampling
- diffusion models by diffusion SDEs on low-resolution images (such as CIFAR-10).
- However, we observed that it does not matter for high-resolution images. As it
- needs an additional NFE, we do not recommend it for high-resolution images.
- lower_order_final: A `bool`. Whether to use lower-order solvers at the final steps.
- Only valid for `method=multistep` and `steps < 15`. We empirically find that
- this trick is key to stabilizing sampling with DPM-Solver when using very few
- steps (especially steps <= 10), so we recommend setting it to `True`.
- solver_type: A `str`. The Taylor expansion type for the solver. `dpmsolver` or `taylor`. We recommend `dpmsolver`.
- atol: A `float`. The absolute tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
- rtol: A `float`. The relative tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
- return_intermediate: A `bool`. Whether to save x_t at each step.
- When set to `True`, the method returns a tuple (x0, intermediates); when set to `False`, it returns only x0.
- Returns:
- x_end: A pytorch tensor. The approximated solution at time `t_end`.
-
- """
- t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end
- t_T = self.noise_schedule.T if t_start is None else t_start
- assert (
- t_0 > 0 and t_T > 0
- ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
- if return_intermediate:
- assert method in [
- "multistep",
- "singlestep",
- "singlestep_fixed",
- ], "Cannot use adaptive solver when saving intermediate values"
- if self.correcting_xt_fn is not None:
- assert method in [
- "multistep",
- "singlestep",
- "singlestep_fixed",
- ], "Cannot use adaptive solver when correcting_xt_fn is not None"
- device = x.device
- intermediates = []
- with torch.no_grad():
- if method == "adaptive":
- x = self.dpm_solver_adaptive(
- x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol, solver_type=solver_type
- )
- elif method == "multistep":
- assert steps >= order
- timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
- assert timesteps.shape[0] - 1 == steps
- # Init the initial values.
- step = 0
- t = timesteps[step]
- t_prev_list = [t]
- model_prev_list = [self.model_fn(x, t)]
- if self.correcting_xt_fn is not None:
- x = self.correcting_xt_fn(x, t, step)
- if return_intermediate:
- intermediates.append(x)
- # Init the first `order` values by lower order multistep DPM-Solver.
- for step in range(1, order):
- t = timesteps[step]
- x = self.multistep_dpm_solver_update(
- x, model_prev_list, t_prev_list, t, step, solver_type=solver_type
- )
- if self.correcting_xt_fn is not None:
- x = self.correcting_xt_fn(x, t, step)
- if return_intermediate:
- intermediates.append(x)
- t_prev_list.append(t)
- model_prev_list.append(self.model_fn(x, t))
- # Compute the remaining values by `order`-th order multistep DPM-Solver.
- for step in tqdm(range(order, steps + 1)):
- t = timesteps[step]
- # We only use lower order for steps < 10
- if lower_order_final and steps < 10:
- step_order = min(order, steps + 1 - step)
- else:
- step_order = order
- x = self.multistep_dpm_solver_update(
- x, model_prev_list, t_prev_list, t, step_order, solver_type=solver_type
- )
- if self.correcting_xt_fn is not None:
- x = self.correcting_xt_fn(x, t, step)
- if return_intermediate:
- intermediates.append(x)
- for i in range(order - 1):
- t_prev_list[i] = t_prev_list[i + 1]
- model_prev_list[i] = model_prev_list[i + 1]
- t_prev_list[-1] = t
- # We do not need to evaluate the final model value.
- if step < steps:
- model_prev_list[-1] = self.model_fn(x, t)
- elif method in ["singlestep", "singlestep_fixed"]:
- if method == "singlestep":
- timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(
- steps=steps, order=order, skip_type=skip_type, t_T=t_T, t_0=t_0, device=device
- )
- elif method == "singlestep_fixed":
- K = steps // order
- orders = [
- order,
- ] * K
- timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
- for step, order in enumerate(orders):
- s, t = timesteps_outer[step], timesteps_outer[step + 1]
- timesteps_inner = self.get_time_steps(
- skip_type=skip_type, t_T=s.item(), t_0=t.item(), N=order, device=device
- )
- lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
- h = lambda_inner[-1] - lambda_inner[0]
- r1 = None if order <= 1 else (lambda_inner[1] - lambda_inner[0]) / h
- r2 = None if order <= 2 else (lambda_inner[2] - lambda_inner[0]) / h
- x = self.singlestep_dpm_solver_update(x, s, t, order, solver_type=solver_type, r1=r1, r2=r2)
- if self.correcting_xt_fn is not None:
- x = self.correcting_xt_fn(x, t, step)
- if return_intermediate:
- intermediates.append(x)
- else:
- raise ValueError(f"Got wrong method {method}")
- if denoise_to_zero:
- t = torch.ones((1,)).to(device) * t_0
- x = self.denoise_to_zero_fn(x, t)
- if self.correcting_xt_fn is not None:
- x = self.correcting_xt_fn(x, t, step + 1)
- if return_intermediate:
- intermediates.append(x)
- return (x, intermediates) if return_intermediate else x
-
-
-#############################################################
-# other utility functions
-#############################################################
-
-
-def interpolate_fn(x, xp, yp):
- """
- A piecewise linear function y = f(x), using xp and yp as keypoints.
- We implement f(x) in a differentiable way (i.e. applicable for autograd).
- The function f(x) is well-defined for all real x. (For x beyond the bounds of xp, we use the outermost points of xp to define the linear function.)
-
- Args:
- x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
- xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
- yp: PyTorch tensor with shape [C, K].
- Returns:
- The function values f(x), with shape [N, C].
- """
- N, K = x.shape[0], xp.shape[1]
- all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2)
- sorted_all_x, x_indices = torch.sort(all_x, dim=2)
- x_idx = torch.argmin(x_indices, dim=2)
- cand_start_idx = x_idx - 1
- start_idx = torch.where(
- torch.eq(x_idx, 0),
- torch.tensor(1, device=x.device),
- torch.where(
- torch.eq(x_idx, K),
- torch.tensor(K - 2, device=x.device),
- cand_start_idx,
- ),
- )
- end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
- start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2)
- end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2)
- start_idx2 = torch.where(
- torch.eq(x_idx, 0),
- torch.tensor(0, device=x.device),
- torch.where(
- torch.eq(x_idx, K),
- torch.tensor(K - 2, device=x.device),
- cand_start_idx,
- ),
- )
- y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1)
- start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2)
- end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
- return start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
-
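# Illustrative sketch, not part of the original file: how interpolate_fn behaves on a tiny
# keypoint set, assuming the function above is importable as shown. Shapes follow the
# docstring: queries are [N, C], keypoints are [C, K].
import torch

xp = torch.tensor([[0.0, 1.0, 2.0]])       # [C=1, K=3] x-axis keypoints
yp = torch.tensor([[0.0, 10.0, 20.0]])     # [C=1, K=3] matching y values
x = torch.tensor([[0.5], [1.5], [3.0]])    # [N=3, C=1] query points
y = interpolate_fn(x, xp, yp)
# y == [[5.], [15.], [30.]]; the last query lies beyond xp and is linearly extrapolated
# from the two outermost keypoints, as the docstring describes.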
-
-def expand_dims(v, dims):
- """
- Expand the tensor `v` to `dims` dimensions.
-
- Args:
- `v`: a PyTorch tensor with shape [N].
- `dims`: an `int`.
- Returns:
- a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
- """
- return v[(...,) + (None,) * (dims - 1)]
-
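# Quick illustrative check, not part of the original file: expand_dims appends trailing
# singleton dimensions so a per-sample scalar (e.g. a timestep coefficient) can broadcast
# over image- or video-shaped tensors.
import torch

v = torch.ones(4)            # shape [N] = [4]
v4 = expand_dims(v, 4)       # shape [4, 1, 1, 1]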
-
-def DPMS(
- model,
- condition,
- uncondition,
- cfg_scale,
- model_type="noise",
- noise_schedule="linear",
- guidance_type="classifier-free",
- model_kwargs=None,
- diffusion_steps=1000,
-):
- if model_kwargs is None:
- model_kwargs = {}
- betas = torch.tensor(get_named_beta_schedule(noise_schedule, diffusion_steps))
-
- ## 1. Define the noise schedule.
- noise_schedule = NoiseScheduleVP(schedule="discrete", betas=betas)
-
- ## 2. Convert your discrete-time `model` to the continuous-time
- ## noise prediction model. Here is an example for a diffusion model
- ## `model` with the noise prediction type ("noise") .
- model_fn = model_wrapper(
- model,
- noise_schedule,
- model_type=model_type,
- model_kwargs=model_kwargs,
- guidance_type=guidance_type,
- condition=condition,
- unconditional_condition=uncondition,
- guidance_scale=cfg_scale,
- )
- ## 3. Define dpm-solver and sample by multistep DPM-Solver.
- return DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
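# Hedged usage sketch, not part of the original file: how the DPMS factory above could be paired
# with DPM_Solver.sample for classifier-free-guided sampling. `model`, `cond_emb`, `uncond_emb`
# and the latent shape are placeholders, not values taken from this repository.
import torch

solver = DPMS(model, condition=cond_emb, uncondition=uncond_emb, cfg_scale=7.0)
x_T = torch.randn(1, 4, 16, 32, 32)      # assumed latent shape, purely illustrative
x_0 = solver.sample(
    x_T,
    steps=20,
    order=2,
    skip_type="time_uniform",
    method="multistep",
)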
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/__init__.py
deleted file mode 100644
index 2dfa9f4957b1de09c7e20918f4b5cca241673604..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/__init__.py
+++ /dev/null
@@ -1,104 +0,0 @@
-from functools import partial
-
-import torch
-
-from opensora.registry import SCHEDULERS
-
-from . import gaussian_diffusion as gd
-from .respace import SpacedDiffusion, space_timesteps
-from .speed import SpeeDiffusion
-
-
-@SCHEDULERS.register_module("iddpm")
-class IDDPM(SpacedDiffusion):
- def __init__(
- self,
- num_sampling_steps=None,
- timestep_respacing=None,
- noise_schedule="linear",
- use_kl=False,
- sigma_small=False,
- predict_xstart=False,
- learn_sigma=True,
- rescale_learned_sigmas=False,
- diffusion_steps=1000,
- cfg_scale=4.0,
- cfg_channel=None,
- ):
- betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
- if use_kl:
- loss_type = gd.LossType.RESCALED_KL
- elif rescale_learned_sigmas:
- loss_type = gd.LossType.RESCALED_MSE
- else:
- loss_type = gd.LossType.MSE
- if num_sampling_steps is not None:
- assert timestep_respacing is None
- timestep_respacing = str(num_sampling_steps)
- if timestep_respacing is None or timestep_respacing == "":
- timestep_respacing = [diffusion_steps]
- super().__init__(
- use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
- betas=betas,
- model_mean_type=(gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X),
- model_var_type=(
- (gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL)
- if not learn_sigma
- else gd.ModelVarType.LEARNED_RANGE
- ),
- loss_type=loss_type,
- # rescale_timesteps=rescale_timesteps,
- )
-
- self.cfg_scale = cfg_scale
- self.cfg_channel = cfg_channel
-
- def sample(
- self,
- model,
- text_encoder,
- z,
- prompts,
- device,
- additional_args=None,
- mask=None,
- ):
- n = len(prompts)
- z = torch.cat([z, z], 0)
- model_args = text_encoder.encode(prompts)
- y_null = text_encoder.null(n)
- model_args["y"] = torch.cat([model_args["y"], y_null], 0)
- if additional_args is not None:
- model_args.update(additional_args)
-
- forward = partial(forward_with_cfg, model, cfg_scale=self.cfg_scale, cfg_channel=self.cfg_channel)
- samples = self.p_sample_loop(
- forward,
- z.shape,
- z,
- clip_denoised=False,
- model_kwargs=model_args,
- progress=True,
- device=device,
- mask=mask,
- )
- samples, _ = samples.chunk(2, dim=0)
- return samples
-
-
-def forward_with_cfg(model, x, timestep, y, cfg_scale, cfg_channel=None, **kwargs):
- # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
- half = x[: len(x) // 2]
- combined = torch.cat([half, half], dim=0)
- if "x_mask" in kwargs and kwargs["x_mask"] is not None:
- if len(kwargs["x_mask"]) != len(x):
- kwargs["x_mask"] = torch.cat([kwargs["x_mask"], kwargs["x_mask"]], dim=0)
- model_out = model.forward(combined, timestep, y, **kwargs)
- model_out = model_out["x"] if isinstance(model_out, dict) else model_out
- if cfg_channel is None:
- cfg_channel = model_out.shape[1] // 2
- eps, rest = model_out[:, :cfg_channel], model_out[:, cfg_channel:]
- cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
- half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
- eps = torch.cat([half_eps, half_eps], dim=0)
- return torch.cat([eps, rest], dim=1)
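# Illustrative sketch, not part of the original file, of the classifier-free guidance combination
# used in forward_with_cfg above: the batch holds conditional and unconditional copies, and the
# guided noise prediction is uncond_eps + cfg_scale * (cond_eps - uncond_eps).
import torch

cond_eps = torch.tensor([1.0])
uncond_eps = torch.tensor([0.2])
cfg_scale = 4.0
guided = uncond_eps + cfg_scale * (cond_eps - uncond_eps)   # tensor([3.4])
# cfg_scale == 1.0 recovers the purely conditional prediction; larger values push the sample
# further along the conditional direction.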
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/diffusion_utils.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/diffusion_utils.py
deleted file mode 100644
index 0e15d3c5d76ff16d62778299520a5c357eea7784..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/diffusion_utils.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Adapted from DiT
-
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# DiT: https://github.com/facebookresearch/DiT/tree/main
-# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-# --------------------------------------------------------
-
-
-import numpy as np
-import torch
-
-
-def normal_kl(mean1, logvar1, mean2, logvar2):
- """
- Compute the KL divergence between two gaussians.
- Shapes are automatically broadcasted, so batches can be compared to
- scalars, among other use cases.
- """
- tensor = None
- for obj in (mean1, logvar1, mean2, logvar2):
- if isinstance(obj, torch.Tensor):
- tensor = obj
- break
- assert tensor is not None, "at least one argument must be a Tensor"
-
- # Force variances to be Tensors. Broadcasting helps convert scalars to
- # Tensors, but it does not work for torch.exp().
- logvar1, logvar2 = [x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) for x in (logvar1, logvar2)]
-
- return 0.5 * (
- -1.0 + logvar2 - logvar1 + torch.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
- )
-
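# Small numeric check, not part of the original file: for univariate Gaussians the expression
# above is the standard closed form
#   KL = 0.5 * (logvar2 - logvar1 + exp(logvar1 - logvar2) + (mean1 - mean2)^2 * exp(-logvar2) - 1).
import torch

kl = normal_kl(torch.tensor(0.0), torch.tensor(0.0), torch.tensor(1.0), torch.tensor(0.0))
# kl == tensor(0.5), i.e. KL(N(0, 1) || N(1, 1)) = 0.5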
-
-def approx_standard_normal_cdf(x):
- """
- A fast approximation of the cumulative distribution function of the
- standard normal.
- """
- return 0.5 * (1.0 + torch.tanh(np.sqrt(2.0 / torch.pi) * (x + 0.044715 * torch.pow(x, 3))))
-
-
-def continuous_gaussian_log_likelihood(x, *, means, log_scales):
- """
- Compute the log-likelihood of a continuous Gaussian distribution.
- :param x: the targets
- :param means: the Gaussian mean Tensor.
- :param log_scales: the Gaussian log stddev Tensor.
- :return: a tensor like x of log probabilities (in nats).
- """
- centered_x = x - means
- inv_stdv = torch.exp(-log_scales)
- normalized_x = centered_x * inv_stdv
- log_probs = torch.distributions.Normal(torch.zeros_like(x), torch.ones_like(x)).log_prob(normalized_x)
- return log_probs
-
-
-def discretized_gaussian_log_likelihood(x, *, means, log_scales):
- """
- Compute the log-likelihood of a Gaussian distribution discretizing to a
- given image.
- :param x: the target images. It is assumed that this was uint8 values,
- rescaled to the range [-1, 1].
- :param means: the Gaussian mean Tensor.
- :param log_scales: the Gaussian log stddev Tensor.
- :return: a tensor like x of log probabilities (in nats).
- """
- assert x.shape == means.shape == log_scales.shape
- centered_x = x - means
- inv_stdv = torch.exp(-log_scales)
- plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
- cdf_plus = approx_standard_normal_cdf(plus_in)
- min_in = inv_stdv * (centered_x - 1.0 / 255.0)
- cdf_min = approx_standard_normal_cdf(min_in)
- log_cdf_plus = torch.log(cdf_plus.clamp(min=1e-12))
- log_one_minus_cdf_min = torch.log((1.0 - cdf_min).clamp(min=1e-12))
- cdf_delta = cdf_plus - cdf_min
- log_probs = torch.where(
- x < -0.999,
- log_cdf_plus,
- torch.where(x > 0.999, log_one_minus_cdf_min, torch.log(cdf_delta.clamp(min=1e-12))),
- )
- assert log_probs.shape == x.shape
- return log_probs
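# Illustrative check, not part of the original file: for a pixel exactly at the Gaussian mean
# with unit scale, the discretized likelihood is roughly the density at 0 times the bin width
# 2/255, i.e. log(0.3989 * 2 / 255) ≈ -5.77 nats.
import torch

x = torch.zeros(1, 1, 1, 1)
ll = discretized_gaussian_log_likelihood(x, means=torch.zeros_like(x), log_scales=torch.zeros_like(x))
# ll ≈ tensor([[[[-5.77]]]])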
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/gaussian_diffusion.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/gaussian_diffusion.py
deleted file mode 100644
index 5aa20c7d2cb9b9f0e0915b3b3592227373256523..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/gaussian_diffusion.py
+++ /dev/null
@@ -1,895 +0,0 @@
-# Adapted from DiT
-
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# DiT: https://github.com/facebookresearch/DiT/tree/main
-# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-# --------------------------------------------------------
-
-import enum
-from typing import Callable, List
-
-import numpy as np
-import torch
-from einops import rearrange
-
-from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
-
-
-def mean_flat(tensor: torch.Tensor, mask=None):
- """
- Take the mean over all non-batch dimensions.
- """
- if mask is None:
- return tensor.mean(dim=list(range(1, len(tensor.shape))))
- else:
- assert tensor.dim() == 5
- assert tensor.shape[2] == mask.shape[1]
- tensor = rearrange(tensor, "b c t h w -> b t (c h w)")
- denom = mask.sum(dim=1) * tensor.shape[-1]
- loss = (tensor * mask.unsqueeze(2)).sum(dim=1).sum(dim=1) / denom
- return loss
-
-
-class ModelMeanType(enum.Enum):
- """
- Which type of output the model predicts.
- """
-
- PREVIOUS_X = enum.auto() # the model predicts x_{t-1}
- START_X = enum.auto() # the model predicts x_0
- EPSILON = enum.auto() # the model predicts epsilon
-
-
-class ModelVarType(enum.Enum):
- """
- What is used as the model's output variance.
- The LEARNED_RANGE option has been added to allow the model to predict
- values between FIXED_SMALL and FIXED_LARGE, making its job easier.
- """
-
- LEARNED = enum.auto()
- FIXED_SMALL = enum.auto()
- FIXED_LARGE = enum.auto()
- LEARNED_RANGE = enum.auto()
-
-
-class LossType(enum.Enum):
- MSE = enum.auto() # use raw MSE loss (and KL when learning variances)
- RESCALED_MSE = enum.auto() # use raw MSE loss (with RESCALED_KL when learning variances)
- KL = enum.auto() # use the variational lower-bound
- RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB
-
- def is_vb(self):
- return self == LossType.KL or self == LossType.RESCALED_KL
-
-
-def _warmup_beta(beta_start: float, beta_end: float, num_diffusion_timesteps: int, warmup_frac: float) -> torch.Tensor:
- betas = beta_end * torch.ones(num_diffusion_timesteps, dtype=torch.float64)
- warmup_time = int(num_diffusion_timesteps * warmup_frac)
- betas[:warmup_time] = torch.linspace(beta_start, beta_end, warmup_time, dtype=torch.float64)
- return betas
-
-
-def get_beta_schedule(
- beta_schedule: str, *, beta_start: float, beta_end: float, num_diffusion_timesteps: int
-) -> torch.Tensor:
- """
- This is the deprecated API for creating beta schedules.
- See get_named_beta_schedule() for the new library of schedules.
- """
- if beta_schedule == "quad":
- betas = (
- torch.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_diffusion_timesteps,
- dtype=torch.float64,
- )
- ** 2
- )
- elif beta_schedule == "linear":
- betas = torch.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=torch.float64)
- elif beta_schedule == "warmup10":
- betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
- elif beta_schedule == "warmup50":
- betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
- elif beta_schedule == "const":
- betas = beta_end * torch.ones(num_diffusion_timesteps, dtype=torch.float64)
- elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1
- betas = 1.0 / torch.linspace(num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=torch.float64)
- else:
- raise NotImplementedError(beta_schedule)
- assert betas.shape == (num_diffusion_timesteps,)
- return betas
-
-
-def betas_for_alpha_bar(num_diffusion_timesteps: int, alpha_bar: Callable, max_beta: float = 0.999):
- """
- Create a beta schedule that discretizes the given alpha_t_bar function,
- which defines the cumulative product of (1-beta) over time from t = [0,1].
- :param num_diffusion_timesteps: the number of betas to produce.
- :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
- produces the cumulative product of (1-beta) up to that
- part of the diffusion process.
- :param max_beta: the maximum beta to use; use values lower than 1 to
- prevent singularities.
- """
- betas = []
- for i in range(num_diffusion_timesteps):
- t1 = i / num_diffusion_timesteps
- t2 = (i + 1) / num_diffusion_timesteps
- betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
- return torch.DoubleTensor(betas)
-
-
-def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
- """
- Get a pre-defined beta schedule for the given name.
- The beta schedule library consists of beta schedules which remain similar
- in the limit of num_diffusion_timesteps.
- Beta schedules may be added, but should not be removed or changed once
- they are committed to maintain backwards compatibility.
- """
- if schedule_name == "linear":
- # Linear schedule from Ho et al, extended to work for any number of
- # diffusion steps.
- scale = 1000 / num_diffusion_timesteps
- return get_beta_schedule(
- "linear",
- beta_start=scale * 0.0001,
- beta_end=scale * 0.02,
- num_diffusion_timesteps=num_diffusion_timesteps,
- )
- elif schedule_name == "squaredcos_cap_v2":
- return betas_for_alpha_bar(
- num_diffusion_timesteps,
- lambda t: np.cos((t + 0.008) / 1.008 * np.pi / 2) ** 2,
- )
- else:
- raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
-
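# Illustrative sketch, not part of the original file: the two named schedules above. With the
# default 1000 steps the "linear" schedule runs from 1e-4 to 2e-2; "squaredcos_cap_v2" is the
# cosine schedule of Nichol & Dhariwal, capped at max_beta = 0.999.
betas_linear = get_named_beta_schedule("linear", 1000)             # shape [1000]; 1e-4 ... 0.02
betas_cosine = get_named_beta_schedule("squaredcos_cap_v2", 1000)  # shape [1000]; all <= 0.999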
-
-class GaussianDiffusion:
- """
- Utilities for training and sampling diffusion models.
- Original ported from this codebase:
- https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
- :param betas: a 1-D torch tensor of betas for each diffusion timestep,
- starting at T and going to 1.
- """
-
- def __init__(
- self, *, betas: torch.Tensor, model_mean_type: str, model_var_type: str, loss_type: str, device: str = "cuda"
- ):
- if device == "cuda":
- device = torch.device(f"cuda:{torch.cuda.current_device()}")
- elif device == "cpu":
- device = torch.device("cpu")
- else:
- raise ValueError(f"Unknown device: {device}")
- self.device = device
- self.model_mean_type = model_mean_type
- self.model_var_type = model_var_type
- self.loss_type = loss_type
-
- # Use float64 for accuracy.
- self.betas = betas.to(self.device)
- assert len(self.betas.shape) == 1, "betas must be 1-D"
- assert (self.betas > 0).all() and (self.betas <= 1).all()
-
- self.num_timesteps = int(betas.shape[0])
-
- alphas = 1.0 - self.betas
- self.alphas_cumprod = torch.cumprod(alphas, axis=0)
- self.alphas_cumprod_prev = torch.cat([torch.tensor([1.0], device=self.device), self.alphas_cumprod[:-1]])
- self.alphas_cumprod_next = torch.cat([self.alphas_cumprod[1:], torch.tensor([0.0], device=self.device)])
- assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
-
- # calculations for diffusion q(x_t | x_{t-1}) and others
- self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
- self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - self.alphas_cumprod)
- self.log_one_minus_alphas_cumprod = torch.log(1.0 - self.alphas_cumprod)
- self.sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod)
- self.sqrt_recipm1_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod - 1)
-
- # calculations for posterior q(x_{t-1} | x_t, x_0)
- self.posterior_variance = self.betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
- # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
- self.posterior_log_variance_clipped = (
- torch.log(torch.cat([self.posterior_variance[1].unsqueeze(0), self.posterior_variance[1:]]))
- if len(self.posterior_variance) > 1
- else torch.DoubleTensor([])
- )
-
- self.posterior_mean_coef1 = self.betas * torch.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
- self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * torch.sqrt(alphas) / (1.0 - self.alphas_cumprod)
-
- def q_mean_variance(self, x_start, t):
- """
- Get the distribution q(x_t | x_0).
- :param x_start: the [N x C x ...] tensor of noiseless inputs.
- :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
- :return: A tuple (mean, variance, log_variance), all of x_start's shape.
- """
- mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
- variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
- log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
- return mean, variance, log_variance
-
- def q_sample(self, x_start, t, noise=None):
- """
- Diffuse the data for a given number of diffusion steps.
- In other words, sample from q(x_t | x_0).
- :param x_start: the initial data batch.
- :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
- :param noise: if specified, the split-out normal noise.
- :return: A noisy version of x_start.
- """
- if noise is None:
- noise = torch.randn_like(x_start)
- assert noise.shape == x_start.shape
- return (
- _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
- + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
- )
-
- def q_posterior_mean_variance(self, x_start, x_t, t):
- """
- Compute the mean and variance of the diffusion posterior:
- q(x_{t-1} | x_t, x_0)
- """
- assert x_start.shape == x_t.shape
- posterior_mean = (
- _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
- + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
- )
- posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
- posterior_log_variance_clipped = _extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape)
- assert (
- posterior_mean.shape[0]
- == posterior_variance.shape[0]
- == posterior_log_variance_clipped.shape[0]
- == x_start.shape[0]
- )
- return posterior_mean, posterior_variance, posterior_log_variance_clipped
-
- def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
- """
- Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
- the initial x, x_0.
- :param model: the model, which takes a signal and a batch of timesteps
- as input.
- :param x: the [N x C x ...] tensor at time t.
- :param t: a 1-D Tensor of timesteps.
- :param clip_denoised: if True, clip the denoised signal into [-1, 1].
- :param denoised_fn: if not None, a function which applies to the
- x_start prediction before it is used to sample. Applies before
- clip_denoised.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :return: a dict with the following keys:
- - 'mean': the model mean output.
- - 'variance': the model variance output.
- - 'log_variance': the log of 'variance'.
- - 'pred_xstart': the prediction for x_0.
- """
- if model_kwargs is None:
- model_kwargs = {}
-
- B, C = x.shape[:2]
- assert t.shape == (B,)
- model_output = model(x, t, **model_kwargs)
- if isinstance(model_output, tuple):
- model_output, extra = model_output
- else:
- extra = None
-
- if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
- assert model_output.shape == (B, C * 2, *x.shape[2:])
- model_output, model_var_values = torch.split(model_output, C, dim=1)
- min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
- max_log = _extract_into_tensor(torch.log(self.betas), t, x.shape)
- # The model_var_values is [-1, 1] for [min_var, max_var].
- frac = (model_var_values + 1) / 2
- model_log_variance = frac * max_log + (1 - frac) * min_log
- model_variance = torch.exp(model_log_variance)
- else:
- model_variance, model_log_variance = {
- # for fixedlarge, we set the initial (log-)variance like so
- # to get a better decoder log likelihood.
- ModelVarType.FIXED_LARGE: (
- torch.cat([self.posterior_variance[1].unsqueeze(0), self.betas[1:]]),
- torch.log(torch.cat([self.posterior_variance[1].unsqueeze(0), self.betas[1:]])),
- ),
- ModelVarType.FIXED_SMALL: (
- self.posterior_variance,
- self.posterior_log_variance_clipped,
- ),
- }[self.model_var_type]
- model_variance = _extract_into_tensor(model_variance, t, x.shape)
- model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
-
- def process_xstart(x):
- if denoised_fn is not None:
- x = denoised_fn(x)
- if clip_denoised:
- return x.clamp(-1, 1)
- return x
-
- if self.model_mean_type == ModelMeanType.START_X:
- pred_xstart = process_xstart(model_output)
- else:
- pred_xstart = process_xstart(self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output))
- model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
-
- assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
- return {
- "mean": model_mean,
- "variance": model_variance,
- "log_variance": model_log_variance,
- "pred_xstart": pred_xstart,
- "extra": extra,
- }
-
- def _predict_xstart_from_eps(self, x_t, t, eps):
- assert x_t.shape == eps.shape
- return (
- _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
- - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
- )
-
- def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
- return (
- _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
- ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
-
- def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
- """
- Compute the mean for the previous step, given a function cond_fn that
- computes the gradient of a conditional log probability with respect to
- x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
- condition on y.
- This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
- """
- gradient = cond_fn(x, t, **model_kwargs)
- new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
- return new_mean
-
- def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
- """
- Compute what the p_mean_variance output would have been, should the
- model's score function be conditioned by cond_fn.
- See condition_mean() for details on cond_fn.
- Unlike condition_mean(), this instead uses the conditioning strategy
- from Song et al (2020).
- """
- alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
-
- eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
- eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)
-
- out = p_mean_var.copy()
- out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
- out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
- return out
-
- def p_sample(
- self,
- model,
- x,
- t,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- mask=None,
- ):
- """
- Sample x_{t-1} from the model at the given timestep.
- :param model: the model to sample from.
- :param x: the current tensor at x_{t-1}.
- :param t: the value of t, starting at 0 for the first diffusion step.
- :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
- :param denoised_fn: if not None, a function which applies to the
- x_start prediction before it is used to sample.
- :param cond_fn: if not None, this is a gradient function that acts
- similarly to the model.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :return: a dict containing the following keys:
- - 'sample': a random sample from the model.
- - 'pred_xstart': a prediction of x_0.
- """
- if mask is not None:
- if mask.shape[0] != x.shape[0]:
- mask = mask.repeat(2, 1) # HACK
- mask_t = (mask * len(self.betas)).to(torch.int)
-
- # x0: copy unchanged x values
- # x_noise: add noise to x values
- x0 = x.clone()
- x_noise = x0 * _extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape) + torch.randn_like(
- x
- ) * _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x.shape)
-
- # active noise addition
- mask_t_equal = (mask_t == t.unsqueeze(1))[:, None, :, None, None]
- x = torch.where(mask_t_equal, x_noise, x0)
-
- # create x_mask
- mask_t_upper = (mask_t > t.unsqueeze(1))[:, None, :, None, None]
- batch_size = x.shape[0]
- model_kwargs["x_mask"] = mask_t_upper.reshape(batch_size, -1).to(torch.bool)
-
- out = self.p_mean_variance(
- model,
- x,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- model_kwargs=model_kwargs,
- )
- noise = torch.randn_like(x)
- nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) # no noise when t == 0
- if cond_fn is not None:
- out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
- sample = out["mean"] + nonzero_mask * torch.exp(0.5 * out["log_variance"]) * noise
-
- if mask is not None:
- mask_t_lower = (mask_t < t.unsqueeze(1))[:, None, :, None, None]
- sample = torch.where(mask_t_lower, x0, sample)
-
- return {"sample": sample, "pred_xstart": out["pred_xstart"]}
-
- def p_sample_loop(
- self,
- model,
- shape,
- noise=None,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- device=None,
- progress=False,
- mask=None,
- ):
- """
- Generate samples from the model.
- :param model: the model module.
- :param shape: the shape of the samples, (N, C, H, W).
- :param noise: if specified, the noise from the encoder to sample.
- Should be of the same shape as `shape`.
- :param clip_denoised: if True, clip x_start predictions to [-1, 1].
- :param denoised_fn: if not None, a function which applies to the
- x_start prediction before it is used to sample.
- :param cond_fn: if not None, this is a gradient function that acts
- similarly to the model.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :param device: if specified, the device to create the samples on.
- If not specified, use a model parameter's device.
- :param progress: if True, show a tqdm progress bar.
- :return: a non-differentiable batch of samples.
- """
- final = None
- for sample in self.p_sample_loop_progressive(
- model,
- shape,
- noise=noise,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- cond_fn=cond_fn,
- model_kwargs=model_kwargs,
- device=device,
- progress=progress,
- mask=mask,
- ):
- final = sample
- return final["sample"]
-
- def p_sample_loop_progressive(
- self,
- model,
- shape,
- noise=None,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- device=None,
- progress=False,
- mask=None,
- ):
- """
- Generate samples from the model and yield intermediate samples from
- each timestep of diffusion.
- Arguments are the same as p_sample_loop().
- Returns a generator over dicts, where each dict is the return value of
- p_sample().
- """
- if device is None:
- device = next(model.parameters()).device
- assert isinstance(shape, (tuple, list))
- if noise is not None:
- img = noise
- else:
- img = torch.randn(*shape, device=device)
- indices = list(range(self.num_timesteps))[::-1]
-
- if progress:
- # Lazy import so that we don't depend on tqdm.
- from tqdm.auto import tqdm
-
- indices = tqdm(indices)
-
- for i in indices:
- t = torch.tensor([i] * shape[0], device=device)
- with torch.no_grad():
- out = self.p_sample(
- model,
- img,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- cond_fn=cond_fn,
- model_kwargs=model_kwargs,
- mask=mask,
- )
- yield out
- img = out["sample"]
-
- def ddim_sample(
- self,
- model,
- x,
- t,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- eta=0.0,
- ):
- """
- Sample x_{t-1} from the model using DDIM.
- Same usage as p_sample().
- """
- out = self.p_mean_variance(
- model,
- x,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- model_kwargs=model_kwargs,
- )
- if cond_fn is not None:
- out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
-
- # Usually our model outputs epsilon, but we re-derive it
- # in case we used x_start or x_prev prediction.
- eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
-
- alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
- alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
- sigma = eta * torch.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * torch.sqrt(1 - alpha_bar / alpha_bar_prev)
- # Equation 12.
- noise = torch.randn_like(x)
- mean_pred = out["pred_xstart"] * torch.sqrt(alpha_bar_prev) + torch.sqrt(1 - alpha_bar_prev - sigma**2) * eps
- nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) # no noise when t == 0
- sample = mean_pred + nonzero_mask * sigma * noise
- return {"sample": sample, "pred_xstart": out["pred_xstart"]}
-
- def ddim_reverse_sample(
- self,
- model,
- x,
- t,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- eta=0.0,
- ):
- """
- Sample x_{t+1} from the model using DDIM reverse ODE.
- """
- assert eta == 0.0, "Reverse ODE only for deterministic path"
- out = self.p_mean_variance(
- model,
- x,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- model_kwargs=model_kwargs,
- )
- if cond_fn is not None:
- out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
- # Usually our model outputs epsilon, but we re-derive it
- # in case we used x_start or x_prev prediction.
- eps = (
- _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x - out["pred_xstart"]
- ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
- alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
-
- # Equation 12. reversed
- mean_pred = out["pred_xstart"] * torch.sqrt(alpha_bar_next) + torch.sqrt(1 - alpha_bar_next) * eps
-
- return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
-
- def ddim_sample_loop(
- self,
- model,
- shape,
- noise=None,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- device=None,
- progress=False,
- eta=0.0,
- ):
- """
- Generate samples from the model using DDIM.
- Same usage as p_sample_loop().
- """
- final = None
- for sample in self.ddim_sample_loop_progressive(
- model,
- shape,
- noise=noise,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- cond_fn=cond_fn,
- model_kwargs=model_kwargs,
- device=device,
- progress=progress,
- eta=eta,
- ):
- final = sample
- return final["sample"]
-
- def ddim_sample_loop_progressive(
- self,
- model,
- shape,
- noise=None,
- clip_denoised=True,
- denoised_fn=None,
- cond_fn=None,
- model_kwargs=None,
- device=None,
- progress=False,
- eta=0.0,
- ):
- """
- Use DDIM to sample from the model and yield intermediate samples from
- each timestep of DDIM.
- Same usage as p_sample_loop_progressive().
- """
- if device is None:
- device = next(model.parameters()).device
- assert isinstance(shape, (tuple, list))
- if noise is not None:
- img = noise
- else:
- img = torch.randn(*shape, device=device)
- indices = list(range(self.num_timesteps))[::-1]
-
- if progress:
- # Lazy import so that we don't depend on tqdm.
- from tqdm.auto import tqdm
-
- indices = tqdm(indices)
-
- for i in indices:
- t = torch.tensor([i] * shape[0], device=device)
- with torch.no_grad():
- out = self.ddim_sample(
- model,
- img,
- t,
- clip_denoised=clip_denoised,
- denoised_fn=denoised_fn,
- cond_fn=cond_fn,
- model_kwargs=model_kwargs,
- eta=eta,
- )
- yield out
- img = out["sample"]
-
- def _vb_terms_bpd(self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None, mask=None):
- """
- Get a term for the variational lower-bound.
- The resulting units are bits (rather than nats, as one might expect).
- This allows for comparison to other papers.
- :return: a dict with the following keys:
- - 'output': a shape [N] tensor of NLLs or KLs.
- - 'pred_xstart': the x_0 predictions.
- """
- true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)
- out = self.p_mean_variance(model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs)
- kl = normal_kl(true_mean, true_log_variance_clipped, out["mean"], out["log_variance"])
- kl = mean_flat(kl, mask=mask) / np.log(2.0)
-
- decoder_nll = -discretized_gaussian_log_likelihood(
- x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
- )
- assert decoder_nll.shape == x_start.shape
- decoder_nll = mean_flat(decoder_nll, mask=mask) / np.log(2.0)
-
- # At the first timestep return the decoder NLL,
- # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
- output = torch.where((t == 0), decoder_nll, kl)
- return {"output": output, "pred_xstart": out["pred_xstart"]}
-
- def training_losses(self, model, x_start, t, model_kwargs=None, noise=None, mask=None, weights=None):
- """
- Compute training losses for a single timestep.
- :param model: the model to evaluate loss on.
- :param x_start: the [N x C x ...] tensor of inputs.
- :param t: a batch of timestep indices.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :param noise: if specified, the specific Gaussian noise to try to remove.
- :return: a dict with the key "loss" containing a tensor of shape [N].
- Some mean or variance settings may also have other keys.
- """
- if model_kwargs is None:
- model_kwargs = {}
- if noise is None:
- noise = torch.randn_like(x_start)
- x_t = self.q_sample(x_start, t, noise=noise)
- if mask is not None:
- t0 = torch.zeros_like(t)
- x_t0 = self.q_sample(x_start, t0, noise=noise)
- x_t = torch.where(mask[:, None, :, None, None], x_t, x_t0)
-
- terms = {}
-
- if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
- assert mask is None, "mask not supported for KL loss"
- terms["loss"] = self._vb_terms_bpd(
- model=model,
- x_start=x_start,
- x_t=x_t,
- t=t,
- clip_denoised=False,
- model_kwargs=model_kwargs,
- )["output"]
- if self.loss_type == LossType.RESCALED_KL:
- terms["loss"] *= self.num_timesteps
- elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
- model_output = model(x_t, t, **model_kwargs)
-
- if self.model_var_type in [
- ModelVarType.LEARNED,
- ModelVarType.LEARNED_RANGE,
- ]:
- B, C = x_t.shape[:2]
- assert model_output.shape == (B, C * 2, *x_t.shape[2:])
- model_output, model_var_values = torch.split(model_output, C, dim=1)
- # Learn the variance using the variational bound, but don't let
- # it affect our mean prediction.
- frozen_out = torch.cat([model_output.detach(), model_var_values], dim=1)
- terms["vb"] = self._vb_terms_bpd(
- model=lambda *args, r=frozen_out: r,
- x_start=x_start,
- x_t=x_t,
- t=t,
- clip_denoised=False,
- mask=mask,
- )["output"]
- if self.loss_type == LossType.RESCALED_MSE:
- # Divide by 1000 for equivalence with initial implementation.
- # Without a factor of 1/1000, the VB term hurts the MSE term.
- terms["vb"] *= self.num_timesteps / 1000.0
-
- target = {
- ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0],
- ModelMeanType.START_X: x_start,
- ModelMeanType.EPSILON: noise,
- }[self.model_mean_type]
- assert model_output.shape == target.shape == x_start.shape
- if weights is None:
- terms["mse"] = mean_flat((target - model_output) ** 2, mask=mask)
- else:
- weight = _extract_into_tensor(weights, t, target.shape)
- terms["mse"] = mean_flat(weight * (target - model_output) ** 2, mask=mask)
- if "vb" in terms:
- terms["loss"] = terms["mse"] + terms["vb"]
- else:
- terms["loss"] = terms["mse"]
- else:
- raise NotImplementedError(self.loss_type)
-
- return terms
-
- def _prior_bpd(self, x_start):
- """
- Get the prior KL term for the variational lower-bound, measured in
- bits-per-dim.
- This term can't be optimized, as it only depends on the encoder.
- :param x_start: the [N x C x ...] tensor of inputs.
- :return: a batch of [N] KL values (in bits), one per batch element.
- """
- batch_size = x_start.shape[0]
- t = torch.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
- qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
- kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0)
- return mean_flat(kl_prior) / np.log(2.0)
-
- def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
- """
- Compute the entire variational lower-bound, measured in bits-per-dim,
- as well as other related quantities.
- :param model: the model to evaluate loss on.
- :param x_start: the [N x C x ...] tensor of inputs.
- :param clip_denoised: if True, clip denoised samples.
- :param model_kwargs: if not None, a dict of extra keyword arguments to
- pass to the model. This can be used for conditioning.
- :return: a dict containing the following keys:
- - total_bpd: the total variational lower-bound, per batch element.
- - prior_bpd: the prior term in the lower-bound.
- - vb: an [N x T] tensor of terms in the lower-bound.
- - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
- - mse: an [N x T] tensor of epsilon MSEs for each timestep.
- """
- device = x_start.device
- batch_size = x_start.shape[0]
-
- vb = []
- xstart_mse = []
- mse = []
- for t in list(range(self.num_timesteps))[::-1]:
- t_batch = torch.tensor([t] * batch_size, device=device)
- noise = torch.randn_like(x_start)
- x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
- # Calculate VLB term at the current timestep
- with torch.no_grad():
- out = self._vb_terms_bpd(
- model,
- x_start=x_start,
- x_t=x_t,
- t=t_batch,
- clip_denoised=clip_denoised,
- model_kwargs=model_kwargs,
- )
- vb.append(out["output"])
- xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
- eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
- mse.append(mean_flat((eps - noise) ** 2))
-
- vb = torch.stack(vb, dim=1)
- xstart_mse = torch.stack(xstart_mse, dim=1)
- mse = torch.stack(mse, dim=1)
-
- prior_bpd = self._prior_bpd(x_start)
- total_bpd = vb.sum(dim=1) + prior_bpd
- return {
- "total_bpd": total_bpd,
- "prior_bpd": prior_bpd,
- "vb": vb,
- "xstart_mse": xstart_mse,
- "mse": mse,
- }
-
-
-def _extract_into_tensor(arr: torch.Tensor, timesteps: torch.Tensor, broadcast_shape: List[int]):
- """
- Extract values from a 1-D tensor for a batch of indices.
- :param arr: the 1-D tensor of per-timestep values.
- :param timesteps: a tensor of indices into the array to extract.
- :param broadcast_shape: a larger shape of K dimensions with the batch
- dimension equal to the length of timesteps.
- :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
- """
- res = arr.to(timesteps.device)[timesteps].float()
- while len(res.shape) < len(broadcast_shape):
- res = res[..., None]
- return res + torch.zeros(broadcast_shape, device=timesteps.device)
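# Illustrative sketch, not part of the original file: _extract_into_tensor gathers one
# per-timestep coefficient per batch element and broadcasts it to the data shape.
import torch

arr = torch.linspace(0.0, 0.9, 10)                 # a per-timestep coefficient table
t = torch.tensor([0, 9])                           # one timestep index per batch element
out = _extract_into_tensor(arr, t, (2, 3, 4, 4))   # shape (2, 3, 4, 4)
# every entry of out[0] is 0.0 and every entry of out[1] is 0.9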
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/respace.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/respace.py
deleted file mode 100644
index b2a130f917f6a67e310ac80857d70400e10ff95a..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/respace.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Adapted from DiT
-
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# DiT: https://github.com/facebookresearch/DiT/tree/main
-# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-# --------------------------------------------------------
-
-
-import torch
-from colossalai.utils import get_current_device
-
-from .gaussian_diffusion import GaussianDiffusion
-
-
-def space_timesteps(num_timesteps, section_counts):
- """
- Create a list of timesteps to use from an original diffusion process,
- given the number of timesteps we want to take from equally-sized portions
- of the original process.
- For example, if there are 300 timesteps and the section counts are [10,15,20]
- then the first 100 timesteps are strided to be 10 timesteps, the second 100
- are strided to be 15 timesteps, and the final 100 are strided to be 20.
- If the stride is a string starting with "ddim", then the fixed striding
- from the DDIM paper is used, and only one section is allowed.
- :param num_timesteps: the number of diffusion steps in the original
- process to divide up.
- :param section_counts: either a list of numbers, or a string containing
- comma-separated numbers, indicating the step count
- per section. As a special case, use "ddimN" where N
- is a number of steps to use the striding from the
- DDIM paper.
- :return: a set of diffusion steps from the original process to use.
- """
- if isinstance(section_counts, str):
- if section_counts.startswith("ddim"):
- desired_count = int(section_counts[len("ddim") :])
- for i in range(1, num_timesteps):
- if len(range(0, num_timesteps, i)) == desired_count:
- return set(range(0, num_timesteps, i))
- raise ValueError(f"cannot create exactly {num_timesteps} steps with an integer stride")
- section_counts = [int(x) for x in section_counts.split(",")]
- size_per = num_timesteps // len(section_counts)
- extra = num_timesteps % len(section_counts)
- start_idx = 0
- all_steps = []
- for i, section_count in enumerate(section_counts):
- size = size_per + (1 if i < extra else 0)
- if size < section_count:
- raise ValueError(f"cannot divide section of {size} steps into {section_count}")
- if section_count <= 1:
- frac_stride = 1
- else:
- frac_stride = (size - 1) / (section_count - 1)
- cur_idx = 0.0
- taken_steps = []
- for _ in range(section_count):
- taken_steps.append(start_idx + round(cur_idx))
- cur_idx += frac_stride
- all_steps += taken_steps
- start_idx += size
- return set(all_steps)
-
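# Illustrative check, not part of the original file, of the respacing logic documented above:
# 300 original steps split into three 100-step sections keeping 10/15/20 steps each, plus the
# DDIM-style string form.
steps = space_timesteps(300, [10, 15, 20])    # a set of 45 timestep indices in [0, 300)
ddim_steps = space_timesteps(1000, "ddim25")  # 25 evenly strided indices: {0, 40, 80, ..., 960}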
-
-class SpacedDiffusion(GaussianDiffusion):
- """
- A diffusion process which can skip steps in a base diffusion process.
- :param use_timesteps: a collection (sequence or set) of timesteps from the
- original diffusion process to retain.
- :param kwargs: the kwargs to create the base diffusion process.
- """
-
- def __init__(self, use_timesteps, **kwargs):
- self.use_timesteps = set(use_timesteps)
- self.timestep_map = []
- self.original_num_steps = len(kwargs["betas"])
-
- base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa
- last_alpha_cumprod = 1.0
- new_betas = []
- for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
- if i in self.use_timesteps:
- new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
- last_alpha_cumprod = alpha_cumprod
- self.timestep_map.append(i)
- kwargs["betas"] = torch.FloatTensor(new_betas)
- super().__init__(**kwargs)
- self.map_tensor = torch.tensor(self.timestep_map, device=get_current_device())
-
- def p_mean_variance(self, model, *args, **kwargs): # pylint: disable=signature-differs
- return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
-
- def training_losses(self, model, *args, **kwargs): # pylint: disable=signature-differs
- return super().training_losses(self._wrap_model(model), *args, **kwargs)
-
- def condition_mean(self, cond_fn, *args, **kwargs):
- return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
-
- def condition_score(self, cond_fn, *args, **kwargs):
- return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
-
- def _wrap_model(self, model):
- if isinstance(model, _WrappedModel):
- return model
- return _WrappedModel(model, self.map_tensor, self.original_num_steps)
-
- def _scale_timesteps(self, t):
- # Scaling is done by the wrapped model.
- return t
-
-
-class _WrappedModel:
- def __init__(self, model, map_tensor, original_num_steps):
- self.model = model
- self.map_tensor = map_tensor
- # self.rescale_timesteps = rescale_timesteps
- self.original_num_steps = original_num_steps
-
- def __call__(self, x, ts, **kwargs):
- new_ts = self.map_tensor[ts].to(device=ts.device, dtype=ts.dtype)
- # if self.rescale_timesteps:
- # new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
- return self.model(x, new_ts, **kwargs)
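# Illustrative sketch, not part of the original file, of what _WrappedModel does: the sampler
# works with compact indices 0 .. len(use_timesteps) - 1, and the wrapper maps them back to the
# original timestep indices before calling the model (new_ts = map_tensor[ts]).
import torch

timestep_map = sorted(space_timesteps(1000, "ddim25"))   # [0, 40, 80, ..., 960]
map_tensor = torch.tensor(timestep_map)
ts = torch.tensor([0, 1, 24])                            # spaced indices used by the sampler
original_ts = map_tensor[ts]                             # tensor([0, 40, 960])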
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/speed.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/speed.py
deleted file mode 100644
index 04ff02c4140f55aafb9919c940532fc14b6b0f62..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/speed.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import numpy as np
-import torch
-import torch.nn.functional as F
-
-from opensora.registry import SCHEDULERS
-
-from . import gaussian_diffusion as gd
-from .respace import SpacedDiffusion, space_timesteps
-
-
-@SCHEDULERS.register_module("iddpm-speed")
-class SpeeDiffusion(SpacedDiffusion):
- def __init__(
- self,
- num_sampling_steps=None,
- timestep_respacing=None,
- noise_schedule="linear",
- use_kl=False,
- sigma_small=False,
- predict_xstart=False,
- learn_sigma=True,
- rescale_learned_sigmas=False,
- diffusion_steps=1000,
- cfg_scale=4.0,
- ):
- betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
- if use_kl:
- loss_type = gd.LossType.RESCALED_KL
- elif rescale_learned_sigmas:
- loss_type = gd.LossType.RESCALED_MSE
- else:
- loss_type = gd.LossType.MSE
- if num_sampling_steps is not None:
- assert timestep_respacing is None
- timestep_respacing = str(num_sampling_steps)
- if timestep_respacing is None or timestep_respacing == "":
- timestep_respacing = [diffusion_steps]
- super().__init__(
- use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
- betas=betas,
- model_mean_type=(gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X),
- model_var_type=(
- (gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL)
- if not learn_sigma
- else gd.ModelVarType.LEARNED_RANGE
- ),
- loss_type=loss_type,
- )
-
- self.cfg_scale = cfg_scale
- # we fall back to numpy here, as argmax_cuda is not implemented for Bool
- grad = np.gradient(self.sqrt_one_minus_alphas_cumprod.cpu())
- self.meaningful_steps = np.argmax(grad < 5e-5) + 1
-
- # p2 weighting from: Perception Prioritized Training of Diffusion Models
- self.p2_gamma = 1
- self.p2_k = 1
- self.snr = 1.0 / (1 - self.alphas_cumprod) - 1
- sqrt_one_minus_alphas_bar = self.sqrt_one_minus_alphas_cumprod
- p = torch.tanh(1e6 * (torch.gradient(sqrt_one_minus_alphas_bar)[0] - 1e-4)) + 1.5
- self.p = F.normalize(p, p=1, dim=0)
- self.weights = 1 / (self.p2_k + self.snr) ** self.p2_gamma
-
- def t_sample(self, n, device):
- t = torch.multinomial(self.p, n // 2 + 1, replacement=True).to(device)
- dual_t = torch.where(t < self.meaningful_steps, self.meaningful_steps - t, t - self.meaningful_steps)
- t = torch.cat([t, dual_t], dim=0)[:n]
- return t
-
- def training_losses(self, model, x, t, *args, **kwargs): # pylint: disable=signature-differs
- t = self.t_sample(x.shape[0], x.device)
- return super().training_losses(model, x, t, weights=self.weights, *args, **kwargs)
-
- def sample(self, *args, **kwargs):
- raise NotImplementedError("SpeeDiffusion is only for training")
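
For context, `t_sample` above draws half of each batch from the learned distribution `self.p` and mirrors those indices around `meaningful_steps`, so training concentrates on noise levels where the schedule is still changing. A minimal standalone sketch of that trick (a uniform stand-in for `self.p`, purely illustrative):

    import torch

    def dual_t_sample(p, meaningful_steps, n, device="cpu"):
        # draw n//2 + 1 timesteps from p, then mirror them around meaningful_steps
        t = torch.multinomial(p, n // 2 + 1, replacement=True).to(device)
        dual_t = torch.where(t < meaningful_steps, meaningful_steps - t, t - meaningful_steps)
        return torch.cat([t, dual_t], dim=0)[:n]

    p = torch.full((1000,), 1.0 / 1000)   # stand-in for self.p
    print(dual_t_sample(p, meaningful_steps=400, n=8))
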
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/timestep_sampler.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/timestep_sampler.py
deleted file mode 100644
index 52b6717d528f398cd08f34c347b7fb69f4d5a9a3..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/schedulers/iddpm/timestep_sampler.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Adapted from DiT
-
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# References:
-# DiT: https://github.com/facebookresearch/DiT/tree/main
-# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
-# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
-# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-# --------------------------------------------------------
-
-from abc import ABC, abstractmethod
-
-import numpy as np
-import torch as th
-import torch.distributed as dist
-
-
-def create_named_schedule_sampler(name, diffusion):
- """
- Create a ScheduleSampler from a library of pre-defined samplers.
- :param name: the name of the sampler.
- :param diffusion: the diffusion object to sample for.
- """
- if name == "uniform":
- return UniformSampler(diffusion)
- elif name == "loss-second-moment":
- return LossSecondMomentResampler(diffusion)
- else:
- raise NotImplementedError(f"unknown schedule sampler: {name}")
-
-
-class ScheduleSampler(ABC):
- """
- A distribution over timesteps in the diffusion process, intended to reduce
- variance of the objective.
- By default, samplers perform unbiased importance sampling, in which the
- objective's mean is unchanged.
- However, subclasses may override sample() to change how the resampled
- terms are reweighted, allowing for actual changes in the objective.
- """
-
- @abstractmethod
- def weights(self):
- """
- Get a numpy array of weights, one per diffusion step.
- The weights needn't be normalized, but must be positive.
- """
-
- def sample(self, batch_size, device):
- """
- Importance-sample timesteps for a batch.
- :param batch_size: the number of timesteps.
- :param device: the torch device to save to.
- :return: a tuple (timesteps, weights):
- - timesteps: a tensor of timestep indices.
- - weights: a tensor of weights to scale the resulting losses.
- """
- w = self.weights()
- p = w / np.sum(w)
- indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
- indices = th.from_numpy(indices_np).long().to(device)
- weights_np = 1 / (len(p) * p[indices_np])
- weights = th.from_numpy(weights_np).float().to(device)
- return indices, weights
-
-
-class UniformSampler(ScheduleSampler):
- def __init__(self, diffusion):
- self.diffusion = diffusion
- self._weights = np.ones([diffusion.num_timesteps])
-
- def weights(self):
- return self._weights
-
-
-class LossAwareSampler(ScheduleSampler):
- def update_with_local_losses(self, local_ts, local_losses):
- """
- Update the reweighting using losses from a model.
- Call this method from each rank with a batch of timesteps and the
- corresponding losses for each of those timesteps.
- This method will perform synchronization to make sure all of the ranks
- maintain the exact same reweighting.
- :param local_ts: an integer Tensor of timesteps.
- :param local_losses: a 1D Tensor of losses.
- """
- batch_sizes = [th.tensor([0], dtype=th.int32, device=local_ts.device) for _ in range(dist.get_world_size())]
- dist.all_gather(
- batch_sizes,
- th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
- )
-
- # Pad all_gather batches to be the maximum batch size.
- batch_sizes = [x.item() for x in batch_sizes]
- max_bs = max(batch_sizes)
-
- timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes]
- loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes]
- dist.all_gather(timestep_batches, local_ts)
- dist.all_gather(loss_batches, local_losses)
- timesteps = [x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]]
- losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
- self.update_with_all_losses(timesteps, losses)
-
- @abstractmethod
- def update_with_all_losses(self, ts, losses):
- """
- Update the reweighting using losses from a model.
- Sub-classes should override this method to update the reweighting
- using losses from the model.
- This method directly updates the reweighting without synchronizing
- between workers. It is called by update_with_local_losses from all
- ranks with identical arguments. Thus, it should have deterministic
- behavior to maintain state across workers.
- :param ts: a list of int timesteps.
- :param losses: a list of float losses, one per timestep.
- """
-
-
-class LossSecondMomentResampler(LossAwareSampler):
- def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
- self.diffusion = diffusion
- self.history_per_term = history_per_term
- self.uniform_prob = uniform_prob
- self._loss_history = np.zeros([diffusion.num_timesteps, history_per_term], dtype=np.float64)
-        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64)
-
- def weights(self):
- if not self._warmed_up():
- return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
- weights = np.sqrt(np.mean(self._loss_history**2, axis=-1))
- weights /= np.sum(weights)
- weights *= 1 - self.uniform_prob
- weights += self.uniform_prob / len(weights)
- return weights
-
- def update_with_all_losses(self, ts, losses):
- for t, loss in zip(ts, losses):
- if self._loss_counts[t] == self.history_per_term:
- # Shift out the oldest loss term.
- self._loss_history[t, :-1] = self._loss_history[t, 1:]
- self._loss_history[t, -1] = loss
- else:
- self._loss_history[t, self._loss_counts[t]] = loss
- self._loss_counts[t] += 1
-
- def _warmed_up(self):
- return (self._loss_counts == self.history_per_term).all()
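
The sampler above is normally driven from a training loop: sample timesteps and importance weights, compute per-sample losses, then feed the losses back so the second-moment estimate adapts. A minimal single-process sketch (toy random losses; assumes the pre-removal `opensora` package layout is importable):

    import torch
    from opensora.schedulers.iddpm.timestep_sampler import LossSecondMomentResampler

    class ToyDiffusion:
        num_timesteps = 1000   # the only attribute the sampler reads

    sampler = LossSecondMomentResampler(ToyDiffusion())
    for _ in range(50):
        ts, weights = sampler.sample(batch_size=8, device="cpu")
        losses = torch.rand(8)                      # stand-in for per-sample losses
        sampler.update_with_all_losses(ts.tolist(), losses.tolist())
    print(sampler.weights()[:5])
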
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/ckpt_utils.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/ckpt_utils.py
deleted file mode 100644
index 7dc16eea393e8b1c18eaf824e7cfbaf3e3173aa7..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/ckpt_utils.py
+++ /dev/null
@@ -1,273 +0,0 @@
-import functools
-import json
-import logging
-import operator
-import os
-from typing import Tuple
-
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-from colossalai.booster import Booster
-from colossalai.checkpoint_io import GeneralCheckpointIO
-from colossalai.cluster import DistCoordinator
-from torch.optim import Optimizer
-from torch.optim.lr_scheduler import _LRScheduler
-from torchvision.datasets.utils import download_url
-
-from opensora.datasets.sampler import VariableVideoBatchSampler
-
-hf_endpoint = os.environ.get("HF_ENDPOINT")
-if hf_endpoint is None:
- hf_endpoint = "https://huggingface.co"
-
-pretrained_models = {
- "DiT-XL-2-512x512.pt": "https://dl.fbaipublicfiles.com/DiT/models/DiT-XL-2-512x512.pt",
- "DiT-XL-2-256x256.pt": "https://dl.fbaipublicfiles.com/DiT/models/DiT-XL-2-256x256.pt",
- "Latte-XL-2-256x256-ucf101.pt": hf_endpoint + "/maxin-cn/Latte/resolve/main/ucf101.pt",
- "PixArt-XL-2-256x256.pth": hf_endpoint + "/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-256x256.pth",
- "PixArt-XL-2-SAM-256x256.pth": hf_endpoint + "/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-SAM-256x256.pth",
- "PixArt-XL-2-512x512.pth": hf_endpoint + "/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-512x512.pth",
- "PixArt-XL-2-1024-MS.pth": hf_endpoint + "/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-1024-MS.pth",
- "OpenSora-v1-16x256x256.pth": hf_endpoint + "/hpcai-tech/Open-Sora/resolve/main/OpenSora-v1-16x256x256.pth",
- "OpenSora-v1-HQ-16x256x256.pth": hf_endpoint + "/hpcai-tech/Open-Sora/resolve/main/OpenSora-v1-HQ-16x256x256.pth",
- "OpenSora-v1-HQ-16x512x512.pth": hf_endpoint + "/hpcai-tech/Open-Sora/resolve/main/OpenSora-v1-HQ-16x512x512.pth",
-}
-
-
-def reparameter(ckpt, name=None, model=None):
- name = os.path.basename(name)
- if name in ["DiT-XL-2-512x512.pt", "DiT-XL-2-256x256.pt"]:
- ckpt["x_embedder.proj.weight"] = ckpt["x_embedder.proj.weight"].unsqueeze(2)
- del ckpt["pos_embed"]
- if name in ["Latte-XL-2-256x256-ucf101.pt"]:
- ckpt = ckpt["ema"]
- ckpt["x_embedder.proj.weight"] = ckpt["x_embedder.proj.weight"].unsqueeze(2)
- del ckpt["pos_embed"]
- del ckpt["temp_embed"]
- if name in ["PixArt-XL-2-256x256.pth", "PixArt-XL-2-SAM-256x256.pth", "PixArt-XL-2-512x512.pth"]:
- ckpt = ckpt["state_dict"]
- ckpt["x_embedder.proj.weight"] = ckpt["x_embedder.proj.weight"].unsqueeze(2)
- del ckpt["pos_embed"]
-
-    # pos_embed is not needed, so drop it from the checkpoint
- if "pos_embed_temporal" in ckpt:
- del ckpt["pos_embed_temporal"]
- if "pos_embed" in ckpt:
- del ckpt["pos_embed"]
- # different text length
- if "y_embedder.y_embedding" in ckpt:
- if ckpt["y_embedder.y_embedding"].shape[0] < model.y_embedder.y_embedding.shape[0]:
- print(
- f"Extend y_embedding from {ckpt['y_embedder.y_embedding'].shape[0]} to {model.y_embedder.y_embedding.shape[0]}"
- )
- additional_length = model.y_embedder.y_embedding.shape[0] - ckpt["y_embedder.y_embedding"].shape[0]
- new_y_embedding = torch.zeros(additional_length, model.y_embedder.y_embedding.shape[1])
- new_y_embedding[:] = ckpt["y_embedder.y_embedding"][-1]
- ckpt["y_embedder.y_embedding"] = torch.cat([ckpt["y_embedder.y_embedding"], new_y_embedding], dim=0)
- elif ckpt["y_embedder.y_embedding"].shape[0] > model.y_embedder.y_embedding.shape[0]:
- print(
- f"Shrink y_embedding from {ckpt['y_embedder.y_embedding'].shape[0]} to {model.y_embedder.y_embedding.shape[0]}"
- )
- ckpt["y_embedder.y_embedding"] = ckpt["y_embedder.y_embedding"][: model.y_embedder.y_embedding.shape[0]]
-
- return ckpt
-
-
-def find_model(model_name, model=None):
- """
- Finds a pre-trained DiT model, downloading it if necessary. Alternatively, loads a model from a local path.
- """
- if model_name in pretrained_models: # Find/download our pre-trained DiT checkpoints
- model_ckpt = download_model(model_name)
- model_ckpt = reparameter(model_ckpt, model_name, model=model)
- else: # Load a custom DiT checkpoint:
- assert os.path.isfile(model_name), f"Could not find DiT checkpoint at {model_name}"
- model_ckpt = torch.load(model_name, map_location=lambda storage, loc: storage)
- model_ckpt = reparameter(model_ckpt, model_name, model=model)
- return model_ckpt
-
-
-def download_model(model_name=None, local_path=None, url=None):
- """
- Downloads a pre-trained DiT model from the web.
- """
- if model_name is not None:
- assert model_name in pretrained_models
- local_path = f"pretrained_models/{model_name}"
- web_path = pretrained_models[model_name]
- else:
- assert local_path is not None
- assert url is not None
- web_path = url
- if not os.path.isfile(local_path):
- os.makedirs("pretrained_models", exist_ok=True)
- dir_name = os.path.dirname(local_path)
- file_name = os.path.basename(local_path)
- download_url(web_path, dir_name, file_name)
- model = torch.load(local_path, map_location=lambda storage, loc: storage)
- return model
-
-
-def load_from_sharded_state_dict(model, ckpt_path):
- ckpt_io = GeneralCheckpointIO()
- ckpt_io.load_model(model, os.path.join(ckpt_path, "model"))
-
-
-def model_sharding(model: torch.nn.Module):
- global_rank = dist.get_rank()
- world_size = dist.get_world_size()
- for _, param in model.named_parameters():
- padding_size = (world_size - param.numel() % world_size) % world_size
- if padding_size > 0:
- padding_param = torch.nn.functional.pad(param.data.view(-1), [0, padding_size])
- else:
- padding_param = param.data.view(-1)
- splited_params = padding_param.split(padding_param.numel() // world_size)
- splited_params = splited_params[global_rank]
- param.data = splited_params
-
-
-def load_json(file_path: str):
- with open(file_path, "r") as f:
- return json.load(f)
-
-
-def save_json(data, file_path: str):
- with open(file_path, "w") as f:
- json.dump(data, f, indent=4)
-
-
-def remove_padding(tensor: torch.Tensor, original_shape: Tuple) -> torch.Tensor:
- return tensor[: functools.reduce(operator.mul, original_shape)]
-
-
-def model_gathering(model: torch.nn.Module, model_shape_dict: dict):
- global_rank = dist.get_rank()
- global_size = dist.get_world_size()
- for name, param in model.named_parameters():
- all_params = [torch.empty_like(param.data) for _ in range(global_size)]
- dist.all_gather(all_params, param.data, group=dist.group.WORLD)
- if int(global_rank) == 0:
- all_params = torch.cat(all_params)
- param.data = remove_padding(all_params, model_shape_dict[name]).view(model_shape_dict[name])
- dist.barrier()
-
-
-def record_model_param_shape(model: torch.nn.Module) -> dict:
- param_shape = {}
- for name, param in model.named_parameters():
- param_shape[name] = param.shape
- return param_shape
-
-
-def save(
- booster: Booster,
- model: nn.Module,
- ema: nn.Module,
- optimizer: Optimizer,
- lr_scheduler: _LRScheduler,
- epoch: int,
- step: int,
- global_step: int,
- batch_size: int,
- coordinator: DistCoordinator,
- save_dir: str,
- shape_dict: dict,
- sampler=None,
-):
- save_dir = os.path.join(save_dir, f"epoch{epoch}-global_step{global_step}")
- os.makedirs(os.path.join(save_dir, "model"), exist_ok=True)
-
- booster.save_model(model, os.path.join(save_dir, "model"), shard=True)
- # ema is not boosted, so we don't need to use booster.save_model
- model_gathering(ema, shape_dict)
- global_rank = dist.get_rank()
- if int(global_rank) == 0:
- torch.save(ema.state_dict(), os.path.join(save_dir, "ema.pt"))
- model_sharding(ema)
-
- booster.save_optimizer(optimizer, os.path.join(save_dir, "optimizer"), shard=True, size_per_shard=4096)
- if lr_scheduler is not None:
- booster.save_lr_scheduler(lr_scheduler, os.path.join(save_dir, "lr_scheduler"))
- sampler_start_idx = step * batch_size if batch_size is not None else None
- running_states = {
- "epoch": epoch,
- "step": step,
- "global_step": global_step,
- "sample_start_index": sampler_start_idx,
- }
- if coordinator.is_master():
- save_json(running_states, os.path.join(save_dir, "running_states.json"))
- if sampler is not None:
- if isinstance(sampler, VariableVideoBatchSampler):
- torch.save(sampler.state_dict(step), os.path.join(save_dir, "sampler"))
- else:
- torch.save(sampler.state_dict(), os.path.join(save_dir, "sampler"))
- dist.barrier()
-
-
-def load(
- booster: Booster,
- model: nn.Module,
- ema: nn.Module,
- optimizer: Optimizer,
- lr_scheduler: _LRScheduler,
- load_dir: str,
- sampler=None,
-) -> Tuple[int, int, int]:
- booster.load_model(model, os.path.join(load_dir, "model"))
- # ema is not boosted, so we don't use booster.load_model
- ema.load_state_dict(
- torch.load(os.path.join(load_dir, "ema.pt"), map_location=torch.device("cpu")),
- strict=False,
- )
- booster.load_optimizer(optimizer, os.path.join(load_dir, "optimizer"))
- if lr_scheduler is not None:
- booster.load_lr_scheduler(lr_scheduler, os.path.join(load_dir, "lr_scheduler"))
- running_states = load_json(os.path.join(load_dir, "running_states.json"))
- if sampler is not None:
- sampler.load_state_dict(torch.load(os.path.join(load_dir, "sampler")))
- dist.barrier()
- return (
- running_states["epoch"],
- running_states["step"],
- running_states["sample_start_index"],
- )
-
-
-def create_logger(logging_dir):
- """
- Create a logger that writes to a log file and stdout.
- """
- if dist.get_rank() == 0: # real logger
- logging.basicConfig(
- level=logging.INFO,
- format="[\033[34m%(asctime)s\033[0m] %(message)s",
- datefmt="%Y-%m-%d %H:%M:%S",
- handlers=[
- logging.StreamHandler(),
- logging.FileHandler(f"{logging_dir}/log.txt"),
- ],
- )
- logger = logging.getLogger(__name__)
- else: # dummy logger (does nothing)
- logger = logging.getLogger(__name__)
- logger.addHandler(logging.NullHandler())
- return logger
-
-
-def load_checkpoint(model, ckpt_path, save_as_pt=False):
- if ckpt_path.endswith(".pt") or ckpt_path.endswith(".pth"):
- state_dict = find_model(ckpt_path, model=model)
- missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
- print(f"Missing keys: {missing_keys}")
- print(f"Unexpected keys: {unexpected_keys}")
- elif os.path.isdir(ckpt_path):
- load_from_sharded_state_dict(model, ckpt_path)
- if save_as_pt:
- save_path = os.path.join(ckpt_path, "model_ckpt.pt")
- torch.save(model.state_dict(), save_path)
- print(f"Model checkpoint saved to {save_path}")
- else:
- raise ValueError(f"Invalid checkpoint path: {ckpt_path}")
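
`load_checkpoint` above accepts either a single `.pt`/`.pth` file (routed through `find_model`, which downloads registered names on demand) or a ColossalAI sharded checkpoint directory. A hedged usage sketch (assumes the pre-removal `opensora` layout, an already-built `model`, and network access to the URLs in `pretrained_models`):

    from opensora.utils.ckpt_utils import download_model, load_checkpoint

    # download a registered checkpoint into ./pretrained_models/
    download_model("OpenSora-v1-HQ-16x256x256.pth")

    # load it into a model built elsewhere (e.g. via build_module in the inference scripts)
    # load_checkpoint(model, "pretrained_models/OpenSora-v1-HQ-16x256x256.pth")
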
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/config_utils.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/config_utils.py
deleted file mode 100644
index bebd52b39cdaa361f0ec744bd32f075a627d46c5..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/config_utils.py
+++ /dev/null
@@ -1,178 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-import argparse
-import json
-import os
-from glob import glob
-
-from mmengine.config import Config
-from torch.utils.tensorboard import SummaryWriter
-
-
-def load_prompts(prompt_path):
- with open(prompt_path, "r") as f:
- prompts = [line.strip() for line in f.readlines()]
- return prompts
-
-
-def parse_args(training=False):
- parser = argparse.ArgumentParser()
-
- # model config
- parser.add_argument("config", help="model config file path")
-
- # ======================================================
- # General
- # ======================================================
- parser.add_argument("--seed", default=42, type=int, help="generation seed")
- parser.add_argument("--ckpt-path", type=str, help="path to model ckpt; will overwrite cfg.ckpt_path if specified")
- parser.add_argument("--batch-size", default=None, type=int, help="batch size")
-
- # ======================================================
- # Inference
- # ======================================================
- if not training:
- # output
- parser.add_argument("--save-dir", default=None, type=str, help="path to save generated samples")
- parser.add_argument("--sample-name", default=None, type=str, help="sample name, default is sample_idx")
- parser.add_argument("--start-index", default=None, type=int, help="start index for sample name")
- parser.add_argument("--end-index", default=None, type=int, help="end index for sample name")
- parser.add_argument("--num-sample", default=None, type=int, help="number of samples to generate for one prompt")
- parser.add_argument("--prompt-as-path", action="store_true", help="use prompt as path to save samples")
-
- # prompt
- parser.add_argument("--prompt-path", default=None, type=str, help="path to prompt txt file")
- parser.add_argument("--prompt", default=None, type=str, nargs="+", help="prompt list")
-
- # image/video
- parser.add_argument("--num-frames", default=None, type=int, help="number of frames")
- parser.add_argument("--fps", default=None, type=int, help="fps")
- parser.add_argument("--image-size", default=None, type=int, nargs=2, help="image size")
-
- # hyperparameters
- parser.add_argument("--num-sampling-steps", default=None, type=int, help="sampling steps")
- parser.add_argument("--cfg-scale", default=None, type=float, help="balance between cond & uncond")
-
- # reference
- parser.add_argument("--loop", default=None, type=int, help="loop")
- parser.add_argument("--condition-frame-length", default=None, type=int, help="condition frame length")
- parser.add_argument("--reference-path", default=None, type=str, nargs="+", help="reference path")
- parser.add_argument("--mask-strategy", default=None, type=str, nargs="+", help="mask strategy")
- # ======================================================
- # Training
- # ======================================================
- else:
- parser.add_argument("--wandb", default=None, type=bool, help="enable wandb")
- parser.add_argument("--load", default=None, type=str, help="path to continue training")
- parser.add_argument("--data-path", default=None, type=str, help="path to data csv")
- parser.add_argument("--start-from-scratch", action="store_true", help="start training from scratch")
- parser.add_argument("--max-train-steps", default=0, type=int, help="max train steps")
-
- return parser.parse_args()
-
-
-def merge_args(cfg, args, training=False):
- if args.ckpt_path is not None:
- cfg.model["from_pretrained"] = args.ckpt_path
- args.ckpt_path = None
- if training and args.data_path is not None:
- cfg.dataset["data_path"] = args.data_path
- args.data_path = None
- if training and args.max_train_steps is not None:
- cfg["max_train_steps"] = args.max_train_steps
- args.max_train_steps = None
- if not training and args.cfg_scale is not None:
- cfg.scheduler["cfg_scale"] = args.cfg_scale
- args.cfg_scale = None
- if not training and args.num_sampling_steps is not None:
- cfg.scheduler["num_sampling_steps"] = args.num_sampling_steps
- args.num_sampling_steps = None
-
- for k, v in vars(args).items():
- if v is not None:
- cfg[k] = v
-
- if not training:
- # Inference only
- # - Allow not set
- if "reference_path" not in cfg:
- cfg["reference_path"] = None
- if "loop" not in cfg:
- cfg["loop"] = 1
- if "frame_interval" not in cfg:
- cfg["frame_interval"] = 1
- if "sample_name" not in cfg:
- cfg["sample_name"] = None
- if "num_sample" not in cfg:
- cfg["num_sample"] = 1
- if "prompt_as_path" not in cfg:
- cfg["prompt_as_path"] = False
- # - Prompt handling
- if "prompt" not in cfg or cfg["prompt"] is None:
- assert cfg["prompt_path"] is not None, "prompt or prompt_path must be provided"
- cfg["prompt"] = load_prompts(cfg["prompt_path"])
- if args.start_index is not None and args.end_index is not None:
- cfg["prompt"] = cfg["prompt"][args.start_index : args.end_index]
- elif args.start_index is not None:
- cfg["prompt"] = cfg["prompt"][args.start_index :]
- elif args.end_index is not None:
- cfg["prompt"] = cfg["prompt"][: args.end_index]
- else:
- # Training only
- # - Allow not set
- if "mask_ratios" not in cfg:
- cfg["mask_ratios"] = None
- if "start_from_scratch" not in cfg:
- cfg["start_from_scratch"] = False
- if "bucket_config" not in cfg:
- cfg["bucket_config"] = None
- if "transform_name" not in cfg.dataset:
- cfg.dataset["transform_name"] = "center"
- if "num_bucket_build_workers" not in cfg:
- cfg["num_bucket_build_workers"] = 1
-
- # Both training and inference
- if "multi_resolution" not in cfg:
- cfg["multi_resolution"] = False
-
- return cfg
-
-
-def parse_configs(training=False):
- args = parse_args(training)
- cfg = Config.fromfile(args.config)
- cfg = merge_args(cfg, args, training)
- return cfg
-
-
-def create_experiment_workspace(cfg):
- """
- This function creates a folder for experiment tracking.
-
- Args:
-        cfg: The parsed config.
-
-    Returns:
-        exp_name, exp_dir: The experiment name and the path to the experiment folder.
- """
- # Make outputs folder (holds all experiment subfolders)
- os.makedirs(cfg.outputs, exist_ok=True)
- experiment_index = len(glob(f"{cfg.outputs}/*"))
-
- # Create an experiment folder
- model_name = cfg.model["type"].replace("/", "-")
- exp_name = f"{experiment_index:03d}-{model_name}"
- exp_dir = f"{cfg.outputs}/{exp_name}"
- os.makedirs(exp_dir, exist_ok=True)
- return exp_name, exp_dir
-
-
-def save_training_config(cfg, experiment_dir):
- with open(f"{experiment_dir}/config.txt", "w") as f:
- json.dump(cfg, f, indent=4)
-
-
-def create_tensorboard_writer(exp_dir):
- tensorboard_dir = f"{exp_dir}/tensorboard"
- os.makedirs(tensorboard_dir, exist_ok=True)
- writer = SummaryWriter(tensorboard_dir)
- return writer
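
`parse_configs` reads the positional mmengine config file and then lets the optional CLI flags above override individual fields, so the same config can be reused across runs. A minimal sketch of the intended invocation (the config path is hypothetical):

    # python scripts/inference.py configs/some_config.py --num-frames 16 --prompt "a red car"
    from opensora.utils.config_utils import parse_configs

    cfg = parse_configs(training=False)
    print(cfg.num_frames, cfg.prompt)      # CLI values win over the config file
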
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/device_utils.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/device_utils.py
deleted file mode 100644
index 3a8048e7976a835ceb0ee4398da6bde94390c76e..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/device_utils.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-import torch
-import importlib
-
-
-def is_npu_available():
-    "Check whether `torch_npu` is installed and, if so, whether an NPU is available in the environment."
- if importlib.util.find_spec("torch") is None or importlib.util.find_spec("torch_npu") is None:
- return False
-
- import torch_npu
-
- try:
- # Will raise a RuntimeError if no NPU is found
- _ = torch.npu.device_count()
- return torch.npu.is_available()
- except RuntimeError:
- return False
\ No newline at end of file
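
This helper mirrors the NPU guard used by the inference entry points further down: probe for `torch_npu`, and only then apply the CUDA-to-NPU transfer shim. A minimal sketch of that pattern:

    import torch
    from opensora.utils.device_utils import is_npu_available

    if is_npu_available():
        from torch_npu.contrib import transfer_to_npu  # noqa: F401  (redirects torch.cuda.* to the NPU)
        torch.npu.config.allow_internal_format = False
    device = "cuda" if torch.cuda.is_available() else "cpu"  # resolves to the NPU when the shim is active
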
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/misc.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/misc.py
deleted file mode 100644
index d9528f862d8cab21efb27b75bc3a650c5db89d89..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/misc.py
+++ /dev/null
@@ -1,287 +0,0 @@
-import collections
-import importlib
-import logging
-import os
-import time
-from collections import OrderedDict
-from collections.abc import Sequence
-from itertools import repeat
-from typing import Tuple
-
-import numpy as np
-import torch
-import torch.distributed as dist
-
-
-def print_rank(var_name, var_value, rank=0):
- if dist.get_rank() == rank:
- print(f"[Rank {rank}] {var_name}: {var_value}")
-
-
-def print_0(*args, **kwargs):
- if dist.get_rank() == 0:
- print(*args, **kwargs)
-
-
-def requires_grad(model: torch.nn.Module, flag: bool = True) -> None:
- """
- Set requires_grad flag for all parameters in a model.
- """
- for p in model.parameters():
- p.requires_grad = flag
-
-
-def format_numel_str(numel: int) -> str:
- B = 1024**3
- M = 1024**2
- K = 1024
- if numel >= B:
- return f"{numel / B:.2f} B"
- elif numel >= M:
- return f"{numel / M:.2f} M"
- elif numel >= K:
- return f"{numel / K:.2f} K"
- else:
- return f"{numel}"
-
-
-def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
- dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
- tensor.div_(dist.get_world_size())
- return tensor
-
-
-def get_model_numel(model: torch.nn.Module) -> Tuple[int, int]:
- num_params = 0
- num_params_trainable = 0
- for p in model.parameters():
- num_params += p.numel()
- if p.requires_grad:
- num_params_trainable += p.numel()
- return num_params, num_params_trainable
-
-
-def try_import(name):
- """Try to import a module.
-
- Args:
- name (str): Specifies what module to import in absolute or relative
- terms (e.g. either pkg.mod or ..mod).
- Returns:
- ModuleType or None: If importing successfully, returns the imported
- module, otherwise returns None.
- """
- try:
- return importlib.import_module(name)
- except ImportError:
- return None
-
-
-def transpose(x):
- """
-    Transpose a list of lists.
- Args:
- x (list[list]):
- """
- ret = list(map(list, zip(*x)))
- return ret
-
-
-def get_timestamp():
- timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
- return timestamp
-
-
-def format_time(seconds):
- days = int(seconds / 3600 / 24)
- seconds = seconds - days * 3600 * 24
- hours = int(seconds / 3600)
- seconds = seconds - hours * 3600
- minutes = int(seconds / 60)
- seconds = seconds - minutes * 60
- secondsf = int(seconds)
- seconds = seconds - secondsf
- millis = int(seconds * 1000)
-
- f = ""
- i = 1
- if days > 0:
- f += str(days) + "D"
- i += 1
- if hours > 0 and i <= 2:
- f += str(hours) + "h"
- i += 1
- if minutes > 0 and i <= 2:
- f += str(minutes) + "m"
- i += 1
- if secondsf > 0 and i <= 2:
- f += str(secondsf) + "s"
- i += 1
- if millis > 0 and i <= 2:
- f += str(millis) + "ms"
- i += 1
- if f == "":
- f = "0ms"
- return f
-
-
-def to_tensor(data):
- """Convert objects of various python types to :obj:`torch.Tensor`.
-
- Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
- :class:`Sequence`, :class:`int` and :class:`float`.
-
- Args:
- data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
- be converted.
- """
-
- if isinstance(data, torch.Tensor):
- return data
- elif isinstance(data, np.ndarray):
- return torch.from_numpy(data)
- elif isinstance(data, Sequence) and not isinstance(data, str):
- return torch.tensor(data)
- elif isinstance(data, int):
- return torch.LongTensor([data])
- elif isinstance(data, float):
- return torch.FloatTensor([data])
- else:
- raise TypeError(f"type {type(data)} cannot be converted to tensor.")
-
-
-def to_ndarray(data):
- if isinstance(data, torch.Tensor):
- return data.numpy()
- elif isinstance(data, np.ndarray):
- return data
- elif isinstance(data, Sequence):
- return np.array(data)
- elif isinstance(data, int):
-        return np.array([data], dtype=int)
- elif isinstance(data, float):
- return np.array([data], dtype=float)
- else:
- raise TypeError(f"type {type(data)} cannot be converted to ndarray.")
-
-
-def to_torch_dtype(dtype):
- if isinstance(dtype, torch.dtype):
- return dtype
- elif isinstance(dtype, str):
- dtype_mapping = {
- "float64": torch.float64,
- "float32": torch.float32,
- "float16": torch.float16,
- "fp32": torch.float32,
- "fp16": torch.float16,
- "half": torch.float16,
- "bf16": torch.bfloat16,
- }
- if dtype not in dtype_mapping:
- raise ValueError
- dtype = dtype_mapping[dtype]
- return dtype
- else:
- raise ValueError
-
-
-def count_params(model):
- return sum(p.numel() for p in model.parameters() if p.requires_grad)
-
-
-def _ntuple(n):
- def parse(x):
- if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
- return x
- return tuple(repeat(x, n))
-
- return parse
-
-
-to_1tuple = _ntuple(1)
-to_2tuple = _ntuple(2)
-to_3tuple = _ntuple(3)
-to_4tuple = _ntuple(4)
-to_ntuple = _ntuple
-
-
-def convert_SyncBN_to_BN2d(model_cfg):
- for k in model_cfg:
- v = model_cfg[k]
- if k == "norm_cfg" and v["type"] == "SyncBN":
- v["type"] = "BN2d"
- elif isinstance(v, dict):
- convert_SyncBN_to_BN2d(v)
-
-
-def get_topk(x, dim=4, k=5):
- x = to_tensor(x)
- inds = x[..., dim].topk(k)[1]
- return x[inds]
-
-
-def param_sigmoid(x, alpha):
- ret = 1 / (1 + (-alpha * x).exp())
- return ret
-
-
-def inverse_param_sigmoid(x, alpha, eps=1e-5):
- x = x.clamp(min=0, max=1)
- x1 = x.clamp(min=eps)
- x2 = (1 - x).clamp(min=eps)
- return torch.log(x1 / x2) / alpha
-
-
-def inverse_sigmoid(x, eps=1e-5):
- """Inverse function of sigmoid.
-
- Args:
- x (Tensor): The tensor to do the
- inverse.
- eps (float): EPS avoid numerical
- overflow. Defaults 1e-5.
- Returns:
- Tensor: The x has passed the inverse
- function of sigmoid, has same
- shape with input.
- """
- x = x.clamp(min=0, max=1)
- x1 = x.clamp(min=eps)
- x2 = (1 - x).clamp(min=eps)
- return torch.log(x1 / x2)
-
-
-def count_columns(df, columns):
- cnt_dict = OrderedDict()
- num_samples = len(df)
-
- for col in columns:
- d_i = df[col].value_counts().to_dict()
- for k in d_i:
- d_i[k] = (d_i[k], d_i[k] / num_samples)
- cnt_dict[col] = d_i
-
- return cnt_dict
-
-
-def build_logger(work_dir, cfgname):
- log_file = cfgname + ".log"
- log_path = os.path.join(work_dir, log_file)
-
- logger = logging.getLogger(cfgname)
- logger.setLevel(logging.INFO)
- # formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
- formatter = logging.Formatter("%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
-
- handler1 = logging.FileHandler(log_path)
- handler1.setFormatter(formatter)
-
- handler2 = logging.StreamHandler()
- handler2.setFormatter(formatter)
-
- logger.addHandler(handler1)
- logger.addHandler(handler2)
- logger.propagate = False
-
- return logger
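
Two of the helpers above recur throughout the training and inference scripts: `to_torch_dtype` maps config strings to dtypes, and `get_model_numel`/`format_numel_str` report model size. A quick sketch:

    import torch
    from opensora.utils.misc import format_numel_str, get_model_numel, to_torch_dtype

    dtype = to_torch_dtype("bf16")                 # torch.bfloat16
    model = torch.nn.Linear(1024, 1024)
    total, trainable = get_model_numel(model)
    print(format_numel_str(total), dtype)          # e.g. "1.00 M torch.bfloat16"
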
diff --git a/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/train_utils.py b/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/train_utils.py
deleted file mode 100644
index d302ef50ca08fa8a33881920ed6e0a4edd5cb140..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/opensora/utils/train_utils.py
+++ /dev/null
@@ -1,383 +0,0 @@
-import math
-import random
-from collections import OrderedDict
-
-import torch
-from torch.nn import Module, ModuleList
-from torch.cuda.amp import autocast
-from torch import nn, einsum, broadcast_tensors, Tensor
-from einops import rearrange, repeat
-from beartype import beartype
-from beartype.typing import Literal, Union, Optional
-from math import pi
-
-@torch.no_grad()
-def update_ema(
- ema_model: torch.nn.Module, model: torch.nn.Module, optimizer=None, decay: float = 0.9999, sharded: bool = True
-) -> None:
- """
- Step the EMA model towards the current model.
- """
- ema_params = OrderedDict(ema_model.named_parameters())
- model_params = OrderedDict(model.named_parameters())
-
- for name, param in model_params.items():
- if name == "pos_embed":
- continue
-        if not param.requires_grad:
- continue
- if not sharded:
- param_data = param.data
- ema_params[name].mul_(decay).add_(param_data, alpha=1 - decay)
- else:
- if param.data.dtype != torch.float32:
- param_id = id(param)
- master_param = optimizer._param_store.working_to_master_param[param_id]
- param_data = master_param.data
- else:
- param_data = param.data
- ema_params[name].mul_(decay).add_(param_data, alpha=1 - decay)
-
-
-class MaskGenerator:
- def __init__(self, mask_ratios):
- valid_mask_names = [
- "mask_no",
- "mask_quarter_random",
- "mask_quarter_head",
- "mask_quarter_tail",
- "mask_quarter_head_tail",
- "mask_image_random",
- "mask_image_head",
- "mask_image_tail",
- "mask_image_head_tail",
- ]
- assert all(
- mask_name in valid_mask_names for mask_name in mask_ratios.keys()
- ), f"mask_name should be one of {valid_mask_names}, got {mask_ratios.keys()}"
- assert all(
- mask_ratio >= 0 for mask_ratio in mask_ratios.values()
- ), f"mask_ratio should be greater than or equal to 0, got {mask_ratios.values()}"
- assert all(
- mask_ratio <= 1 for mask_ratio in mask_ratios.values()
- ), f"mask_ratio should be less than or equal to 1, got {mask_ratios.values()}"
- # sum of mask_ratios should be 1
- assert math.isclose(
- sum(mask_ratios.values()), 1.0, abs_tol=1e-6
- ), f"sum of mask_ratios should be 1, got {sum(mask_ratios.values())}"
- print(f"mask ratios: {mask_ratios}")
- self.mask_ratios = mask_ratios
-
- def get_mask(self, x):
- mask_type = random.random()
- mask_name = None
- prob_acc = 0.0
- for mask, mask_ratio in self.mask_ratios.items():
- prob_acc += mask_ratio
- if mask_type < prob_acc:
- mask_name = mask
- break
-
- num_frames = x.shape[2]
- # Hardcoded condition_frames
- condition_frames_max = num_frames // 4
-
- mask = torch.ones(num_frames, dtype=torch.bool, device=x.device)
- if num_frames <= 1:
- return mask
-
- if mask_name == "mask_quarter_random":
- random_size = random.randint(1, condition_frames_max)
- random_pos = random.randint(0, x.shape[2] - random_size)
- mask[random_pos : random_pos + random_size] = 0
- elif mask_name == "mask_image_random":
- random_size = 1
- random_pos = random.randint(0, x.shape[2] - random_size)
- mask[random_pos : random_pos + random_size] = 0
- elif mask_name == "mask_quarter_head":
- random_size = random.randint(1, condition_frames_max)
- mask[:random_size] = 0
- elif mask_name == "mask_image_head":
- random_size = 1
- mask[:random_size] = 0
- elif mask_name == "mask_quarter_tail":
- random_size = random.randint(1, condition_frames_max)
- mask[-random_size:] = 0
- elif mask_name == "mask_image_tail":
- random_size = 1
- mask[-random_size:] = 0
- elif mask_name == "mask_quarter_head_tail":
- random_size = random.randint(1, condition_frames_max)
- mask[:random_size] = 0
- mask[-random_size:] = 0
- elif mask_name == "mask_image_head_tail":
- random_size = 1
- mask[:random_size] = 0
- mask[-random_size:] = 0
-
- return mask
-
- def get_masks(self, x):
- masks = []
- for _ in range(len(x)):
- mask = self.get_mask(x)
- masks.append(mask)
- masks = torch.stack(masks, dim=0)
- return masks
-
-def exists(val):
- return val is not None
-
-def default(val, d):
- return val if exists(val) else d
-
-# rotary embedding helper functions
-
-def rotate_half(x):
- x = rearrange(x, '... (d r) -> ... d r', r = 2).contiguous()
- x1, x2 = x.unbind(dim = -1)
- x = torch.stack((-x2, x1), dim = -1)
- return rearrange(x, '... d r -> ... (d r)')
-
-@autocast(enabled = False)
-def apply_rotary_emb(freqs, t, start_index = 0, scale = 1., seq_dim = -2):
- dtype = t.dtype
-
- if t.ndim == 3:
- seq_len = t.shape[seq_dim]
- freqs = freqs[-seq_len:]
-
- rot_dim = freqs.shape[-1]
- end_index = start_index + rot_dim
-
- assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
-
- t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
- t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
- out = torch.cat((t_left, t, t_right), dim = -1)
-
- return out.type(dtype)
-
-# classes
-
-class NpuRotaryEmbedding(Module):
- @beartype
- def __init__(
- self,
- dim,
- custom_freqs: Optional[Tensor] = None,
- freqs_for: Union[
- Literal['lang'],
- Literal['pixel'],
- Literal['constant']
- ] = 'lang',
- theta = 10000,
- max_freq = 10,
- num_freqs = 1,
- learned_freq = False,
- use_xpos = False,
- xpos_scale_base = 512,
- interpolate_factor = 1.,
- theta_rescale_factor = 1.,
- seq_before_head_dim = False,
- cache_if_possible = True
- ):
- super().__init__()
- # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
- # has some connection to NTK literature
-
- theta *= theta_rescale_factor ** (dim / (dim - 2))
-
- self.freqs_for = freqs_for
-
- if exists(custom_freqs):
- freqs = custom_freqs
- elif freqs_for == 'lang':
- freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
- elif freqs_for == 'pixel':
- freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
- elif freqs_for == 'constant':
- freqs = torch.ones(num_freqs).float()
-
- self.cache_if_possible = cache_if_possible
-
- self.tmp_store('cached_freqs', None)
- self.tmp_store('cached_scales', None)
-
- self.freqs = nn.Parameter(freqs, requires_grad = learned_freq)
-
- self.learned_freq = learned_freq
-
- # dummy for device
-
- self.tmp_store('dummy', torch.tensor(0))
-
- # default sequence dimension
-
- self.seq_before_head_dim = seq_before_head_dim
- self.default_seq_dim = -3 if seq_before_head_dim else -2
-
- # interpolation factors
-
- assert interpolate_factor >= 1.
- self.interpolate_factor = interpolate_factor
-
- # xpos
-
- self.use_xpos = use_xpos
- if not use_xpos:
- self.tmp_store('scale', None)
- return
-
- scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
- self.scale_base = xpos_scale_base
- self.tmp_store('scale', scale)
-
- # add apply_rotary_emb as static method
-
- self.apply_rotary_emb = staticmethod(apply_rotary_emb)
-
- @property
- def device(self):
- return self.dummy.device
-
- def tmp_store(self, key, value):
- self.register_buffer(key, value, persistent = False)
-
- def get_seq_pos(self, seq_len, device, dtype, offset = 0):
- return (torch.arange(seq_len, device = device, dtype = dtype) + offset) / self.interpolate_factor
-
- def rotate_queries_or_keys(self, t, seq_dim = None, offset = 0):
- seq_dim = default(seq_dim, self.default_seq_dim)
-
- assert not self.use_xpos, 'you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings'
-
- device, dtype, seq_len = t.device, t.dtype, t.shape[seq_dim]
-
- freqs = self.forward(self.get_seq_pos(seq_len, device = device, dtype = dtype, offset = offset), seq_len = seq_len, offset = offset)
-
- if seq_dim == -3:
- freqs = rearrange(freqs, 'n d -> n 1 d')
-
- return apply_rotary_emb(freqs, t, seq_dim = seq_dim)
-
- def rotate_queries_with_cached_keys(self, q, k, seq_dim = None, offset = 0):
- seq_dim = default(seq_dim, self.default_seq_dim)
-
- q_len, k_len = q.shape[seq_dim], k.shape[seq_dim]
- assert q_len <= k_len
-
- rotated_q = self.rotate_queries_or_keys(q, seq_dim = seq_dim, offset = k_len - q_len + offset)
- rotated_k = self.rotate_queries_or_keys(k, seq_dim = seq_dim, offset = offset)
-
- rotated_q = rotated_q.type(q.dtype)
- rotated_k = rotated_k.type(k.dtype)
-
- return rotated_q, rotated_k
-
- def rotate_queries_and_keys(self, q, k, seq_dim = None):
- seq_dim = default(seq_dim, self.default_seq_dim)
-
- assert self.use_xpos
- device, dtype, seq_len = q.device, q.dtype, q.shape[seq_dim]
-
- seq = self.get_seq_pos(seq_len, dtype = dtype, device = device)
-
- freqs = self.forward(seq, seq_len = seq_len)
- scale = self.get_scale(seq, seq_len = seq_len).to(dtype)
-
- if seq_dim == -3:
- freqs = rearrange(freqs, 'n d -> n 1 d')
- scale = rearrange(scale, 'n d -> n 1 d')
-
- rotated_q = apply_rotary_emb(freqs, q, scale = scale, seq_dim = seq_dim)
- rotated_k = apply_rotary_emb(freqs, k, scale = scale ** -1, seq_dim = seq_dim)
-
- rotated_q = rotated_q.type(q.dtype)
- rotated_k = rotated_k.type(k.dtype)
-
- return rotated_q, rotated_k
-
- @beartype
- def get_scale(
- self,
- t: Tensor,
- seq_len: Optional[int] = None,
- offset = 0
- ):
- assert self.use_xpos
-
- should_cache = (
- self.cache_if_possible and
- exists(seq_len)
- )
-
- if (
- should_cache and \
- exists(self.cached_scales) and \
- (seq_len + offset) <= self.cached_scales.shape[0]
- ):
- return self.cached_scales[offset:(offset + seq_len)]
-
- scale = 1.
- if self.use_xpos:
- power = (t - len(t) // 2) / self.scale_base
- scale = self.scale ** rearrange(power, 'n -> n 1')
- scale = torch.cat((scale, scale), dim = -1)
-
- if should_cache:
- self.tmp_store('cached_scales', scale)
-
- return scale
-
- def get_axial_freqs(self, *dims):
- Colon = slice(None)
- all_freqs = []
-
- for ind, dim in enumerate(dims):
- if self.freqs_for == 'pixel':
- pos = torch.linspace(-1, 1, steps = dim, device = self.device)
- else:
- pos = torch.arange(dim, device = self.device)
-
- freqs = self.forward(pos, seq_len = dim)
-
- all_axis = [None] * len(dims)
- all_axis[ind] = Colon
-
- new_axis_slice = (Ellipsis, *all_axis, Colon)
- all_freqs.append(freqs[new_axis_slice])
-
- all_freqs = broadcast_tensors(*all_freqs)
- return torch.cat(all_freqs, dim = -1)
-
- @autocast(enabled = False)
- def forward(
- self,
- t: Tensor,
- seq_len = None,
- offset = 0
- ):
- should_cache = (
- self.cache_if_possible and \
- not self.learned_freq and \
- exists(seq_len) and \
- self.freqs_for != 'pixel'
- )
-
- if (
- should_cache and \
- exists(self.cached_freqs) and \
- (offset + seq_len) <= self.cached_freqs.shape[0]
- ):
- return self.cached_freqs[offset:(offset + seq_len)].detach()
-
- freqs = self.freqs
-
- freqs = einsum('..., f -> ... f', t.type(freqs.dtype), freqs)
- freqs = repeat(freqs, '... n -> ... (n r)', r = 2)
-
- if should_cache:
- self.tmp_store('cached_freqs', freqs.detach())
-
- return freqs
\ No newline at end of file
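
`MaskGenerator` above turns a dict of mask-name probabilities (which must sum to 1) into per-sample boolean frame masks for the masked-frame training objective. A small sketch (assumes the pre-removal `opensora` layout and its optional deps such as einops and beartype are installed):

    import torch
    from opensora.utils.train_utils import MaskGenerator

    mask_gen = MaskGenerator({"mask_no": 0.75, "mask_quarter_random": 0.25})
    x = torch.zeros(2, 4, 16, 32, 32)        # dummy latents, [B, C, T, H, W]
    masks = mask_gen.get_masks(x)            # bool, shape [2, 16]; 0 marks conditioning frames
    print(masks.shape)
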
diff --git a/PyTorch/built-in/mm/OpenSora1.1/public_address_statement.md b/PyTorch/built-in/mm/OpenSora1.1/public_address_statement.md
deleted file mode 100644
index a52610cb8073c3c616800d536ca65cfbb9b73b2e..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/public_address_statement.md
+++ /dev/null
@@ -1,12 +0,0 @@
-| Type | Upstream source | File | Public IP / public URL / domain / email | Purpose |
-| ------- |-----------------------------------------------------------------------|---------------------------------------------------------------------|------------------------|--------------|
-| Open-source code reference | https://github.com/hpcaitech/Open-Sora/blob/main/opensora/utils/ckpt_utils.py | .\opensora\utils\ckpt_utils.py | https://dl.fbaipublicfiles.com/DiT/models/DiT-XL-2-512x512.pt | Public download URL for model weights |
-| Open-source code reference | https://github.com/hpcaitech/Open-Sora/blob/main/opensora/utils/ckpt_utils.py | .\opensora\utils\ckpt_utils.py | https://dl.fbaipublicfiles.com/DiT/models/DiT-XL-2-256x256.pt | Public download URL for model weights |
-| Open-source code reference | https://github.com/hpcaitech/Open-Sora/blob/main/opensora/utils/ckpt_utils.py | .\opensora\utils\ckpt_utils.py | https://huggingface.co/maxin-cn/Latte/resolve/main/ucf101.pt | Public download URL for model weights |
-| Open-source code reference | https://github.com/hpcaitech/Open-Sora/blob/main/opensora/utils/ckpt_utils.py | .\opensora\utils\ckpt_utils.py | https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-256x256.pth | Public download URL for model weights |
-| Open-source code reference | https://github.com/hpcaitech/Open-Sora/blob/main/opensora/utils/ckpt_utils.py | .\opensora\utils\ckpt_utils.py | https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-SAM-256x256.pth | Public download URL for model weights |
-| Open-source code reference | https://github.com/hpcaitech/Open-Sora/blob/main/opensora/utils/ckpt_utils.py | .\opensora\utils\ckpt_utils.py | https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-512x512.pth | Public download URL for model weights |
-| Open-source code reference | https://github.com/hpcaitech/Open-Sora/blob/main/opensora/utils/ckpt_utils.py | .\opensora\utils\ckpt_utils.py | https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-1024-MS.pth | Public download URL for model weights |
-| Open-source code reference | https://github.com/hpcaitech/Open-Sora/blob/main/opensora/utils/ckpt_utils.py | .\opensora\utils\ckpt_utils.py | https://huggingface.co/hpcai-tech/Open-Sora/resolve/main/OpenSora-v1-16x256x256.pth | Public download URL for model weights |
-| Open-source code reference | https://github.com/hpcaitech/Open-Sora/blob/main/opensora/utils/ckpt_utils.py | .\opensora\utils\ckpt_utils.py | https://huggingface.co/hpcai-tech/Open-Sora/resolve/main/OpenSora-v1-HQ-16x256x256.pth | Public download URL for model weights |
-| Open-source code reference | https://github.com/hpcaitech/Open-Sora/blob/main/opensora/utils/ckpt_utils.py | .\opensora\utils\ckpt_utils.py | https://huggingface.co/hpcai-tech/Open-Sora/resolve/main/OpenSora-v1-HQ-16x512x512.pth | Public download URL for model weights |
diff --git a/PyTorch/built-in/mm/OpenSora1.1/requirements.txt b/PyTorch/built-in/mm/OpenSora1.1/requirements.txt
deleted file mode 100644
index e8031a810f0a78ad9431e86297296f8401f5ca86..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/requirements.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-colossalai
-accelerate
-diffusers
-ftfy
-gdown
-mmengine
-pandas
-pre-commit
-pyarrow
-pyav
-tensorboard
-timm
-tqdm
-transformers
-wandb
-rotary_embedding_torch
-pandarallel
diff --git a/PyTorch/built-in/mm/OpenSora1.1/requirements_npu.txt b/PyTorch/built-in/mm/OpenSora1.1/requirements_npu.txt
deleted file mode 100644
index 8f7ad202a9d4cf0ce964b344642827eea5e43034..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/requirements_npu.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-torch==2.1.0
-torchvision==0.16.0
-colossalai==0.3.7
-accelerate
-diffusers
-ftfy
-gdown
-mmengine
-pandas
-pre-commit
-pyarrow
-av
-tensorboard
-timm
-tqdm
-transformers
-wandb
-rotary_embedding_torch
-pandarallel
-scipy
-decorator
-attrs
-huggingface_hub==0.25.2
\ No newline at end of file
diff --git a/PyTorch/built-in/mm/OpenSora1.1/scripts/inference-long.py b/PyTorch/built-in/mm/OpenSora1.1/scripts/inference-long.py
deleted file mode 100644
index eec19cfeee0320f007c0c4f677bed8349bbf9ba4..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/scripts/inference-long.py
+++ /dev/null
@@ -1,318 +0,0 @@
-import json
-import os
-import re
-
-import colossalai
-import torch
-import torch.distributed as dist
-from colossalai.cluster import DistCoordinator
-from mmengine.runner import set_random_seed
-
-from opensora.acceleration.parallel_states import set_sequence_parallel_group
-from opensora.datasets import IMG_FPS, save_sample
-from opensora.datasets.utils import read_from_path
-from opensora.models.text_encoder.t5 import text_preprocessing
-from opensora.registry import MODELS, SCHEDULERS, build_module
-from opensora.utils.config_utils import parse_configs
-from opensora.utils.misc import to_torch_dtype
-
-
-def collect_references_batch(reference_paths, vae, image_size):
- refs_x = []
- for reference_path in reference_paths:
- if reference_path is None:
- refs_x.append([])
- continue
- ref_path = reference_path.split(";")
- ref = []
- for r_path in ref_path:
- r = read_from_path(r_path, image_size, transform_name="resize_crop")
- r_x = vae.encode(r.unsqueeze(0).to(vae.device, vae.dtype))
- r_x = r_x.squeeze(0)
- ref.append(r_x)
- refs_x.append(ref)
- # refs_x: [batch, ref_num, C, T, H, W]
- return refs_x
-
-
-def process_mask_strategy(mask_strategy):
- mask_batch = []
- mask_strategy = mask_strategy.split(";")
- for mask in mask_strategy:
- mask_group = mask.split(",")
- assert len(mask_group) >= 1 and len(mask_group) <= 6, f"Invalid mask strategy: {mask}"
- if len(mask_group) == 1:
- mask_group.extend(["0", "0", "0", "1", "0"])
- elif len(mask_group) == 2:
- mask_group.extend(["0", "0", "1", "0"])
- elif len(mask_group) == 3:
- mask_group.extend(["0", "1", "0"])
- elif len(mask_group) == 4:
- mask_group.extend(["1", "0"])
- elif len(mask_group) == 5:
- mask_group.append("0")
- mask_batch.append(mask_group)
- return mask_batch
-
-
-def apply_mask_strategy(z, refs_x, mask_strategys, loop_i):
- masks = []
- for i, mask_strategy in enumerate(mask_strategys):
- mask = torch.ones(z.shape[2], dtype=torch.float, device=z.device)
- if mask_strategy is None:
- masks.append(mask)
- continue
- mask_strategy = process_mask_strategy(mask_strategy)
- for mst in mask_strategy:
- loop_id, m_id, m_ref_start, m_target_start, m_length, edit_ratio = mst
- loop_id = int(loop_id)
- if loop_id != loop_i:
- continue
- m_id = int(m_id)
- m_ref_start = int(m_ref_start)
- m_length = int(m_length)
- m_target_start = int(m_target_start)
- edit_ratio = float(edit_ratio)
- ref = refs_x[i][m_id] # [C, T, H, W]
- if m_ref_start < 0:
- m_ref_start = ref.shape[1] + m_ref_start
- if m_target_start < 0:
- # z: [B, C, T, H, W]
- m_target_start = z.shape[2] + m_target_start
- z[i, :, m_target_start : m_target_start + m_length] = ref[:, m_ref_start : m_ref_start + m_length]
- mask[m_target_start : m_target_start + m_length] = edit_ratio
- masks.append(mask)
- masks = torch.stack(masks)
- return masks
-
-
-def process_prompts(prompts, num_loop):
- ret_prompts = []
- for prompt in prompts:
- if prompt.startswith("|0|"):
- prompt_list = prompt.split("|")[1:]
- text_list = []
- for i in range(0, len(prompt_list), 2):
- start_loop = int(prompt_list[i])
- text = prompt_list[i + 1]
- text = text_preprocessing(text)
- end_loop = int(prompt_list[i + 2]) if i + 2 < len(prompt_list) else num_loop
- text_list.extend([text] * (end_loop - start_loop))
- assert len(text_list) == num_loop, f"Prompt loop mismatch: {len(text_list)} != {num_loop}"
- ret_prompts.append(text_list)
- else:
- prompt = text_preprocessing(prompt)
- ret_prompts.append([prompt] * num_loop)
- return ret_prompts
-
-
-def extract_json_from_prompts(prompts):
- additional_infos = []
- ret_prompts = []
- for prompt in prompts:
- parts = re.split(r"(?=[{\[])", prompt)
- assert len(parts) <= 2, f"Invalid prompt: {prompt}"
- ret_prompts.append(parts[0])
- if len(parts) == 1:
- additional_infos.append({})
- else:
- additional_infos.append(json.loads(parts[1]))
- return ret_prompts, additional_infos
-
-
-def main():
- # ======================================================
- # 1. cfg and init distributed env
- # ======================================================
- cfg = parse_configs(training=False)
- print(cfg)
-
- # init distributed
- if os.environ.get("WORLD_SIZE", None):
- use_dist = True
- colossalai.launch_from_torch({})
- coordinator = DistCoordinator()
-
- if coordinator.world_size > 1:
- set_sequence_parallel_group(dist.group.WORLD)
- enable_sequence_parallelism = True
- else:
- enable_sequence_parallelism = False
- else:
- use_dist = False
- enable_sequence_parallelism = False
-
- # ======================================================
- # 2. runtime variables
- # ======================================================
- torch.set_grad_enabled(False)
- torch.backends.cuda.matmul.allow_tf32 = True
- torch.backends.cudnn.allow_tf32 = True
- device = "cuda" if torch.cuda.is_available() else "cpu"
- dtype = to_torch_dtype(cfg.dtype)
- set_random_seed(seed=cfg.seed)
- prompts = cfg.prompt
-
- # ======================================================
- # 3. build model & load weights
- # ======================================================
- # 3.1. build model
- input_size = (cfg.num_frames, *cfg.image_size)
- vae = build_module(cfg.vae, MODELS)
- latent_size = vae.get_latent_size(input_size)
- text_encoder = build_module(cfg.text_encoder, MODELS, device=device) # T5 must be fp32
- model = build_module(
- cfg.model,
- MODELS,
- input_size=latent_size,
- in_channels=vae.out_channels,
- caption_channels=text_encoder.output_dim,
- model_max_length=text_encoder.model_max_length,
- dtype=dtype,
- enable_sequence_parallelism=enable_sequence_parallelism,
- )
- text_encoder.y_embedder = model.y_embedder # hack for classifier-free guidance
-
- # 3.2. move to device & eval
- vae = vae.to(device, dtype).eval()
- model = model.to(device, dtype).eval()
-
- # 3.3. build scheduler
- scheduler = build_module(cfg.scheduler, SCHEDULERS)
-
- # 3.4. support for multi-resolution
- model_args = dict()
- if cfg.multi_resolution == "PixArtMS":
- image_size = cfg.image_size
- hw = torch.tensor([image_size], device=device, dtype=dtype).repeat(cfg.batch_size, 1)
- ar = torch.tensor([[image_size[0] / image_size[1]]], device=device, dtype=dtype).repeat(cfg.batch_size, 1)
- model_args["data_info"] = dict(ar=ar, hw=hw)
- elif cfg.multi_resolution == "STDiT2":
- image_size = cfg.image_size
- height = torch.tensor([image_size[0]], device=device, dtype=dtype).repeat(cfg.batch_size)
- width = torch.tensor([image_size[1]], device=device, dtype=dtype).repeat(cfg.batch_size)
- num_frames = torch.tensor([cfg.num_frames], device=device, dtype=dtype).repeat(cfg.batch_size)
- ar = torch.tensor([image_size[0] / image_size[1]], device=device, dtype=dtype).repeat(cfg.batch_size)
- if cfg.num_frames == 1:
- cfg.fps = IMG_FPS
- fps = torch.tensor([cfg.fps], device=device, dtype=dtype).repeat(cfg.batch_size)
- model_args["height"] = height
- model_args["width"] = width
- model_args["num_frames"] = num_frames
- model_args["ar"] = ar
- model_args["fps"] = fps
-
- # 3.5 reference
- if cfg.reference_path is not None:
- assert len(cfg.reference_path) == len(
- prompts
- ), f"Reference path mismatch: {len(cfg.reference_path)} != {len(prompts)}"
- assert len(cfg.reference_path) == len(
- cfg.mask_strategy
- ), f"Mask strategy mismatch: {len(cfg.mask_strategy)} != {len(prompts)}"
- else:
- cfg.reference_path = [None] * len(prompts)
- cfg.mask_strategy = [None] * len(prompts)
-
- # ======================================================
- # 4. inference
- # ======================================================
- sample_idx = 0
- if cfg.sample_name is not None:
- sample_name = cfg.sample_name
- elif cfg.prompt_as_path:
- sample_name = ""
- else:
- sample_name = "sample"
- save_dir = cfg.save_dir
- os.makedirs(save_dir, exist_ok=True)
-
- # 4.1. batch generation
- for i in range(0, len(prompts), cfg.batch_size):
- batch_prompts_raw = prompts[i : i + cfg.batch_size]
- batch_prompts_raw, additional_infos = extract_json_from_prompts(batch_prompts_raw)
- batch_prompts_loops = process_prompts(batch_prompts_raw, cfg.loop)
- # handle the last batch
- if len(batch_prompts_raw) < cfg.batch_size and cfg.multi_resolution == "STDiT2":
- model_args["height"] = model_args["height"][: len(batch_prompts_raw)]
- model_args["width"] = model_args["width"][: len(batch_prompts_raw)]
- model_args["num_frames"] = model_args["num_frames"][: len(batch_prompts_raw)]
- model_args["ar"] = model_args["ar"][: len(batch_prompts_raw)]
- model_args["fps"] = model_args["fps"][: len(batch_prompts_raw)]
-
- # 4.2. load reference videos & images
- for j, info in enumerate(additional_infos):
- if "reference_path" in info:
- cfg.reference_path[i + j] = info["reference_path"]
- if "mask_strategy" in info:
- cfg.mask_strategy[i + j] = info["mask_strategy"]
- refs_x = collect_references_batch(cfg.reference_path[i : i + cfg.batch_size], vae, cfg.image_size)
- mask_strategy = cfg.mask_strategy[i : i + cfg.batch_size]
-
- # 4.3. diffusion sampling
- old_sample_idx = sample_idx
- # generate multiple samples for each prompt
- for k in range(cfg.num_sample):
- sample_idx = old_sample_idx
- video_clips = []
-
- # 4.4. long video generation
- for loop_i in range(cfg.loop):
- # 4.4 sample in hidden space
- batch_prompts = [prompt[loop_i] for prompt in batch_prompts_loops]
-
- # 4.5. apply mask strategy
- masks = None
- # if cfg.reference_path is not None:
- if loop_i > 0:
- ref_x = vae.encode(video_clips[-1])
- for j, refs in enumerate(refs_x):
- if refs is None:
- refs_x[j] = [ref_x[j]]
- else:
- refs.append(ref_x[j])
- if mask_strategy[j] is None:
- mask_strategy[j] = ""
- else:
- mask_strategy[j] += ";"
- mask_strategy[
- j
- ] += f"{loop_i},{len(refs)-1},-{cfg.condition_frame_length},0,{cfg.condition_frame_length}"
-
- # sampling
- z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
- masks = apply_mask_strategy(z, refs_x, mask_strategy, loop_i)
- samples = scheduler.sample(
- model,
- text_encoder,
- z=z,
- prompts=batch_prompts,
- device=device,
- additional_args=model_args,
- mask=masks, # scheduler must support mask
- )
- samples = vae.decode(samples.to(dtype))
- video_clips.append(samples)
-
-                # 4.6. save video
- if loop_i == cfg.loop - 1:
- if not use_dist or coordinator.is_master():
- for idx in range(len(video_clips[0])):
- video_clips_i = [video_clips[0][idx]] + [
- video_clips[i][idx][:, cfg.condition_frame_length :] for i in range(1, cfg.loop)
- ]
- video = torch.cat(video_clips_i, dim=1)
- print(f"Prompt: {batch_prompts_raw[idx]}")
- if cfg.prompt_as_path:
- sample_name_suffix = batch_prompts_raw[idx]
- else:
- sample_name_suffix = f"_{sample_idx}"
- save_path = os.path.join(save_dir, f"{sample_name}{sample_name_suffix}")
- if cfg.num_sample != 1:
- save_path = f"{save_path}-{k}"
- save_sample(video, fps=cfg.fps // cfg.frame_interval, save_path=save_path)
- sample_idx += 1
-
-
-if __name__ == "__main__":
- main()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/scripts/inference.py b/PyTorch/built-in/mm/OpenSora1.1/scripts/inference.py
deleted file mode 100644
index 602710160373d2d65e445ee189caf52411148ac1..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/scripts/inference.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-import os
-import time
-
-import colossalai
-import torch
-import torch.distributed as dist
-from colossalai.cluster import DistCoordinator
-from mmengine.runner import set_random_seed
-
-from opensora.acceleration.parallel_states import set_sequence_parallel_group
-from opensora.datasets import IMG_FPS, save_sample
-from opensora.models.text_encoder.t5 import text_preprocessing
-from opensora.registry import MODELS, SCHEDULERS, build_module
-from opensora.utils.config_utils import parse_configs
-from opensora.utils.misc import to_torch_dtype
-from opensora.utils.device_utils import is_npu_available
-if is_npu_available():
- from torch_npu.contrib import transfer_to_npu
- torch.npu.config.allow_internal_format = False
-
-
-def main():
- # ======================================================
- # 1. cfg and init distributed env
- # ======================================================
- cfg = parse_configs(training=False)
- print(cfg)
-
- # init distributed
- if os.environ.get("WORLD_SIZE", None):
- use_dist = True
- colossalai.launch_from_torch({})
- coordinator = DistCoordinator()
-
- if coordinator.world_size > 1:
- set_sequence_parallel_group(dist.group.WORLD)
- enable_sequence_parallelism = True
- else:
- enable_sequence_parallelism = False
- else:
- use_dist = False
- enable_sequence_parallelism = False
-
- # ======================================================
- # 2. runtime variables
- # ======================================================
- torch.set_grad_enabled(False)
- torch.backends.cuda.matmul.allow_tf32 = True
- torch.backends.cudnn.allow_tf32 = True
- device = "cuda" if torch.cuda.is_available() else "cpu"
- dtype = to_torch_dtype(cfg.dtype)
- set_random_seed(seed=cfg.seed)
- prompts = cfg.prompt
-
- # ======================================================
- # 3. build model & load weights
- # ======================================================
- # 3.1. build model
- input_size = (cfg.num_frames, *cfg.image_size)
- vae = build_module(cfg.vae, MODELS)
- latent_size = vae.get_latent_size(input_size)
- text_encoder = build_module(cfg.text_encoder, MODELS, device=device) # T5 must be fp32
- model = build_module(
- cfg.model,
- MODELS,
- input_size=latent_size,
- in_channels=vae.out_channels,
- caption_channels=text_encoder.output_dim,
- model_max_length=text_encoder.model_max_length,
- dtype=dtype,
- enable_sequence_parallelism=enable_sequence_parallelism,
- )
- text_encoder.y_embedder = model.y_embedder # hack for classifier-free guidance
-
- # 3.2. move to device & eval
- vae = vae.to(device, dtype).eval()
- model = model.to(device, dtype).eval()
-
- # 3.3. build scheduler
- scheduler = build_module(cfg.scheduler, SCHEDULERS)
-
- # 3.4. support for multi-resolution
- model_args = dict()
- if cfg.multi_resolution == "PixArtMS":
- image_size = cfg.image_size
- hw = torch.tensor([image_size], device=device, dtype=dtype).repeat(cfg.batch_size, 1)
- ar = torch.tensor([[image_size[0] / image_size[1]]], device=device, dtype=dtype).repeat(cfg.batch_size, 1)
- model_args["data_info"] = dict(ar=ar, hw=hw)
- elif cfg.multi_resolution == "STDiT2":
- image_size = cfg.image_size
- height = torch.tensor([image_size[0]], device=device, dtype=dtype).repeat(cfg.batch_size)
- width = torch.tensor([image_size[1]], device=device, dtype=dtype).repeat(cfg.batch_size)
- num_frames = torch.tensor([cfg.num_frames], device=device, dtype=dtype).repeat(cfg.batch_size)
- ar = torch.tensor([image_size[0] / image_size[1]], device=device, dtype=dtype).repeat(cfg.batch_size)
- if cfg.num_frames == 1:
- cfg.fps = IMG_FPS
- fps = torch.tensor([cfg.fps], device=device, dtype=dtype).repeat(cfg.batch_size)
- model_args["height"] = height
- model_args["width"] = width
- model_args["num_frames"] = num_frames
- model_args["ar"] = ar
- model_args["fps"] = fps
-
- # ======================================================
- # 4. inference
- # ======================================================
- sample_idx = 0
- if cfg.sample_name is not None:
- sample_name = cfg.sample_name
- elif cfg.prompt_as_path:
- sample_name = ""
- else:
- sample_name = "sample"
- save_dir = cfg.save_dir
- os.makedirs(save_dir, exist_ok=True)
-
- # 4.1. batch generation
- for i in range(0, len(prompts), cfg.batch_size):
- step_start_time = time.time()
-        # 4.2. prepare the batch prompts
- batch_prompts_raw = prompts[i : i + cfg.batch_size]
- batch_prompts = [text_preprocessing(prompt) for prompt in batch_prompts_raw]
- # handle the last batch
- if len(batch_prompts_raw) < cfg.batch_size and cfg.multi_resolution == "STDiT2":
- model_args["height"] = model_args["height"][: len(batch_prompts_raw)]
- model_args["width"] = model_args["width"][: len(batch_prompts_raw)]
- model_args["num_frames"] = model_args["num_frames"][: len(batch_prompts_raw)]
- model_args["ar"] = model_args["ar"][: len(batch_prompts_raw)]
- model_args["fps"] = model_args["fps"][: len(batch_prompts_raw)]
- step_data_time = time.time()
-
- # 4.3. diffusion sampling
- old_sample_idx = sample_idx
- # generate multiple samples for each prompt
- for k in range(cfg.num_sample):
- sample_idx = old_sample_idx
-
- # Skip if the sample already exists
-            # This is useful for resuming VBench sampling
- if cfg.prompt_as_path:
- skip = True
- for batch_prompt in batch_prompts_raw:
- path = os.path.join(save_dir, f"{sample_name}{batch_prompt}")
- if cfg.num_sample != 1:
- path = f"{path}-{k}"
- path = f"{path}.mp4"
- if not os.path.exists(path):
- skip = False
- break
- if skip:
- continue
-
- # sampling
- z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
- samples = scheduler.sample(
- model,
- text_encoder,
- z=z,
- prompts=batch_prompts,
- device=device,
- additional_args=model_args,
- )
- sample_time = time.time()
- samples = vae.decode(samples.to(dtype))
-
- # 4.4. save samples
- if not use_dist or coordinator.is_master():
- for idx, sample in enumerate(samples):
- print(f"Prompt: {batch_prompts_raw[idx]}")
- if cfg.prompt_as_path:
- sample_name_suffix = batch_prompts_raw[idx]
- else:
- sample_name_suffix = f"_{sample_idx}"
- save_path = os.path.join(save_dir, f"{sample_name}{sample_name_suffix}")
- if cfg.num_sample != 1:
- save_path = f"{save_path}-{k}"
- save_sample(sample, fps=cfg.fps // cfg.frame_interval, save_path=save_path)
- sample_idx += 1
- write_video_time = time.time()
- print(f"step {i} step_data_time {step_data_time - step_start_time} | "
- f"denoise_time {sample_time - step_data_time} | "
- f"step_infer_time {write_video_time - step_start_time} | "
- f"FPS {cfg.batch_size / (write_video_time - step_start_time)}")
-
-
-if __name__ == "__main__":
- main()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/scripts/misc/search_bs.py b/PyTorch/built-in/mm/OpenSora1.1/scripts/misc/search_bs.py
deleted file mode 100644
index d6789dff551d57d35036f707f98630b447273b19..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/scripts/misc/search_bs.py
+++ /dev/null
@@ -1,457 +0,0 @@
-import argparse
-import time
-import traceback
-from copy import deepcopy
-
-import colossalai
-import torch
-import torch.distributed as dist
-from colossalai.booster import Booster
-from colossalai.booster.plugin import LowLevelZeroPlugin
-from colossalai.cluster import DistCoordinator
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
-from mmengine.config import Config
-from tqdm import tqdm
-
-from opensora.acceleration.checkpoint import set_grad_checkpoint
-from opensora.acceleration.parallel_states import (
- get_data_parallel_group,
- set_data_parallel_group,
- set_sequence_parallel_group,
-)
-from opensora.acceleration.plugin import ZeroSeqParallelPlugin
-from opensora.datasets import prepare_variable_dataloader
-from opensora.registry import DATASETS, MODELS, SCHEDULERS, build_module
-from opensora.utils.ckpt_utils import model_sharding
-from opensora.utils.config_utils import merge_args, parse_configs
-from opensora.utils.misc import format_numel_str, get_model_numel, requires_grad, to_torch_dtype
-from opensora.utils.train_utils import MaskGenerator, update_ema
-
-
-class BColors:
- HEADER = "\033[95m"
- OKBLUE = "\033[94m"
- OKCYAN = "\033[96m"
- OKGREEN = "\033[92m"
- WARNING = "\033[93m"
- FAIL = "\033[91m"
- ENDC = "\033[0m"
- BOLD = "\033[1m"
- UNDERLINE = "\033[4m"
-
-
-# BUCKETS = [
-# ("240p", 16),
-# ("240p", 32),
-# ("240p", 64),
-# ("240p", 128),
-# ("256", 1),
-# ("512", 1),
-# ("480p", 1),
-# ("480p", 16),
-# ("480p", 32),
-# ("720p", 16),
-# ("720p", 32),
-# ("1024", 1),
-# ("1080p", 1),
-# ]
-
-
-def parse_configs():
- parser = argparse.ArgumentParser()
- parser.add_argument("config", help="model config file path")
- parser.add_argument("-o", "--output", help="output config file path", default="output_config.py")
-
- parser.add_argument("--seed", default=42, type=int, help="generation seed")
- parser.add_argument(
- "--ckpt-path",
- type=str,
- help="path to model ckpt; will overwrite cfg.ckpt_path if specified",
- )
- parser.add_argument("--data-path", default=None, type=str, help="path to data csv", required=True)
- parser.add_argument("--warmup-steps", default=1, type=int, help="warmup steps")
- parser.add_argument("--active-steps", default=1, type=int, help="active steps")
- parser.add_argument("--base-resolution", default="240p", type=str, help="base resolution")
- parser.add_argument("--base-frames", default=128, type=int, help="base frames")
- parser.add_argument("--batch-size-start", default=2, type=int, help="batch size start")
- parser.add_argument("--batch-size-end", default=256, type=int, help="batch size end")
- parser.add_argument("--batch-size-step", default=2, type=int, help="batch size step")
- args = parser.parse_args()
- cfg = Config.fromfile(args.config)
- cfg = merge_args(cfg, args, training=True)
- return cfg, args
-
-
-def rewrite_config(cfg, resolution, num_frames, batch_size):
- cfg.bucket_config = {resolution: {num_frames: (1.0, batch_size)}}
- return cfg
-
-
-def update_bucket_config_bs(bucket_config, resolution, num_frames, batch_size):
- p, _ = bucket_config[resolution][num_frames]
- bucket_config[resolution][num_frames] = (p, batch_size)
-
-
-def main():
- # ======================================================
- # 1. args & cfg
- # ======================================================
- cfg, args = parse_configs()
- print(cfg)
- assert cfg.dataset.type == "VariableVideoTextDataset", "Only VariableVideoTextDataset is supported"
-
- # ======================================================
- # 2. runtime variables & colossalai launch
- # ======================================================
- assert torch.cuda.is_available(), "Training currently requires at least one GPU."
- assert cfg.dtype in ["fp16", "bf16"], f"Unknown mixed precision {cfg.dtype}"
-
- # 2.1. colossalai init distributed training
- colossalai.launch_from_torch({})
- coordinator = DistCoordinator()
- device = get_current_device()
- dtype = to_torch_dtype(cfg.dtype)
-
- # 2.3. initialize ColossalAI booster
- if cfg.plugin == "zero2":
- plugin = LowLevelZeroPlugin(
- stage=2,
- precision=cfg.dtype,
- initial_scale=2**16,
- max_norm=cfg.grad_clip,
- )
- set_data_parallel_group(dist.group.WORLD)
- elif cfg.plugin == "zero2-seq":
- plugin = ZeroSeqParallelPlugin(
- sp_size=cfg.sp_size,
- stage=2,
- precision=cfg.dtype,
- initial_scale=2**16,
- max_norm=cfg.grad_clip,
- )
- set_sequence_parallel_group(plugin.sp_group)
- set_data_parallel_group(plugin.dp_group)
- else:
- raise ValueError(f"Unknown plugin {cfg.plugin}")
- booster = Booster(plugin=plugin)
-
- # ======================================================
- # 4. build model
- # ======================================================
- # 4.1. build model
- text_encoder = build_module(cfg.text_encoder, MODELS, device=device)
- vae = build_module(cfg.vae, MODELS)
- input_size = (cfg.dataset.num_frames, *cfg.dataset.image_size)
- latent_size = vae.get_latent_size(input_size)
- model = build_module(
- cfg.model,
- MODELS,
- input_size=latent_size,
- in_channels=vae.out_channels,
- caption_channels=text_encoder.output_dim,
- model_max_length=text_encoder.model_max_length,
- dtype=dtype,
- )
- model_numel, model_numel_trainable = get_model_numel(model)
- coordinator.print_on_master(
- f"Trainable model params: {format_numel_str(model_numel_trainable)}, Total model params: {format_numel_str(model_numel)}"
- )
-
- # 4.2. create ema
- ema = deepcopy(model).to(torch.float32).to(device)
- requires_grad(ema, False)
-
- # 4.3. move to device
- vae = vae.to(device, dtype)
- model = model.to(device, dtype)
-
- # 4.4. build scheduler
- scheduler = build_module(cfg.scheduler, SCHEDULERS)
-
- # 4.5. setup optimizer
- optimizer = HybridAdam(
- filter(lambda p: p.requires_grad, model.parameters()),
- lr=cfg.lr,
- weight_decay=0,
- adamw_mode=True,
- )
- lr_scheduler = None
-
- # 4.6. prepare for training
- if cfg.grad_checkpoint:
- set_grad_checkpoint(model)
- model.train()
- update_ema(ema, model, decay=0, sharded=False)
- ema.eval()
- if cfg.mask_ratios is not None:
- mask_generator = MaskGenerator(cfg.mask_ratios)
- else:
- mask_generator = None
-
- # =======================================================
- # 5. boost model for distributed training with colossalai
- # =======================================================
- torch.set_default_dtype(dtype)
- model, optimizer, _, _, lr_scheduler = booster.boost(
- model=model,
- optimizer=optimizer,
- lr_scheduler=lr_scheduler,
- )
- torch.set_default_dtype(torch.float)
- coordinator.print_on_master("Boost model for distributed training")
-
- model_sharding(ema)
-
- buckets = [
- (res, f) for res, d in cfg.bucket_config.items() for f, (p, bs) in d.items() if bs is not None and p > 0.0
- ]
- output_bucket_cfg = deepcopy(cfg.bucket_config)
- # find the base batch size
- assert (args.base_resolution, args.base_frames) in buckets
- del buckets[buckets.index((args.base_resolution, args.base_frames))]
- base_batch_size, base_step_time = benchmark(
- args,
- cfg,
- args.base_resolution,
- args.base_frames,
- device,
- dtype,
- booster,
- vae,
- text_encoder,
- model,
- mask_generator,
- scheduler,
- optimizer,
- ema,
- )
- update_bucket_config_bs(output_bucket_cfg, args.base_resolution, args.base_frames, base_batch_size)
- coordinator.print_on_master(
- f"{BColors.OKBLUE}Base resolution: {args.base_resolution}, Base frames: {args.base_frames}, Batch size: {base_batch_size}, Base step time: {base_step_time}{BColors.ENDC}"
- )
- result_table = [f"{args.base_resolution}, {args.base_frames}, {base_batch_size}, {base_step_time:.2f}"]
- for resolution, frames in buckets:
- try:
- batch_size, step_time = benchmark(
- args,
- cfg,
- resolution,
- frames,
- device,
- dtype,
- booster,
- vae,
- text_encoder,
- model,
- mask_generator,
- scheduler,
- optimizer,
- ema,
- target_step_time=base_step_time,
- )
- coordinator.print_on_master(
- f"{BColors.OKBLUE}Resolution: {resolution}, Frames: {frames}, Batch size: {batch_size}, Step time: {step_time}{BColors.ENDC}"
- )
- update_bucket_config_bs(output_bucket_cfg, resolution, frames, batch_size)
- result_table.append(f"{resolution}, {frames}, {batch_size}, {step_time:.2f}")
- except RuntimeError:
- pass
- result_table = "\n".join(result_table)
- coordinator.print_on_master(
- f"{BColors.OKBLUE}Resolution, Frames, Batch size, Step time\n{result_table}{BColors.ENDC}"
- )
- coordinator.print_on_master(f"{BColors.OKBLUE}{output_bucket_cfg}{BColors.ENDC}")
- if coordinator.is_master():
- cfg.bucket_config = output_bucket_cfg
- cfg.dump(args.output)
-
-
-def benchmark(
- args,
- cfg,
- resolution,
- num_frames,
- device,
- dtype,
- booster,
- vae,
- text_encoder,
- model,
- mask_generator,
- scheduler,
- optimizer,
- ema,
- target_step_time=None,
-):
- batch_sizes = []
- step_times = []
-
- def run_step(bs) -> float:
- step_time = train(
- args,
- cfg,
- resolution,
- num_frames,
- bs,
- device,
- dtype,
- booster,
- vae,
- text_encoder,
- model,
- mask_generator,
- scheduler,
- optimizer,
- ema,
- )
- step_times.append(step_time)
- batch_sizes.append(bs)
- return step_time
-
- orig_bs = cfg.bucket_config[resolution][num_frames][1]
- lower_bound = args.batch_size_start
- upper_bound = args.batch_size_end
- step_size = args.batch_size_step
- if isinstance(orig_bs, tuple):
- if len(orig_bs) == 1:
- upper_bound = orig_bs[0]
- elif len(orig_bs) == 2:
- lower_bound, upper_bound = orig_bs
- elif len(orig_bs) == 3:
- lower_bound, upper_bound, step_size = orig_bs
- batch_start_size = lower_bound
-
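-    # Binary search for the largest batch size whose training step completes without raising
-    # (typically an OOM): a successful step raises the lower bound, a failure shrinks the
-    # upper bound, so upper_bound converges to the first infeasible batch size.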
- while lower_bound < upper_bound:
- mid = (lower_bound + upper_bound) // 2
- try:
- step_time = run_step(mid)
- lower_bound = mid + 1
- except Exception:
- traceback.print_exc()
- upper_bound = mid
-
- for batch_size in range(batch_start_size, upper_bound, step_size):
- if batch_size in batch_sizes:
- continue
- step_time = run_step(batch_size)
- if len(step_times) == 0:
- raise RuntimeError("No valid batch size found")
- if target_step_time is None:
- # find the fastest batch size
- throughputs = [batch_size / step_time for step_time, batch_size in zip(step_times, batch_sizes)]
- max_throughput = max(throughputs)
- target_batch_size = batch_sizes[throughputs.index(max_throughput)]
- step_time = step_times[throughputs.index(max_throughput)]
- else:
- # find the batch size that meets the target step time
- diff = [abs(t - target_step_time) for t in step_times]
- closest_step_time = min(diff)
- target_batch_size = batch_sizes[diff.index(closest_step_time)]
- step_time = step_times[diff.index(closest_step_time)]
- return target_batch_size, step_time
-
-
-def train(
- args,
- cfg,
- resolution,
- num_frames,
- batch_size,
- device,
- dtype,
- booster,
- vae,
- text_encoder,
- model,
- mask_generator,
- scheduler,
- optimizer,
- ema,
-):
- total_steps = args.warmup_steps + args.active_steps
- cfg = rewrite_config(deepcopy(cfg), resolution, num_frames, batch_size)
-
- dataset = build_module(cfg.dataset, DATASETS)
- dataset.dummy = True
- dataloader_args = dict(
- dataset=dataset,
- batch_size=cfg.batch_size,
- num_workers=cfg.num_workers,
- shuffle=True,
- drop_last=True,
- pin_memory=True,
- process_group=get_data_parallel_group(),
- )
- dataloader = prepare_variable_dataloader(
- bucket_config=cfg.bucket_config,
- **dataloader_args,
- )
- dataloader_iter = iter(dataloader)
- num_steps_per_epoch = dataloader.batch_sampler.get_num_batch() // dist.get_world_size()
-
- assert num_steps_per_epoch >= total_steps, f"num_steps_per_epoch={num_steps_per_epoch} < total_steps={total_steps}"
- duration = 0
- # this is essential for the first iteration after OOM
- optimizer._grad_store.reset_all_gradients()
- optimizer._bucket_store.reset_num_elements_in_bucket()
- optimizer._bucket_store.grad_to_param_mapping = dict()
- optimizer._bucket_store._grad_in_bucket = dict()
- optimizer._bucket_store._param_list = []
- optimizer._bucket_store._padding_size = []
- for rank in range(optimizer._bucket_store._world_size):
- optimizer._bucket_store._grad_in_bucket[rank] = []
- optimizer._bucket_store.offset_list = [0]
- optimizer.zero_grad()
- for step, batch in tqdm(
- enumerate(dataloader_iter),
- desc=f"{resolution}:{num_frames} bs={batch_size}",
- total=total_steps,
- ):
- if step >= total_steps:
- break
- if step >= args.warmup_steps:
- start = time.time()
-
- x = batch.pop("video").to(device, dtype) # [B, C, T, H, W]
- y = batch.pop("text")
- # Visual and text encoding
- with torch.no_grad():
- # Prepare visual inputs
- x = vae.encode(x) # [B, C, T, H/P, W/P]
- # Prepare text inputs
- model_args = text_encoder.encode(y)
-
- # Mask
- if cfg.mask_ratios is not None:
- mask = mask_generator.get_masks(x)
- model_args["x_mask"] = mask
- else:
- mask = None
-
- # Video info
- for k, v in batch.items():
- model_args[k] = v.to(device, dtype)
-
- # Diffusion
- t = torch.randint(0, scheduler.num_timesteps, (x.shape[0],), device=device)
- loss_dict = scheduler.training_losses(model, x, t, model_args, mask=mask)
-
- # Backward & update
- loss = loss_dict["loss"].mean()
- booster.backward(loss=loss, optimizer=optimizer)
- optimizer.step()
- optimizer.zero_grad()
-
- # Update EMA
- update_ema(ema, model.module, optimizer=optimizer)
- if step >= args.warmup_steps:
- end = time.time()
- duration += end - start
-
- avg_step_time = duration / args.active_steps
- return avg_step_time
-
-
-if __name__ == "__main__":
- main()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/scripts/train.py b/PyTorch/built-in/mm/OpenSora1.1/scripts/train.py
deleted file mode 100644
index d2d59f1df83fe64482a5dee42d1ce089639ae9da..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/scripts/train.py
+++ /dev/null
@@ -1,351 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-from copy import deepcopy
-from datetime import timedelta
-from pprint import pprint
-import time
-
-import torch
-import torch.distributed as dist
-import wandb
-from colossalai.booster import Booster
-from colossalai.booster.plugin import LowLevelZeroPlugin
-from colossalai.cluster import DistCoordinator
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device, set_seed
-from tqdm import tqdm
-
-from opensora.acceleration.checkpoint import set_grad_checkpoint
-from opensora.acceleration.parallel_states import (
- get_data_parallel_group,
- set_data_parallel_group,
- set_sequence_parallel_group,
-)
-from opensora.acceleration.plugin import ZeroSeqParallelPlugin
-from opensora.datasets import prepare_dataloader, prepare_variable_dataloader
-from opensora.registry import DATASETS, MODELS, SCHEDULERS, build_module
-from opensora.utils.ckpt_utils import create_logger, load, model_sharding, record_model_param_shape, save
-from opensora.utils.config_utils import (
- create_experiment_workspace,
- create_tensorboard_writer,
- parse_configs,
- save_training_config,
-)
-from opensora.utils.misc import all_reduce_mean, format_numel_str, get_model_numel, requires_grad, to_torch_dtype
-from opensora.utils.train_utils import MaskGenerator, update_ema
-from opensora.utils.device_utils import is_npu_available
-if is_npu_available():
- from torch_npu.contrib import transfer_to_npu
- torch.npu.config.allow_internal_format = False
-
-
-def main():
- # ======================================================
- # 1. args & cfg
- # ======================================================
- cfg = parse_configs(training=True)
- exp_name, exp_dir = create_experiment_workspace(cfg)
- save_training_config(cfg._cfg_dict, exp_dir)
-
- # ======================================================
- # 2. runtime variables & colossalai launch
- # ======================================================
- assert torch.cuda.is_available(), "Training currently requires at least one GPU."
- assert cfg.dtype in ["fp16", "bf16"], f"Unknown mixed precision {cfg.dtype}"
-
- # 2.1. colossalai init distributed training
- # we set a very large timeout to avoid some processes exit early
- dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
- torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
- set_seed(1024)
- coordinator = DistCoordinator()
- device = get_current_device()
- dtype = to_torch_dtype(cfg.dtype)
-
- # 2.2. init logger, tensorboard & wandb
- if not coordinator.is_master():
- logger = create_logger(None)
- else:
- print("Training configuration:")
- pprint(cfg._cfg_dict)
- logger = create_logger(exp_dir)
- logger.info(f"Experiment directory created at {exp_dir}")
-
- writer = create_tensorboard_writer(exp_dir)
- if cfg.wandb:
- wandb.init(project="minisora", name=exp_name, config=cfg._cfg_dict)
-
- # 2.3. initialize ColossalAI booster
- if cfg.plugin == "zero2":
- plugin = LowLevelZeroPlugin(
- stage=2,
- precision=cfg.dtype,
- initial_scale=2**16,
- max_norm=cfg.grad_clip,
- )
- set_data_parallel_group(dist.group.WORLD)
- elif cfg.plugin == "zero2-seq":
- plugin = ZeroSeqParallelPlugin(
- sp_size=cfg.sp_size,
- stage=2,
- precision=cfg.dtype,
- initial_scale=2**16,
- max_norm=cfg.grad_clip,
- )
- set_sequence_parallel_group(plugin.sp_group)
- set_data_parallel_group(plugin.dp_group)
- else:
- raise ValueError(f"Unknown plugin {cfg.plugin}")
- booster = Booster(plugin=plugin)
-
- # ======================================================
- # 3. build dataset and dataloader
- # ======================================================
- dataset = build_module(cfg.dataset, DATASETS)
- logger.info(f"Dataset contains {len(dataset)} samples.")
- dataloader_args = dict(
- dataset=dataset,
- batch_size=cfg.batch_size,
- num_workers=cfg.num_workers,
- seed=cfg.seed,
- shuffle=True,
- drop_last=True,
- pin_memory=True,
- process_group=get_data_parallel_group(),
- )
- # TODO: use plugin's prepare dataloader
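-    # bucket_config (when set) maps resolution -> {num_frames: (keep_probability, batch_size)},
-    # e.g. {"240p": {16: (1.0, 8)}} -- illustrative values only; rewrite_config() in
-    # scripts/misc/search_bs.py builds the same structure.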
- if cfg.bucket_config is None:
- dataloader = prepare_dataloader(**dataloader_args)
- else:
- dataloader = prepare_variable_dataloader(
- bucket_config=cfg.bucket_config,
- num_bucket_build_workers=cfg.num_bucket_build_workers,
- **dataloader_args,
- )
- if cfg.dataset.type == "VideoTextDataset":
- total_batch_size = cfg.batch_size * dist.get_world_size() // cfg.sp_size
- logger.info(f"Total batch size: {total_batch_size}")
-
- # ======================================================
- # 4. build model
- # ======================================================
- # 4.1. build model
- text_encoder = build_module(cfg.text_encoder, MODELS, device=device)
- vae = build_module(cfg.vae, MODELS)
- input_size = (dataset.num_frames, *dataset.image_size)
- latent_size = vae.get_latent_size(input_size)
- model = build_module(
- cfg.model,
- MODELS,
- input_size=latent_size,
- in_channels=vae.out_channels,
- caption_channels=text_encoder.output_dim,
- model_max_length=text_encoder.model_max_length,
- dtype=dtype,
- )
- model_numel, model_numel_trainable = get_model_numel(model)
- logger.info(
- f"Trainable model params: {format_numel_str(model_numel_trainable)}, Total model params: {format_numel_str(model_numel)}"
- )
-
- # 4.2. create ema
- ema = deepcopy(model).to(torch.float32).to(device)
- requires_grad(ema, False)
- ema_shape_dict = record_model_param_shape(ema)
-
- # 4.3. move to device
- vae = vae.to(device, dtype)
- model = model.to(device, dtype)
-
- # 4.4. build scheduler
- scheduler = build_module(cfg.scheduler, SCHEDULERS)
-
- # 4.5. setup optimizer
- if is_npu_available():
- from mindspeed.optimizer.adamw import AdamW
- optimizer = AdamW(
- filter(lambda p: p.requires_grad, model.parameters()),
- lr=cfg.lr,
- weight_decay=0
- )
- else:
- optimizer = HybridAdam(
- filter(lambda p: p.requires_grad, model.parameters()),
- lr=cfg.lr,
- weight_decay=0,
- adamw_mode=True,
- )
- lr_scheduler = None
-
- # 4.6. prepare for training
- if cfg.grad_checkpoint:
- set_grad_checkpoint(model)
- model.train()
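-    # update_ema with decay=0 copies the current model weights into the EMA model (initialization)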
- update_ema(ema, model, decay=0, sharded=False)
- ema.eval()
- if cfg.mask_ratios is not None:
- mask_generator = MaskGenerator(cfg.mask_ratios)
-
- # =======================================================
- # 5. boost model for distributed training with colossalai
- # =======================================================
- torch.set_default_dtype(dtype)
- model, optimizer, _, dataloader, lr_scheduler = booster.boost(
- model=model,
- optimizer=optimizer,
- lr_scheduler=lr_scheduler,
- dataloader=dataloader,
- )
- torch.set_default_dtype(torch.float)
- logger.info("Boost model for distributed training")
- if cfg.dataset.type == "VariableVideoTextDataset":
- num_steps_per_epoch = dataloader.batch_sampler.get_num_batch() // dist.get_world_size()
- else:
- num_steps_per_epoch = len(dataloader)
-
- # =======================================================
- # 6. training loop
- # =======================================================
- start_epoch = start_step = log_step = sampler_start_idx = acc_step = 0
- running_loss = 0.0
- sampler_to_io = dataloader.batch_sampler if cfg.dataset.type == "VariableVideoTextDataset" else None
- # 6.1. resume training
- if cfg.load is not None:
- logger.info("Loading checkpoint")
- ret = load(
- booster,
- model,
- ema,
- optimizer,
- lr_scheduler,
- cfg.load,
- sampler=sampler_to_io if not cfg.start_from_scratch else None,
- )
- if not cfg.start_from_scratch:
- start_epoch, start_step, sampler_start_idx = ret
- logger.info(f"Loaded checkpoint {cfg.load} at epoch {start_epoch} step {start_step}")
- logger.info(f"Training for {cfg.epochs} epochs with {num_steps_per_epoch} steps per epoch")
-
- if cfg.dataset.type == "VideoTextDataset":
- dataloader.sampler.set_start_index(sampler_start_idx)
- model_sharding(ema)
- early_stopping_flag = False
-
- # 6.2. training loop
- for epoch in range(start_epoch, cfg.epochs):
- if cfg.dataset.type == "VideoTextDataset":
- dataloader.sampler.set_epoch(epoch)
- dataloader_iter = iter(dataloader)
- logger.info(f"Beginning epoch {epoch}...")
-
- if early_stopping_flag:
- break
- with tqdm(
- enumerate(dataloader_iter, start=start_step),
- desc=f"Epoch {epoch}",
- disable=not coordinator.is_master(),
- initial=start_step,
- total=num_steps_per_epoch,
- ) as pbar:
- for step, batch in pbar:
- start_step_time = time.time()
- x = batch.pop("video").to(device, dtype) # [B, C, T, H, W]
- y = batch.pop("text")
- data_step_time = time.time()
-
- # Visual and text encoding
- with torch.no_grad():
- # Prepare visual inputs
- x = vae.encode(x) # [B, C, T, H/P, W/P]
- # Prepare text inputs
- model_args = text_encoder.encode(y)
-
- # Mask
- if cfg.mask_ratios is not None:
- mask = mask_generator.get_masks(x)
- model_args["x_mask"] = mask
- else:
- mask = None
-
- # Video info
- for k, v in batch.items():
- model_args[k] = v.to(device, dtype)
-
- # Diffusion
- t = torch.randint(0, scheduler.num_timesteps, (x.shape[0],), device=device)
- loss_dict = scheduler.training_losses(model, x, t, model_args, mask=mask)
-
- # Backward & update
- loss = loss_dict["loss"].mean()
- booster.backward(loss=loss, optimizer=optimizer)
- optimizer.step()
- optimizer.zero_grad()
-
- # Update EMA
- update_ema(ema, model.module, optimizer=optimizer)
-
- # Log loss values:
- all_reduce_mean(loss)
- running_loss += loss.item()
- global_step = epoch * num_steps_per_epoch + step
- log_step += 1
- acc_step += 1
-
- train_step_time = time.time()
-
- # Log to tensorboard
- if coordinator.is_master() and (global_step + 1) % cfg.log_every == 0:
- print(
- f"data time {data_step_time - start_step_time} | E2E train time {train_step_time - start_step_time}", flush=True)
-
- avg_loss = running_loss / log_step
- pbar.set_postfix({"loss": avg_loss, "step": step, "global_step": global_step})
- running_loss = 0
- log_step = 0
- writer.add_scalar("loss", loss.item(), global_step)
- if cfg.wandb:
- wandb.log(
- {
- "iter": global_step,
- "epoch": epoch,
- "loss": loss.item(),
- "avg_loss": avg_loss,
- "acc_step": acc_step,
- },
- step=global_step,
- )
-
- # Save checkpoint
- if cfg.ckpt_every > 0 and (global_step + 1) % cfg.ckpt_every == 0:
- save(
- booster,
- model,
- ema,
- optimizer,
- lr_scheduler,
- epoch,
- step + 1,
- global_step + 1,
- cfg.batch_size,
- coordinator,
- exp_dir,
- ema_shape_dict,
- sampler=sampler_to_io,
- )
- logger.info(
- f"Saved checkpoint at epoch {epoch} step {step + 1} global_step {global_step + 1} to {exp_dir}"
- )
- if cfg.max_train_steps > 0 and global_step == cfg.max_train_steps:
- early_stopping_flag = True
- break
-
-        # subsequent epochs are not resumed from a checkpoint, so reset the sampler start index and start step
- if cfg.dataset.type == "VideoTextDataset":
- dataloader.sampler.set_start_index(0)
- if cfg.dataset.type == "VariableVideoTextDataset":
- dataloader.batch_sampler.set_epoch(epoch + 1)
- print("Epoch done, recomputing batch sampler")
- start_step = 0
-
-
-if __name__ == "__main__":
- main()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/setup.py b/PyTorch/built-in/mm/OpenSora1.1/setup.py
deleted file mode 100644
index 096c109486f6e8ccc9e0152509969aa9a413bb22..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/setup.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-from typing import List
-import importlib
-
-from setuptools import find_packages, setup
-import torch
-
-
-def fetch_requirements(path) -> List[str]:
- """
- This function reads the requirements file.
-
- Args:
- path (str): the path to the requirements file.
-
- Returns:
- The lines in the requirements file.
- """
- with open(path, "r") as fd:
- return [r.strip() for r in fd.readlines()]
-
-
-def fetch_readme() -> str:
- """
- This function reads the README.md file in the current directory.
-
- Returns:
- The lines in the README file.
-        The content of the README file as a string.
- with open("README.md", encoding="utf-8") as f:
- return f.read()
-
-
-def is_npu_available():
-    "Checks if `torch_npu` is installed and potentially if an NPU is in the environment"
- if importlib.util.find_spec("torch") is None or importlib.util.find_spec("torch_npu") is None:
- return False
-
- import torch_npu
-
- try:
- # Will raise a RuntimeError if no NPU is found
- _ = torch.npu.device_count()
- return torch.npu.is_available()
- except RuntimeError:
- return False
-
-
-if is_npu_available():
- requirements_file = "requirements_npu.txt"
-else:
- requirements_file = "requirements.txt"
-
-setup(
- name="opensora",
- version="1.1.0",
- packages=find_packages(
- exclude=(
- "assets",
- "configs",
- "docs",
- "outputs",
- "pretrained_models",
- "scripts",
- "tests",
- "tools",
- "*.egg-info",
- )
- ),
- description="Democratizing Efficient Video Production for All",
- long_description=fetch_readme(),
- long_description_content_type="text/markdown",
- license="Apache Software License 2.0",
- install_requires=fetch_requirements(requirements_file),
- python_requires=">=3.6",
- classifiers=[
- "Programming Language :: Python :: 3",
- "License :: OSI Approved :: Apache Software License",
- "Environment :: GPU :: NVIDIA CUDA",
- "Topic :: Scientific/Engineering :: Artificial Intelligence",
- "Topic :: System :: Distributed Computing",
- ],
-)
diff --git a/PyTorch/built-in/mm/OpenSora1.1/test/env_npu.sh b/PyTorch/built-in/mm/OpenSora1.1/test/env_npu.sh
deleted file mode 100644
index 337088e0c729fa009604622fc32823130aa1c0f8..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/test/env_npu.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info'
-
-if [ -f $CANN_INSTALL_PATH_CONF ]; then
- CANN_INSTALL_PATH=$(cat $CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2)
-else
- CANN_INSTALL_PATH="/usr/local/Ascend"
-fi
-
-if [ -d ${CANN_INSTALL_PATH}/ascend-toolkit/latest ]; then
- source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh
-else
- source ${CANN_INSTALL_PATH}/nnae/set_env.sh
-fi
-
-count=$(npu-smi info -l | grep -c "NPU ID")
-
-for ((i=0; i<${count}; i=i+1))
-do
- msnpureport -g error -d ${i}
-done
-
-# Output host logs to the serial port, 0 - off / 1 - on
-export ASCEND_SLOG_PRINT_TO_STDOUT=0
-# Default log level, 0 - debug / 1 - info / 2 - warning / 3 - error
-export ASCEND_GLOBAL_LOG_LEVEL=3
-# Event log flag, 0 - off / 1 - on
-export ASCEND_GLOBAL_EVENT_ENABLE=0
-# Whether to enable taskque, 0 - off / 1 - on
-export TASK_QUEUE_ENABLE=1
-# Whether to enable the combined flag, 0 - off / 1 - on
-export COMBINED_ENABLE=1
-# HCCL whitelist switch, 1 - off / 0 - on
-export HCCL_WHITELIST_DISABLE=1
-export HCCL_IF_IP=$(hostname -I |awk '{print $1}')
-export HCCL_CONNECT_TIMEOUT=1200
-
-
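-# Build a colon-separated list of the Python site-packages lib directories (plus torch/lib);
-# presumably intended for extending the dynamic-library search path (here it is only echoed).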
-path_lib=$(python3 -c """
-import sys
-import re
-result=''
-for index in range(len(sys.path)):
- match_sit = re.search('-packages', sys.path[index])
- if match_sit is not None:
- match_lib = re.search('lib', sys.path[index])
-
- if match_lib is not None:
- end=match_lib.span()[1]
- result += sys.path[index][0:end] + ':'
-
- result+=sys.path[index] + '/torch/lib:'
-print(result)"""
-)
-
-echo ${path_lib}
diff --git a/PyTorch/built-in/mm/OpenSora1.1/test/infer_full_1p_opensorav1_1.sh b/PyTorch/built-in/mm/OpenSora1.1/test/infer_full_1p_opensorav1_1.sh
deleted file mode 100644
index 2ffe28c1f4e199291e913df5b82ddd6ed402c630..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/test/infer_full_1p_opensorav1_1.sh
+++ /dev/null
@@ -1,92 +0,0 @@
-# Path to the checkpoint produced by fine-tuning
-Network="OpenSora"
-BATCH_SIZE=1
-CKPT_PATH=''
-NUM_FRAMES=32
-IMAGE_SIZE_H=480
-IMAGE_SIZE_W=854
-
-for para in $*
-do
- if [[ $para == --batch_size* ]]; then
- BATCH_SIZE=$(echo ${para#*=})
- elif [[ $para == --ckpt_path* ]]; then
- CKPT_PATH=$(echo ${para#*=})
- elif [[ $para == --num_frames* ]]; then
- NUM_FRAMES=$(echo ${para#*=})
- elif [[ $para == --img_h* ]]; then
- IMAGE_SIZE_H=$(echo ${para#*=})
- elif [[ $para == --img_w* ]]; then
- IMAGE_SIZE_W=$(echo ${para#*=})
- fi
-done
-
-# cd to the directory at the same level as the test folder before running, for better compatibility; test_path_dir is the path containing the test folder
-cur_path=$(pwd)
-cur_path_last_dirname=${cur_path##*/}
-if [ x"${cur_path_last_dirname}" == x"test" ]; then
- test_path_dir=${cur_path}
- cd ..
- cur_path=$(pwd)
-else
- test_path_dir=${cur_path}/test
-fi
-
-source ${test_path_dir}/env_npu.sh
-
-ASCEND_DEVICE_ID=0
-# Create the DeviceID output directory; no modification needed
-if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
- rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
- mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
-else
- mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
-fi
-
-# Inference start time; no modification needed
-start_time=$(date +%s)
-echo "start_time: ${start_time}"
-
-python scripts/inference.py \
- configs/opensora-v1-1/inference/sample.py \
- --ckpt-path ${CKPT_PATH} \
- --batch-size ${BATCH_SIZE} \
- --prompt "A beautiful sunset over the city" \
- --num-frames ${NUM_FRAMES} \
- --image-size ${IMAGE_SIZE_H} ${IMAGE_SIZE_W} \
- > ${test_path_dir}/output/$ASCEND_DEVICE_ID/infer_${ASCEND_DEVICE_ID}.log 2>&1 &
-
-wait
-
-# Inference end time; no modification needed
-end_time=$(date +%s)
-e2e_time=$(($end_time - $start_time))
-
-
-# Test case information; no modification needed
-BatchSize=${BATCH_SIZE}
-DeviceType=$(uname -m)
-CaseName=${Network}_bs${BatchSize}_${WORLD_SIZE}'p'_'acc'
-
-# Print results; no modification needed
-echo "------------------ Final result ------------------"
-# Output performance FPS; adjust per model as needed
-FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/infer_${ASCEND_DEVICE_ID}.log|awk -F "FPS " '{print $2}' | tail -9 | awk '{a+=$1} END {if (NR != 0) printf("%.2f",a/NR)}'`
-# Print; no modification needed
-echo "E2E Training Duration sec : $e2e_time"
-
-
-# Performance monitoring summary
-# Collect performance data; no modification needed
-# Throughput
-ActualFPS=${FPS}
-
-
-# Write key information to ${CaseName}.log; no modification needed
-echo "Network = ${Network}" >${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "RankSize = ${WORLD_SIZE}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "BatchSize = ${BatchSize}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "DeviceType = ${DeviceType}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "CaseName = ${CaseName}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "ActualFPS = ${ActualFPS}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
-echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
\ No newline at end of file
diff --git a/PyTorch/built-in/mm/OpenSora1.1/test/train_full_1p_opensorav1_1.sh b/PyTorch/built-in/mm/OpenSora1.1/test/train_full_1p_opensorav1_1.sh
deleted file mode 100644
index f484d2904f25121c9d3e98dc709531e661d38d30..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/test/train_full_1p_opensorav1_1.sh
+++ /dev/null
@@ -1,62 +0,0 @@
-# Network name, same as the directory name; adjust per model as needed
-Network="OpenSora"
-data_path=""
-
-for para in $*; do
- if [[ $para == --data_path* ]]; then
- data_path=$(echo ${para#*=})
- fi
-done
-
-# cd to the directory at the same level as the test folder before running, for better compatibility; test_path_dir is the path containing the test folder
-cur_path=$(pwd)
-cur_path_last_dirname=${cur_path##*/}
-if [ x"${cur_path_last_dirname}" == x"test" ]; then
- test_path_dir=${cur_path}
- cd ..
- cur_path=$(pwd)
-else
- test_path_dir=${cur_path}/test
-fi
-
-source ${test_path_dir}/env_npu.sh
-
-ASCEND_DEVICE_ID=0
-# Create the DeviceID output directory; no modification needed
-if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
- rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
- mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
-else
- mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
-fi
-
-# Training start time; no modification needed
-start_time=$(date +%s)
-echo "start_time: ${start_time}"
-
-torchrun --nnodes=1 --nproc_per_node=1 --master-port 61888 scripts/train.py \
-configs/opensora-v1-1/train/stage1.py \
---data-path ${data_path} > ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${ASCEND_DEVICE_ID}.log 2>&1 &
-wait
-
-# Training end time; no modification needed
-end_time=$(date +%s)
-e2e_time=$(($end_time - $start_time))
-
-
-# Test case information; no modification needed
-DeviceType=$(uname -m)
-CaseName=${Network}_bs${BatchSize}_${WORLD_SIZE}'p'_'acc'
-
-# Print results; no modification needed
-echo "------------------ Final result ------------------"
-# Output performance FPS; adjust per model as needed
-echo "E2E Training Duration sec : $e2e_time"
-
-
-# Write key information to ${CaseName}.log; no modification needed
-echo "Network = ${Network}" >${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "RankSize = ${WORLD_SIZE}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "DeviceType = ${DeviceType}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "CaseName = ${CaseName}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
\ No newline at end of file
diff --git a/PyTorch/built-in/mm/OpenSora1.1/test/train_full_8p_opensorav1_1.sh b/PyTorch/built-in/mm/OpenSora1.1/test/train_full_8p_opensorav1_1.sh
deleted file mode 100644
index dbf090ef1ab6753d447efc79ab70030200cb9205..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/test/train_full_8p_opensorav1_1.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-# Network name, same as the directory name; adjust per model as needed
-Network="OpenSora"
-data_path=""
-
-for para in $*; do
- if [[ $para == --data_path* ]]; then
- data_path=$(echo ${para#*=})
- fi
-done
-
-# cd to the directory at the same level as the test folder before running, for better compatibility; test_path_dir is the path containing the test folder
-cur_path=$(pwd)
-cur_path_last_dirname=${cur_path##*/}
-if [ x"${cur_path_last_dirname}" == x"test" ]; then
- test_path_dir=${cur_path}
- cd ..
- cur_path=$(pwd)
-else
- test_path_dir=${cur_path}/test
-fi
-
-source ${test_path_dir}/env_npu.sh
-
-ASCEND_DEVICE_ID=0
-# Create the DeviceID output directory; no modification needed
-output_path=${cur_path}/test/output/${ASCEND_DEVICE_ID}
-
-mkdir -p ${output_path}
-
-# Training start time; no modification needed
-start_time=$(date +%s)
-echo "start_time: ${start_time}"
-
-torchrun --nnodes=1 --nproc_per_node=8 --master-port 61888 scripts/train.py \
-configs/opensora-v1-1/train/stage1.py \
---data-path ${data_path} > ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${ASCEND_DEVICE_ID}.log 2>&1 &
-wait
-
-# Training end time; no modification needed
-end_time=$(date +%s)
-e2e_time=$(($end_time - $start_time))
-
-
-# Test case information; no modification needed
-DeviceType=$(uname -m)
-CaseName=${Network}_bs${BatchSize}_${WORLD_SIZE}'p'_'acc'
-
-# Print results; no modification needed
-echo "------------------ Final result ------------------"
-# Output performance FPS; adjust per model as needed
-echo "E2E Training Duration sec : $e2e_time"
-
-
-# Write key information to ${CaseName}.log; no modification needed
-echo "Network = ${Network}" >${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "RankSize = ${WORLD_SIZE}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "DeviceType = ${DeviceType}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "CaseName = ${CaseName}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
\ No newline at end of file
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tests/test_attn.py b/PyTorch/built-in/mm/OpenSora1.1/tests/test_attn.py
deleted file mode 100644
index 48f8c43bf6f4e87862c18eb47f4fc0ba3e174533..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tests/test_attn.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import torch
-from colossalai.accelerator import get_accelerator
-from colossalai.utils import get_current_device
-from rotary_embedding_torch import RotaryEmbedding
-
-from opensora.models.layers.blocks import Attention
-
-# B, S, H = 7488, 1, 1152
-# B, S, H = 32, 234, 1152
-B, S, H = 128, 32, 1152
-N, D = 16, 72
-
-
-def run_attn(enable_flashattn: bool):
- get_accelerator().reset_peak_memory_stats()
- rope = RotaryEmbedding(D).to(device=get_current_device(), dtype=torch.bfloat16)
- attn = Attention(
- H,
- N,
- qkv_bias=True,
- rope=rope.rotate_queries_or_keys,
- enable_flashattn=enable_flashattn,
- ).to(device=get_current_device(), dtype=torch.bfloat16)
- x = torch.randn(B, S, H, device=get_current_device(), dtype=torch.bfloat16).requires_grad_()
- y = attn(x)
- y.mean().backward()
- print(f"Peak memory: {get_accelerator().max_memory_allocated() / 1024**2:.2f} MB")
-
-
-if __name__ == "__main__":
- print("Use flashattn")
- run_attn(True)
- print("No flashattn")
- run_attn(False)
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tests/test_np_torch.py b/PyTorch/built-in/mm/OpenSora1.1/tests/test_np_torch.py
deleted file mode 100644
index bc7fdb04d0de0d68da5c91123d8af2d46c1ba52a..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tests/test_np_torch.py
+++ /dev/null
@@ -1,346 +0,0 @@
-from typing import Callable
-
-import numpy as np
-import torch
-
-
-# ==================================
-# Warm Up Beta
-# ==================================
-def _warmup_beta_numpy(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
- betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
- warmup_time = int(num_diffusion_timesteps * warmup_frac)
- betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
- return betas
-
-
-def _warmup_beta_torch(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
- betas = beta_end * torch.ones(num_diffusion_timesteps, dtype=torch.float64)
- warmup_time = int(num_diffusion_timesteps * warmup_frac)
- betas[:warmup_time] = torch.linspace(beta_start, beta_end, warmup_time, dtype=torch.float64)
- return betas
-
-
-def test_warmup_beta():
- beta_start = 1e-6
- beta_end = 0.99
- num_diffusion_timesteps = 1000
- warmup_frac = 0.1
- betas_np = _warmup_beta_numpy(beta_start, beta_end, num_diffusion_timesteps, warmup_frac)
- betas_torch = _warmup_beta_torch(beta_start, beta_end, num_diffusion_timesteps, warmup_frac)
- assert np.allclose(betas_np, betas_torch.numpy())
- print("Test passed for warmup_beta()")
-
-
-# ==================================
-# Beta Schedule
-# ==================================
-
-
-def get_beta_schedule_numpy(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
- """
- This is the deprecated API for creating beta schedules.
- See get_named_beta_schedule() for the new library of schedules.
- """
- if beta_schedule == "quad":
- betas = (
- np.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_diffusion_timesteps,
- dtype=np.float64,
- )
- ** 2
- )
- elif beta_schedule == "linear":
- betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
- elif beta_schedule == "warmup10":
- betas = _warmup_beta_numpy(beta_start, beta_end, num_diffusion_timesteps, 0.1)
- elif beta_schedule == "warmup50":
- betas = _warmup_beta_numpy(beta_start, beta_end, num_diffusion_timesteps, 0.5)
- elif beta_schedule == "const":
- betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
- elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1
- betas = 1.0 / np.linspace(num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64)
- else:
- raise NotImplementedError(beta_schedule)
- assert betas.shape == (num_diffusion_timesteps,)
- return betas
-
-
-def get_beta_schedule_torch(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
- """
- This is the deprecated API for creating beta schedules.
- See get_named_beta_schedule() for the new library of schedules.
- """
- if beta_schedule == "quad":
- betas = (
-            torch.linspace(
-                beta_start**0.5,
-                beta_end**0.5,
-                num_diffusion_timesteps,
-                dtype=torch.float64,
- )
- ** 2
- )
- elif beta_schedule == "linear":
- betas = torch.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=torch.float64)
- elif beta_schedule == "warmup10":
- betas = _warmup_beta_torch(beta_start, beta_end, num_diffusion_timesteps, 0.1)
- elif beta_schedule == "warmup50":
- betas = _warmup_beta_torch(beta_start, beta_end, num_diffusion_timesteps, 0.5)
- elif beta_schedule == "const":
-        betas = beta_end * torch.ones(num_diffusion_timesteps, dtype=torch.float64)
- elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1
- betas = 1.0 / torch.linspace(num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=torch.float64)
- else:
- raise NotImplementedError(beta_schedule)
- assert betas.shape == (num_diffusion_timesteps,)
- return betas
-
-
-def test_get_beta_Schedule():
- beta_start = 1e-6
- beta_end = 0.99
- num_diffusion_timesteps = 1000
- beta_schedule = "linear"
- betas_np = get_beta_schedule_numpy(
- beta_schedule, beta_start=beta_start, beta_end=beta_end, num_diffusion_timesteps=num_diffusion_timesteps
- )
- betas_torch = get_beta_schedule_torch(
- beta_schedule, beta_start=beta_start, beta_end=beta_end, num_diffusion_timesteps=num_diffusion_timesteps
- )
- assert np.allclose(betas_np, betas_torch.numpy())
- print("Test passed for get_beta_schedule()")
-
-
-# ====================
-# Replace alpha
-# ====================
-def betas_for_alpha_bar_numpy(num_diffusion_timesteps: int, alpha_bar: Callable, max_beta: float = 0.999):
- """
- Create a beta schedule that discretizes the given alpha_t_bar function,
- which defines the cumulative product of (1-beta) over time from t = [0,1].
- :param num_diffusion_timesteps: the number of betas to produce.
- :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
- produces the cumulative product of (1-beta) up to that
- part of the diffusion process.
- :param max_beta: the maximum beta to use; use values lower than 1 to
- prevent singularities.
- """
- betas = []
- for i in range(num_diffusion_timesteps):
- t1 = i / num_diffusion_timesteps
- t2 = (i + 1) / num_diffusion_timesteps
- betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
- return np.array(betas)
-
-
-def betas_for_alpha_bar_torch(num_diffusion_timesteps: int, alpha_bar: Callable, max_beta: float = 0.999):
- """
- Create a beta schedule that discretizes the given alpha_t_bar function,
- which defines the cumulative product of (1-beta) over time from t = [0,1].
- :param num_diffusion_timesteps: the number of betas to produce.
- :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
- produces the cumulative product of (1-beta) up to that
- part of the diffusion process.
- :param max_beta: the maximum beta to use; use values lower than 1 to
- prevent singularities.
- """
- betas = []
- for i in range(num_diffusion_timesteps):
- t1 = i / num_diffusion_timesteps
- t2 = (i + 1) / num_diffusion_timesteps
- betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
- return torch.DoubleTensor(betas)
-
-
-def test_betas_for_alpha_bar():
- num_diffusion_timesteps = 1000
- alpha_bar = lambda t: 1 - t
- max_beta = 0.999
- betas_np = betas_for_alpha_bar_numpy(num_diffusion_timesteps, alpha_bar, max_beta)
- betas_torch = betas_for_alpha_bar_torch(num_diffusion_timesteps, alpha_bar, max_beta)
- assert np.allclose(betas_np, betas_torch.numpy())
- print("Test passed for betas_for_alpha_bar()")
-
-
-# =======================
-# Gaussian init
-# =======================
-def init_numpy(betas):
- # Use float64 for accuracy.
- betas = torch.DoubleTensor(betas)
- assert len(betas.shape) == 1, "betas must be 1-D"
- assert (betas > 0).all() and (betas <= 1).all()
-
- num_timesteps = int(betas.shape[0])
-
- alphas = 1.0 - betas
- alphas_cumprod = np.cumprod(alphas, axis=0)
- alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])
- alphas_cumprod_next = np.append(alphas_cumprod[1:], 0.0)
- assert alphas_cumprod_prev.shape == (num_timesteps,)
-
- # calculations for diffusion q(x_t | x_{t-1}) and others
- np.sqrt(alphas_cumprod)
- np.sqrt(1.0 - alphas_cumprod)
- np.log(1.0 - alphas_cumprod)
- np.sqrt(1.0 / alphas_cumprod)
- np.sqrt(1.0 / alphas_cumprod - 1)
-
- # calculations for posterior q(x_{t-1} | x_t, x_0)
- posterior_variance = betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
- # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
- posterior_log_variance_clipped = (
- np.log(np.append(posterior_variance[1], posterior_variance[1:]))
- if len(posterior_variance) > 1
- else np.array([])
- )
-
- posterior_mean_coef1 = betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)
- posterior_mean_coef2 = (1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod)
-
- return alphas_cumprod_prev, alphas_cumprod_next, posterior_mean_coef1, posterior_mean_coef2
-
-
-def gaussian_init_numpy(betas):
- # Use float64 for accuracy.
- betas = np.array(betas, dtype=np.float64)
- assert len(betas.shape) == 1, "betas must be 1-D"
- assert (betas > 0).all() and (betas <= 1).all()
-
- num_timesteps = int(betas.shape[0])
-
- alphas = 1.0 - betas
- alphas_cumprod = np.cumprod(alphas, axis=0)
- alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])
- alphas_cumprod_next = np.append(alphas_cumprod[1:], 0.0)
- assert alphas_cumprod_prev.shape == (num_timesteps,)
-
- # calculations for diffusion q(x_t | x_{t-1}) and others
- sqrt_alphas_cumprod = np.sqrt(alphas_cumprod)
- sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - alphas_cumprod)
- log_one_minus_alphas_cumprod = np.log(1.0 - alphas_cumprod)
- sqrt_recip_alphas_cumprod = np.sqrt(1.0 / alphas_cumprod)
- sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / alphas_cumprod - 1)
-
- # calculations for posterior q(x_{t-1} | x_t, x_0)
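-    # Standard DDPM posterior variance: beta_tilde_t = beta_t * (1 - alphabar_{t-1}) / (1 - alphabar_t)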
- posterior_variance = betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
- # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
- posterior_log_variance_clipped = (
- np.log(np.append(posterior_variance[1], posterior_variance[1:]))
- if len(posterior_variance) > 1
- else np.array([])
- )
-
- posterior_mean_coef1 = betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)
- posterior_mean_coef2 = (1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod)
-
- return (
- alphas_cumprod_prev,
- alphas_cumprod_next,
- sqrt_alphas_cumprod,
- sqrt_one_minus_alphas_cumprod,
- log_one_minus_alphas_cumprod,
- sqrt_recip_alphas_cumprod,
- sqrt_recipm1_alphas_cumprod,
- posterior_log_variance_clipped,
- posterior_mean_coef1,
- posterior_mean_coef2,
- )
-
-
-def gaussian_init_torch(betas):
- # Use float64 for accuracy.
- betas = torch.DoubleTensor(betas)
- assert len(betas.shape) == 1, "betas must be 1-D"
- assert (betas > 0).all() and (betas <= 1).all()
-
- num_timesteps = int(betas.shape[0])
-
- alphas = 1.0 - betas
- alphas_cumprod = torch.cumprod(alphas, axis=0)
- alphas_cumprod_prev = torch.cat([torch.tensor([1.0]), alphas_cumprod[:-1]])
- alphas_cumprod_next = torch.cat([alphas_cumprod[1:], torch.tensor([0.0])])
- assert alphas_cumprod_prev.shape == (num_timesteps,)
-
- # calculations for diffusion q(x_t | x_{t-1}) and others
- sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
- sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)
- log_one_minus_alphas_cumprod = torch.log(1.0 - alphas_cumprod)
- sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / alphas_cumprod)
- sqrt_recipm1_alphas_cumprod = torch.sqrt(1.0 / alphas_cumprod - 1)
-
- # calculations for posterior q(x_{t-1} | x_t, x_0)
- posterior_variance = betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
- # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
-
- posterior_log_variance_clipped = (
- torch.log(torch.cat([posterior_variance[1].unsqueeze(0), posterior_variance[1:]]))
- if len(posterior_variance) > 1
- else torch.tensor([])
- )
-
- posterior_mean_coef1 = betas * torch.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)
- posterior_mean_coef2 = (1.0 - alphas_cumprod_prev) * torch.sqrt(alphas) / (1.0 - alphas_cumprod)
-
- return (
- alphas_cumprod_prev,
- alphas_cumprod_next,
- sqrt_alphas_cumprod,
- sqrt_one_minus_alphas_cumprod,
- log_one_minus_alphas_cumprod,
- sqrt_recip_alphas_cumprod,
- sqrt_recipm1_alphas_cumprod,
- posterior_log_variance_clipped,
- posterior_mean_coef1,
- posterior_mean_coef2,
- )
-
-
-def test_gaussian_init():
- betas = np.linspace(1e-6, 0.99, 1000)
- (
- alphas_cumprod_prev,
- alphas_cumprod_next,
- sqrt_alphas_cumprod,
- sqrt_one_minus_alphas_cumprod,
- log_one_minus_alphas_cumprod,
- sqrt_recip_alphas_cumprod,
- sqrt_recipm1_alphas_cumprod,
- posterior_log_variance_clipped,
- posterior_mean_coef1,
- posterior_mean_coef2,
- ) = gaussian_init_numpy(betas)
- (
- alphas_cumprod_prev_t,
- alphas_cumprod_next_t,
- sqrt_alphas_cumprod_t,
- sqrt_one_minus_alphas_cumprod_t,
- log_one_minus_alphas_cumprod_t,
- sqrt_recip_alphas_cumprod_t,
- sqrt_recipm1_alphas_cumprod_t,
- posterior_log_variance_clipped_t,
- posterior_mean_coef1_t,
- posterior_mean_coef2_t,
- ) = gaussian_init_torch(betas)
-
- assert np.allclose(alphas_cumprod_prev, alphas_cumprod_prev_t.numpy())
- assert np.allclose(alphas_cumprod_next, alphas_cumprod_next_t.numpy())
- assert np.allclose(sqrt_alphas_cumprod, sqrt_alphas_cumprod_t.numpy())
- assert np.allclose(sqrt_one_minus_alphas_cumprod, sqrt_one_minus_alphas_cumprod_t.numpy())
- assert np.allclose(log_one_minus_alphas_cumprod, log_one_minus_alphas_cumprod_t.numpy())
- assert np.allclose(sqrt_recip_alphas_cumprod, sqrt_recip_alphas_cumprod_t.numpy())
- assert np.allclose(sqrt_recipm1_alphas_cumprod, sqrt_recipm1_alphas_cumprod_t.numpy())
- assert np.allclose(posterior_log_variance_clipped, posterior_log_variance_clipped_t.numpy())
- assert np.allclose(posterior_mean_coef1, posterior_mean_coef1_t.numpy())
- assert np.allclose(posterior_mean_coef2, posterior_mean_coef2_t.numpy())
- print("Test passed for gaussian_init()")
-
-
-if __name__ == "__main__":
- test_warmup_beta()
- test_get_beta_Schedule()
- test_betas_for_alpha_bar()
- test_gaussian_init()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tests/test_pos_emb.py b/PyTorch/built-in/mm/OpenSora1.1/tests/test_pos_emb.py
deleted file mode 100644
index c409acb1b992a364fa2ffb6dfc1b0341f250962d..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tests/test_pos_emb.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import pytest
-import torch
-
-from opensora.models.layers.blocks import PositionEmbedding2D, get_2d_sincos_pos_embed
-
-from torch.testing import assert_close
-
-D = 8
-SCALE = 2.0
-
-
-def get_spatial_pos_embed(x, hidden_size, h, w, scale, base_size=None):
- pos_embed = get_2d_sincos_pos_embed(
- hidden_size,
- (h, w),
- scale=scale,
- base_size=base_size,
- )
- pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
- return pos_embed.to(device=x.device, dtype=x.dtype)
-
-
-@pytest.mark.parametrize("dtype", [torch.float, torch.float16])
-@pytest.mark.parametrize("device", ["cpu", "cuda"])
-def test_pos_emb(dtype, device):
- # just a placeholder to get the device and dtype
- x = torch.empty(1, dtype=dtype, device=device)
- pos_embedder = PositionEmbedding2D(
- D,
- max_position_embeddings=8,
- scale=SCALE,
- ).to(device=device, dtype=dtype)
- output = pos_embedder(x, 8, 7)
- target = get_spatial_pos_embed(x, D, 8, 7, SCALE)
- assert_close(output, target)
- output = pos_embedder(x, 15, 16)
- target = get_spatial_pos_embed(x, D, 15, 16, SCALE)
- assert_close(output, target)
- output = pos_embedder(x, 30, 20, base_size=2)
- target = get_spatial_pos_embed(x, D, 30, 20, SCALE, base_size=2)
- assert_close(output, target)
- # test cache
- output = pos_embedder(x, 30, 20, base_size=2)
- target = get_spatial_pos_embed(x, D, 30, 20, SCALE, base_size=2)
- assert_close(output, target)
- assert pos_embedder._get_cached_emb.cache_info().hits >= 1
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tests/test_seq_parallel_attention.py b/PyTorch/built-in/mm/OpenSora1.1/tests/test_seq_parallel_attention.py
deleted file mode 100644
index 9cce2b245c406972088fe766fa2bfe88c5d81b65..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tests/test_seq_parallel_attention.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import colossalai
-import torch
-import torch.distributed as dist
-from colossalai.testing import spawn
-
-from opensora.acceleration.communications import gather_forward_split_backward, split_forward_gather_backward
-from opensora.acceleration.parallel_states import set_sequence_parallel_group
-from opensora.models.layers.blocks import (
- Attention,
- MultiHeadCrossAttention,
- SeqParallelAttention,
- SeqParallelMultiHeadCrossAttention,
-)
-
-
-def run_attention(rank, world_size):
- # create model
- torch.manual_seed(1024)
- set_sequence_parallel_group(dist.group.WORLD)
-
- seq_parallel_attention = SeqParallelAttention(dim=256, num_heads=4, qkv_bias=True, enable_flashattn=False).cuda()
-
- torch.manual_seed(1024)
- attention = Attention(
- dim=256,
- num_heads=4,
- qkv_bias=True,
- enable_flashattn=False,
- ).cuda()
-
- # create inputs
- torch.manual_seed(1024)
- x = torch.randn(4, 64, 256).cuda()
- seq_x = x.clone().detach()
-
- x.requires_grad = True
- x.retain_grad()
- seq_x.requires_grad = True
- seq_x.retain_grad()
-
- sub_seq_x = split_forward_gather_backward(seq_x, dist.group.WORLD, dim=1, grad_scale="down")
-
- # run model
- out = attention(x)
- sub_seq_out = seq_parallel_attention(sub_seq_x)
- seq_out = gather_forward_split_backward(sub_seq_out, dist.group.WORLD, dim=1, grad_scale="up")
-
- assert torch.allclose(seq_out, out, atol=1e-7), f"{seq_out}\nvs\n{out}"
-
- # run backward
- seq_out.mean().backward()
- out.mean().backward()
-
- # all reduce gradient for sp
- for p in seq_parallel_attention.parameters():
- if p.grad is not None:
- dist.all_reduce(p.grad, group=dist.group.WORLD)
- p.grad.div_(world_size)
-
- # check grad
- for p1, p2 in zip(seq_parallel_attention.parameters(), attention.parameters()):
- assert torch.allclose(p1.grad, p2.grad, atol=1e-7), f"{p1.grad}\nvs\n{p2.grad}"
-
- # check input grad
- assert torch.allclose(x.grad, seq_x.grad, atol=1e-7), f"{x.grad}\nvs\n{seq_x.grad}"
-
-
-def run_cross_attention(rank, world_size):
- # create model
- torch.manual_seed(1024)
- set_sequence_parallel_group(dist.group.WORLD)
- seq_parallel_attention = (
- SeqParallelMultiHeadCrossAttention(
- d_model=256,
- num_heads=4,
- )
- .cuda()
- .to(torch.bfloat16)
- )
-
- torch.manual_seed(1024)
- attention = (
- MultiHeadCrossAttention(
- d_model=256,
- num_heads=4,
- )
- .cuda()
- .to(torch.bfloat16)
- )
-
- # make sure the weights are the same
- for p1, p2 in zip(seq_parallel_attention.parameters(), attention.parameters()):
- p1.data.copy_(p2.data)
-
- # create inputs
- torch.manual_seed(1024)
- x = torch.randn(4, 64, 256).cuda().to(torch.bfloat16)
- y = torch.randn(4, 32, 256).cuda().to(torch.bfloat16)
-
- # masking is disabled in this test; set mask to per-sample lengths (e.g. [2, 10, 8, 16]) to exercise the masked path
- mask = None
- seq_x = x.clone().detach()
- seq_y = y.clone().detach()
-
- # set grad
- x.requires_grad = True
- x.retain_grad()
- seq_x.requires_grad = True
- seq_x.retain_grad()
- y.requires_grad = True
- y.retain_grad()
- seq_y.requires_grad = True
- seq_y.retain_grad()
-
- # split by sequence
- sub_seq_x = split_forward_gather_backward(seq_x, dist.group.WORLD, dim=1, grad_scale="down")
-
- # run model
- out = attention(x, y, mask)
- sub_seq_out = seq_parallel_attention(sub_seq_x, seq_y, mask)
- seq_out = gather_forward_split_backward(sub_seq_out, dist.group.WORLD, dim=1, grad_scale="up")
-
- assert torch.allclose(seq_out, out, rtol=1e-5, atol=1e-6), f"\n{seq_out}\nvs\n{out}"
-
- # run backward
- seq_out.mean().backward()
- out.mean().backward()
-
- # all reduce gradient for sp
- for name, p in seq_parallel_attention.named_parameters():
- if p.grad is not None:
- dist.all_reduce(p.grad, group=dist.group.WORLD)
- p.grad.div_(world_size)
- else:
- print(f"grad of {name} is None")
-
- # check grad
- for p1, p2 in zip(seq_parallel_attention.named_parameters(), attention.named_parameters()):
- assert torch.allclose(
- p1[1].grad, p2[1].grad, rtol=1e-3, atol=1e-4
- ), f"\n{p1[0]}\nvs\n{p2[0]}:\n{p1[1].grad}\nvs\n{p2[1].grad}"
-
- # check input grad
- assert torch.allclose(x.grad, seq_x.grad, atol=1e-7), f"{x.grad}\nvs\n{seq_x.grad}"
- assert torch.allclose(y.grad, seq_y.grad, atol=1e-7), f"{y.grad}\nvs\n{seq_y.grad}"
-
-
-def run_dist(rank, world_size, port):
- colossalai.launch({}, rank=rank, world_size=world_size, host="localhost", port=port)
- # run_attention(rank, world_size)
- run_cross_attention(rank, world_size)
-
-
-def test_seq_parallel_attention():
- spawn(run_dist, nprocs=2)
-
-
-if __name__ == "__main__":
- test_seq_parallel_attention()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tests/test_t5_shardformer.py b/PyTorch/built-in/mm/OpenSora1.1/tests/test_t5_shardformer.py
deleted file mode 100644
index 68040ab39e57d7b8508e7eb4c2d330d7492f30ea..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tests/test_t5_shardformer.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import time
-from copy import deepcopy
-
-import colossalai
-import torch
-from colossalai.shardformer import ShardConfig, ShardFormer
-from colossalai.testing import spawn
-
-from opensora.acceleration.shardformer.policy.t5_encoder import T5EncoderPolicy
-from opensora.models.text_encoder.t5 import T5Embedder
-
-
-def run_t5_encoder(rank, world_size, port):
- colossalai.launch({}, rank=rank, world_size=world_size, port=port, host="localhost")
-
- # t5 embedder
- t5_path = "./pretrained_models/t5_ckpts"
- hf_t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=t5_path, torch_dtype=torch.float)
- sf_t5 = deepcopy(hf_t5)
-
- # create huggingface model as normal
- shard_config = ShardConfig(
- tensor_parallel_process_group=None,
- pipeline_stage_manager=None,
- enable_tensor_parallelism=False,
- enable_fused_normalization=False,
- enable_flash_attention=False,
- enable_jit_fused=True,
- enable_sequence_parallelism=False,
- enable_sequence_overlap=False,
- )
- shard_former = ShardFormer(shard_config=shard_config)
- sharded_model, _ = shard_former.optimize(sf_t5.model, policy=T5EncoderPolicy())
- sf_t5.model = sharded_model
-
- # test t5 embedder
- texts = ["Who is the best player in the history of NBA?", "How to study computer science?"]
- for i in range(5):
- hf_embs, hf_masks = hf_t5.get_text_embeddings(texts)
- sf_embs, sf_masks = sf_t5.get_text_embeddings(texts)
-
- # check accuracy
- assert torch.allclose(hf_embs, sf_embs, rtol=1e-4, atol=1e-5), f"{hf_embs} \nvs\n{sf_embs}"
- assert torch.allclose(hf_masks, sf_masks), f"{hf_masks} \nvs\n{sf_masks}"
-
- # measure perf
- torch.cuda.synchronize()
- hf_start = time.time()
- for i in range(20):
- hf_embs, hf_masks = hf_t5.get_text_embeddings(texts)
- torch.cuda.synchronize()
- hf_end = time.time()
-
- # convert sf to fp16 and measure the shardformer model
- sf_t5.model = sf_t5.model.half()
- torch.cuda.synchronize()
- sf_start = time.time()
- for i in range(20):
- sf_embs, sf_masks = sf_t5.get_text_embeddings(texts)
- torch.cuda.synchronize()
- sf_end = time.time()
-
- print(f"[Performance] native: {hf_end - hf_start}s, shardformer: {sf_end - sf_start} s")
-
-
-def test_t5_encoder():
- spawn(run_t5_encoder)
-
-
-if __name__ == "__main__":
- test_t5_encoder()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/README.md b/PyTorch/built-in/mm/OpenSora1.1/tools/caption/README.md
deleted file mode 100644
index b30856cbd56ca96c630b0d96ba114235ea73a5d2..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/README.md
+++ /dev/null
@@ -1,95 +0,0 @@
-# Video Captioning
-
-Human labeling of videos is expensive and time-consuming, so we adopt powerful image captioning models to generate captions for videos. Although GPT-4V achieves better performance, its 20s/sample speed is too slow for us. LLaVA is the second-best open-source model on the [MMMU](https://mmmu-benchmark.github.io/) benchmark and accepts inputs of any resolution. We find the quality of the 34B model comparable.
-
-
-
-## LLaVA Captioning
-
-We extract three frames from each video for captioning. With batch inference, we can achieve a 10x speedup. At approximately 720p resolution with 3 frames, the speed is 2~3 videos/s on 8 GPUs. If we resize the smaller side to 336, the speed can reach 8 videos/s.
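-
-For reference, below is a minimal sketch of that frame-sampling step, assuming OpenCV and Pillow are installed; the helper name `sample_frames`, the relative positions (10%, 50%, 90%) and the 336px resize are illustrative and not the exact implementation used by the captioning script:
-
-```python
-import cv2
-from PIL import Image
-
-
-def sample_frames(path, points=(0.1, 0.5, 0.9), short_side=336):
-    """Grab frames at relative positions and shrink the smaller side to `short_side`."""
-    cap = cv2.VideoCapture(path)
-    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    frames = []
-    for p in points:
-        cap.set(cv2.CAP_PROP_POS_FRAMES, int(p * max(total - 1, 0)))
-        ok, frame = cap.read()
-        if not ok:
-            continue
-        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-        w, h = img.size
-        scale = short_side / min(w, h)
-        if scale < 1:  # only downscale frames larger than the target
-            img = img.resize((round(w * scale), round(h * scale)))
-        frames.append(img)
-    cap.release()
-    return frames
-```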
-
-### Requirement
-
-```bash
-# create conda env
-conda create -n llava python=3.10 -y
-conda activate llava
-
-# install torch
-pip install torch torchvision
-
-# clone llava
-git clone https://github.com/haotian-liu/LLaVA.git
-cd LLaVA
-# CAUTION: This line removes the torch dependency from pyproject.toml, which is:
-# "torch==2.1.2", "torchvision==0.16.2",
-# It is better to remove it manually in your local pyproject.toml
-sed -i '16d' pyproject.toml
-
-# install llava
-pip install --upgrade pip # enable PEP 660 support
-pip install -e .
-
-# install flash attention
-pip install flash-attn --no-build-isolation
-# install colossalai and decord
-pip install colossalai decord
-```
-
-Since only the 34B model's performance is comparable to GPT-4V, we only provide usage instructions for the 34B model. The 34B model is available [here](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b), or run our script and it will be downloaded automatically.
-
-### Usage
-
-Prepare a CSV file for processing. The CSV file can be generated by `convert_dataset.py` according to its [documentation](/tools/datasets/README.md). Then, run the following command to generate captions for videos/images with LLaVA:
-
-```bash
-# we run this on 8xH800 GPUs
-torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 4 --bs 16
-
-# at least two 80G GPUs are required
-torchrun --nproc_per_node 2 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 1 --bs 16
-
-# can also caption images
-torchrun --nproc_per_node 2 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 1 --bs 16 --prompt image-3ex
-
-# caption with llava-34B
-# NOTE: remember to enable flash attention for this model
-torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --dp-size 4 --tp-size 2 --model-path liuhaotian/llava-v1.6-34b --prompt image-3ex --flash-attention
-
-# caption with mistral-7B
-torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video
-# bs can be 48
-```
-
-Please note that you should add the `--flash-attention` flag when running Llama-based LLaVA models, as it provides a speedup, but turn it off for Mistral-based ones. The reasons can be found in [this issue](https://discuss.huggingface.co/t/flash-attention-has-no-effect-on-inference/73453).
-
-After running the script with `dp-size=N`, you will get `N` partial CSV files. Run the following command to merge them:
-
-```bash
-python -m tools.datasets.datautil DATA_caption_part*.csv --output DATA_caption.csv
-```
-
-### Resume
-
-Sometimes the process may be interrupted. We can resume the process by running the following command:
-
-```bash
-# merge generated results
-python -m tools.datasets.datautil DATA_caption_part*.csv --output DATA_caption.csv
-
-# get the remaining videos
-python -m tools.datasets.datautil DATA.csv --difference DATA_caption.csv --output DATA_remaining.csv
-```
-
-Then use the output csv file to resume the process.
-
-## GPT-4V Captioning
-
-Run the following command to generate captions for videos with GPT-4V:
-
-```bash
-# output: DATA_caption.csv
-python -m tools.caption.caption_gpt4 DATA.csv --key $OPENAI_API_KEY
-```
-
-The cost is approximately $0.01 per video (3 frames per video).
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/caption/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/acceleration/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/caption/acceleration/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/acceleration/llava/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/caption/acceleration/llava/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/acceleration/llava/policies/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/caption/acceleration/llava/policies/__init__.py
deleted file mode 100644
index 35998d404993d8c5073a3f6796c161402fdd26c4..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/acceleration/llava/policies/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .llama import LlavaLlamaForCausalLMPolicy
-from .mistral import LlavaMistralForCausalLMPolicy
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/acceleration/llava/policies/llama.py b/PyTorch/built-in/mm/OpenSora1.1/tools/caption/acceleration/llava/policies/llama.py
deleted file mode 100644
index dff8f01d68ffd672384f61f2b9c2ce0011b3e556..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/acceleration/llava/policies/llama.py
+++ /dev/null
@@ -1,98 +0,0 @@
-from typing import Dict, Union
-
-import torch.nn as nn
-from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row
-from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
-
-__all__ = ["LlavaLlamaPolicy", "LlavaLlamaForCausalLMPolicy"]
-
-
-class LlavaLlamaPolicy(Policy):
- def config_sanity_check(self):
- pass
-
- def preprocess(self):
- if self.shard_config.enable_tensor_parallelism:
- # Resize embedding (currently disabled); keep the lookups for the commented-out padding logic below
- vocab_size = self.model.config.vocab_size  # noqa: F841
- world_size = self.shard_config.tensor_parallel_size  # noqa: F841
-
- # if vocab_size % world_size != 0:
- # new_vocab_size = vocab_size + world_size - vocab_size % world_size
- # self.model.resize_token_embeddings(new_vocab_size)
-
- return self.model
-
- def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
- from transformers.models.llama.modeling_llama import LlamaDecoderLayer
-
- policy = {}
-
- if self.shard_config.enable_tensor_parallelism:
- decoder_attribute_replacement = {
- "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
- "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
- }
- if getattr(self.model.config, "num_key_value_heads", False):
- decoder_attribute_replacement["self_attn.num_key_value_heads"] = (
- self.model.config.num_key_value_heads // self.shard_config.tensor_parallel_size
- )
-
- policy[LlamaDecoderLayer] = ModulePolicyDescription(
- attribute_replacement=decoder_attribute_replacement,
- sub_module_replacement=[
- SubModuleReplacementDescription(
- suffix="self_attn.q_proj",
- target_module=Linear1D_Col,
- ),
- SubModuleReplacementDescription(
- suffix="self_attn.k_proj",
- target_module=Linear1D_Col,
- ),
- SubModuleReplacementDescription(
- suffix="self_attn.v_proj",
- target_module=Linear1D_Col,
- ),
- SubModuleReplacementDescription(
- suffix="self_attn.o_proj",
- target_module=Linear1D_Row,
- ),
- SubModuleReplacementDescription(
- suffix="mlp.gate_proj",
- target_module=Linear1D_Col,
- ),
- SubModuleReplacementDescription(
- suffix="mlp.up_proj",
- target_module=Linear1D_Col,
- ),
- SubModuleReplacementDescription(
- suffix="mlp.down_proj",
- target_module=Linear1D_Row,
- ),
- ],
- )
-
- return policy
-
- def postprocess(self):
- return self.model
-
-
-class LlavaLlamaForCausalLMPolicy(LlavaLlamaPolicy):
- def module_policy(self):
- from transformers import LlamaForCausalLM
-
- policy = super().module_policy()
- if self.shard_config.enable_tensor_parallelism:
- # add a new item for causal lm
- new_item = {
- LlamaForCausalLM: ModulePolicyDescription(
- sub_module_replacement=[
- SubModuleReplacementDescription(
- suffix="lm_head", target_module=Linear1D_Col, kwargs={"gather_output": True}
- )
- ],
- )
- }
- policy.update(new_item)
- return policy
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/acceleration/llava/policies/mistral.py b/PyTorch/built-in/mm/OpenSora1.1/tools/caption/acceleration/llava/policies/mistral.py
deleted file mode 100644
index 0afea570af861d170f4334529e14c43f9a32b542..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/acceleration/llava/policies/mistral.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import warnings
-from typing import Dict, Union
-
-import torch.nn as nn
-from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row, VocabParallelEmbedding1D
-from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
-
-__all__ = ["LlavaMistralPolicy", "LlavaMistralForCausalLMPolicy"]
-
-
-class LlavaMistralPolicy(Policy):
- def config_sanity_check(self):
- pass
-
- def preprocess(self):
- if self.shard_config.enable_tensor_parallelism:
- # Resize embedding
- vocab_size = self.model.config.vocab_size
- world_size = self.shard_config.tensor_parallel_size
-
- if vocab_size % world_size != 0:
- new_vocab_size = vocab_size + world_size - vocab_size % world_size
- self.model.resize_token_embeddings(new_vocab_size)
-
- return self.model
-
- def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
- from transformers.models.mistral.modeling_mistral import MistralDecoderLayer, MistralModel
-
- policy = {}
-
- if self.shard_config.enable_sequence_parallelism:
- self.shard_config.enable_sequence_parallelism = False
- warnings.warn(
- "Mistral doesn't support sequence parallelism now, will ignore the sequence parallelism flag."
- )
-
- if self.shard_config.enable_tensor_parallelism:
- decoder_attribute_replacement = {
- "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
- "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
- "self_attn.num_key_value_heads": self.model.config.num_key_value_heads
- // self.shard_config.tensor_parallel_size,
- }
-
- policy[MistralDecoderLayer] = ModulePolicyDescription(
- attribute_replacement=decoder_attribute_replacement,
- sub_module_replacement=[
- SubModuleReplacementDescription(
- suffix="self_attn.q_proj",
- target_module=Linear1D_Col,
- ),
- SubModuleReplacementDescription(
- suffix="self_attn.k_proj",
- target_module=Linear1D_Col,
- ),
- SubModuleReplacementDescription(
- suffix="self_attn.v_proj",
- target_module=Linear1D_Col,
- ),
- SubModuleReplacementDescription(
- suffix="self_attn.o_proj",
- target_module=Linear1D_Row,
- ),
- SubModuleReplacementDescription(
- suffix="mlp.gate_proj",
- target_module=Linear1D_Col,
- ),
- SubModuleReplacementDescription(
- suffix="mlp.up_proj",
- target_module=Linear1D_Col,
- ),
- SubModuleReplacementDescription(
- suffix="mlp.down_proj",
- target_module=Linear1D_Row,
- ),
- ],
- )
-
- self.append_or_create_submodule_replacement(
- description=SubModuleReplacementDescription(
- suffix="embed_tokens",
- target_module=VocabParallelEmbedding1D,
- ),
- policy=policy,
- target_key=MistralModel,
- )
-
- return policy
-
- def postprocess(self):
- return self.model
-
-
-class LlavaMistralForCausalLMPolicy(LlavaMistralPolicy):
- def module_policy(self):
- from transformers import MistralForCausalLM
-
- policy = super().module_policy()
-
- if self.shard_config.enable_tensor_parallelism:
- # add a new item for causal lm
- new_item = {
- MistralForCausalLM: ModulePolicyDescription(
- sub_module_replacement=[
- SubModuleReplacementDescription(
- suffix="lm_head", target_module=Linear1D_Col, kwargs=dict(gather_output=True)
- )
- ]
- )
- }
- policy.update(new_item)
- return policy
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/camera_motion_detect.py b/PyTorch/built-in/mm/OpenSora1.1/tools/caption/camera_motion_detect.py
deleted file mode 100644
index cc0077c65c254bf0f1a73b11883f9eccd681e792..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/camera_motion_detect.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# ref: https://github.com/antiboredom/camera-motion-detector
-
-import argparse
-
-import cv2
-import numpy as np
-import pandas as pd
-from tqdm import tqdm
-
-tqdm.pandas()
-
-
-def apply(df, func, **kwargs):
- if pandas_has_parallel:
- return df.parallel_apply(func, **kwargs)
- return df.progress_apply(func, **kwargs)
-
-
-try:
- from pandarallel import pandarallel
-
- pandarallel.initialize(progress_bar=True)
- pandas_has_parallel = True
-except ImportError:
- pandas_has_parallel = False
-
-
-def make_empty(new_w, new_h):
- empty = []
- for y in range(new_h):
- xvals = []
- for x in range(new_w):
- xvals.append([x, y])
- empty.append(xvals)
-
- empty = np.array(empty)
- return empty
-
-
-def get_type(mag, ang, zoom_in, tau_static=1.0, tau_zoom=(0.4, 0.6)):
- if mag < tau_static:
- return "static"
- if zoom_in < tau_zoom[0]:
- return "zoom out"
- if zoom_in > tau_zoom[1]:
- return "zoom in"
- if ang < 45 or ang >= 315:
- return "pan left"
- if 45 <= ang < 135:
- return "tilt up"
- if 135 <= ang < 225:
- return "pan right"
- if 225 <= ang < 315:
- return "tilt down"
- return "unknown"
-
-
-def get_video_type(frame_types):
- # count the number of each type
- counts = {}
- max_count = 0
- max_type = None
- for frame_type in frame_types:
- if frame_type not in counts:
- counts[frame_type] = 0
- counts[frame_type] += 1
- if counts[frame_type] > max_count:
- max_count = counts[frame_type]
- max_type = frame_type
- if max_count > len(frame_types) / 2:
- return max_type
- if "static" in counts:
- return "unknown"
- if "zoom in" not in counts and "zoom out" not in counts:
- return "pan/tilt"
- return "dynamic"
-
-
-def process(path: str, frame_interval=15) -> str:
- cap = cv2.VideoCapture(path)
- count = 0
- prvs = None
- frame_types = []
- while cap.isOpened():
- ret, frame = cap.read()
- if ret:
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
- if count == 0:
- prvs = frame
- h, w = frame.shape
- empty = make_empty(w, h)
- empty_dists = np.sqrt(
- np.square(empty.ravel()[::2] - (w / 2)) + np.square(empty.ravel()[1::2] - (h / 2))
- )
- else:
- flow = cv2.calcOpticalFlowFarneback(prvs, frame, None, 0.5, 3, 15, 3, 5, 1.2, 0)
- mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1], angleInDegrees=True)
- mean_mag = np.median(mag)
- mean_ang = np.median(ang)
-
- flow_coords = flow + empty
- xvals = flow_coords.ravel()[::2] - (w / 2)
- yvals = flow_coords.ravel()[1::2] - (h / 2)
- dists = np.sqrt(np.square(xvals) + np.square(yvals))
- dist_diff = dists >= empty_dists
- zoom_in_factor = np.count_nonzero(dist_diff) / len(dist_diff)
- frame_types.append(get_type(mean_mag, mean_ang, zoom_in_factor))
- count += frame_interval
- cap.set(cv2.CAP_PROP_POS_FRAMES, count)
- else:
- cap.release()
- break
- video_type = get_video_type(frame_types)
- return video_type
-
-
-def main(args):
- output_file = args.input.replace(".csv", "_cmotion.csv")
- data = pd.read_csv(args.input)
- data["cmotion"] = apply(data["path"], process)
- data.to_csv(output_file, index=False)
- print(f"Output saved to {output_file}")
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("input", type=str)
- parser.add_argument("--disable-parallel", action="store_true")
- args = parser.parse_args()
- if args.disable_parallel:
- pandas_has_parallel = False
- main(args)
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/caption_gpt4.py b/PyTorch/built-in/mm/OpenSora1.1/tools/caption/caption_gpt4.py
deleted file mode 100644
index f22c296ea5130a1ba2606bc885109c1851c56363..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/caption_gpt4.py
+++ /dev/null
@@ -1,91 +0,0 @@
-import argparse
-import base64
-import csv
-import os
-from io import BytesIO
-
-import requests
-import tqdm
-
-from .utils import IMG_EXTENSIONS, PROMPTS, VID_EXTENSIONS, VideoTextDataset
-
-
-def to_base64(image):
- buffer = BytesIO()
- image.save(buffer, format="JPEG")
- return base64.b64encode(buffer.getvalue()).decode("utf-8")
-
-
-def get_caption(frame, prompt, api_key):
- headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
- payload = {
- "model": "gpt-4-vision-preview",
- "messages": [
- {
- "role": "user",
- "content": [
- {
- "type": "text",
- "text": prompt,
- },
- {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[0]}"}},
- {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[1]}"}},
- {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[2]}"}},
- ],
- }
- ],
- "max_tokens": 300,
- }
- response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload, timeout=60)
- caption = response.json()["choices"][0]["message"]["content"]
- caption = caption.replace("\n", " ")
- return caption
-
-
-def main(args):
- # ======================================================
- # 1. read video list
- # ======================================================
- dataset = VideoTextDataset(args.input)
- output_file = os.path.splitext(args.input)[0] + "_caption.csv"
- f = open(output_file, "w")
- writer = csv.writer(f)
- writer.writerow(["video", "text"])
-
- # make sure that the prompt type matches the data type
- data_extension = "." + dataset.data["path"].iloc[0].split(".")[-1]
- prompt_type = PROMPTS[args.prompt]["type"]
- if prompt_type == "image":
- assert (
- data_extension.lower() in IMG_EXTENSIONS
- ), "The prompt is suitable for an image dataset but the data is not image."
- elif prompt_type == "video":
- assert (
- data_extension.lower() in VID_EXTENSIONS
- ), "The prompt is suitable for a video dataset but the data is not video."
- else:
- raise ValueError(f"Found invalid prompt type {prompt_type}")
-
- # ======================================================
- # 2. generate captions
- # ======================================================
- for sample in tqdm.tqdm(dataset):
- prompt = PROMPTS[args.prompt]["text"]
- if "text" in args.prompt:
- prompt = prompt.format(sample["text"])
- frames = sample["image"]
- frames = [to_base64(frame) for frame in frames]
- caption = get_caption(frames, prompt, args.key)
-
- writer.writerow((sample["path"], caption))
- f.close()
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("input", type=str, help="Path to the input CSV file")
- parser.add_argument("--prompt", type=str, default="video-f3-detail-3ex")
- parser.add_argument("--key", type=str)
- args = parser.parse_args()
-
- main(args)
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/caption_llava.py b/PyTorch/built-in/mm/OpenSora1.1/tools/caption/caption_llava.py
deleted file mode 100644
index 0134ec51ff13e847625616755f47cc0455c6d9a8..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/caption_llava.py
+++ /dev/null
@@ -1,344 +0,0 @@
-import argparse
-import csv
-import time
-import warnings
-from datetime import timedelta
-
-import torch
-import torch.distributed as dist
-from colossalai.cluster import DistCoordinator, ProcessGroupMesh
-from colossalai.shardformer import ShardConfig, ShardFormer
-from colossalai.utils import get_current_device, set_seed
-from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
-from llava.conversation import conv_templates
-from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
-from llava.model.builder import load_pretrained_model
-from llava.utils import disable_torch_init
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm
-
-from .acceleration.llava.policies import LlavaLlamaForCausalLMPolicy, LlavaMistralForCausalLMPolicy
-from .utils import IMG_EXTENSIONS, PROMPTS, VID_EXTENSIONS, Timer, VideoTextDataset, collate_fn
-
-disable_torch_init()
-
-
-class NoPaddingDistributedSampler(DistributedSampler):
- def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0, drop_last=False):
- super().__init__(
- dataset=dataset, num_replicas=num_replicas, rank=rank, seed=seed, shuffle=False, drop_last=False
- )
- remainder = len(self.dataset) % self.num_replicas
- if remainder > 0 and (self.rank + 1) - remainder <= 0:
- # if the dataset is not divisible by num_replicas
- # the remaining items will be allocated to the first n ranks
- self.num_samples = len(self.dataset) // self.num_replicas + 1
- else:
- self.num_samples = len(self.dataset) // self.num_replicas
- self.total_size = len(dataset)
-
- def __iter__(self):
- if self.shuffle:
- # deterministically shuffle based on epoch and seed
- g = torch.Generator()
- g.manual_seed(self.seed + self.epoch)
- indices = torch.randperm(len(self.dataset), generator=g).tolist() # type: ignore[arg-type]
- else:
- indices = list(range(len(self.dataset))) # type: ignore[arg-type]
-
- # no padding or truncation: total_size equals the dataset length, so all indices are kept
- indices = indices[: self.total_size]
-
- # subsample
- indices = indices[self.rank : self.total_size : self.num_replicas]
- assert len(indices) == self.num_samples
- return iter(indices)
-
-
-@torch.inference_mode()
-def main(args):
- # ======================================================
- # 1. init environment
- # ======================================================
- # we set a very large timeout to avoid some processes exiting early
- dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
- torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
- set_seed(1024)
- coordinator = DistCoordinator()
-
- # prepare the dp and tp groups
- assert (
- args.dp_size * args.tp_size == coordinator.world_size
- ), f"DP size {args.dp_size} * TP size {args.tp_size} must equal to world size {coordinator.world_size}"
- mesh = ProcessGroupMesh(args.dp_size, args.tp_size)
- dp_group = mesh.get_group_along_axis(0)
- tp_group = mesh.get_group_along_axis(1)
-
- # ======================================================
- # 2. load model
- # ======================================================
- model_path = args.model_path
- with warnings.catch_warnings():
- warnings.simplefilter("ignore")  # Pytorch non-meta copying warnings would fill up the console
- tokenizer, model, image_processor, context_len = load_pretrained_model(
- model_path=model_path,
- model_base=None,
- model_name=get_model_name_from_path(model_path),
- device=get_current_device(),
- torch_dtype=torch.float16,
- attn_implementation="flash_attention_2" if args.flash_attention else "eager",
- )
- dist.barrier()
-
- # ======================================================
- # 3. Apply system optimization
- # ======================================================
- tp_size = dist.get_world_size(tp_group)
- shard_config = ShardConfig(
- tensor_parallel_process_group=tp_group if tp_size > 1 else None,
- enable_tensor_parallelism=True if tp_size > 1 else False,
- )
- shard_former = ShardFormer(shard_config=shard_config)
-
- # check the model type
- model_name = model.__class__.__name__
- print(model_name)
- if model_name == "LlavaLlamaForCausalLM":
- model = shard_former.optimize(model, policy=LlavaLlamaForCausalLMPolicy())[0].cuda()
- elif model_name == "LlavaMistralForCausalLM":
- model = shard_former.optimize(model, policy=LlavaMistralForCausalLMPolicy())[0].cuda()
- else:
- print(f"The shardformer policy for {model_name} is not implemented, skip")
- torch.cuda.empty_cache()
-
- # ======================================================
- # 4. Prepare dataloader
- # ======================================================
- # prepare prompt
- query = PROMPTS[args.prompt]["text"]
- if dist.get_rank() == 0:
- print(f"Prompt: {query}")
-
- if "text" in args.prompt:
-
- def get_text_input_ids(text):
- conv = conv_templates["chatml_direct"].copy()
- query_text = query.format(text)
- conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + query_text)
- prompt = conv.get_prompt()
- # add num_frames images
- t = prompt.split(DEFAULT_IMAGE_TOKEN)
- prompt = t[0] + DEFAULT_IMAGE_TOKEN * args.num_frames + t[1]
- input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
- input_ids = input_ids.unsqueeze(0)
- return input_ids
-
- else:
- conv = conv_templates["chatml_direct"].copy()
- conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + query)
- prompt = conv.get_prompt()
- # add num_frames images
- t = prompt.split(DEFAULT_IMAGE_TOKEN)
- prompt = t[0] + DEFAULT_IMAGE_TOKEN * args.num_frames + t[1]
- input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
- input_ids = input_ids.unsqueeze(0)
-
- def get_text_input_ids(*args):
- return input_ids
-
- # build dataset
- def transform(imgs):
- imgs = process_images(imgs, image_processor, model.config)
- imgs = imgs.to(dtype=torch.float16)
- return imgs
-
- dataset = VideoTextDataset(
- args.input,
- transform=transform,
- num_frames=args.num_frames,
- get_text_input_ids=get_text_input_ids,
- resize=args.resize,
- )
-
- # make sure that the prompt type matches the data type
- data_extension = "." + dataset.data["path"].iloc[0].split(".")[-1]
- prompt_type = PROMPTS[args.prompt]["type"]
- if prompt_type == "image":
- assert (
- data_extension.lower() in IMG_EXTENSIONS
- ), f"The prompt is suitable for an image dataset but the data is not image. The first data is of format {data_extension}"
- elif prompt_type == "video":
- assert (
- data_extension.lower() in VID_EXTENSIONS
- ), f"The prompt is suitable for a video dataset but the data is not video. The first data is of format {data_extension}"
- else:
- raise ValueError(f"Found invalid prompt type {prompt_type}")
-
- total_num_videos = len(dataset)
-
- # build sampler
- dp_rank = dist.get_rank(dp_group)
- dp_size = dist.get_world_size(dp_group)
- sampler = NoPaddingDistributedSampler(dataset, rank=dp_rank, num_replicas=dp_size)
-
- # build dataloader
- dataloader = torch.utils.data.DataLoader(
- dataset,
- batch_size=args.bs,
- shuffle=False,
- num_workers=args.num_workers,
- pin_memory=True,
- prefetch_factor=args.prefetch_factor,
- sampler=sampler,
- collate_fn=collate_fn,
- )
-
- # prepare output file reader
- output_file = args.input.replace(".csv", "_caption.csv")
-
- # create csv writer
- has_dp_writter = dist.get_rank(tp_group) == 0
-
- if has_dp_writter:
- # the dp writer takes care of the files processed on the current dp rank
- # so we use write mode
- output_file_split = output_file.replace(".csv", f"_part{dp_rank}.csv")
- dp_file = open(output_file_split, "w")
- dp_writer = csv.writer(dp_file)
- dp_writer.writerow(["path", "text", "num_frames"])
-
- # ======================================================
- # 5. generate captions
- # ======================================================
- if dist.get_rank(tp_group) == 0:
- pbar = tqdm(dataloader, position=dp_rank, desc=f"Data Parallel Rank {dist.get_rank(dp_group)}")
- else:
- pbar = dataloader
-
- if args.profile:
- encode_time = []
- generate_time = []
- output_length = []
- total_time = []
-
- for i, batch in enumerate(pbar):
- # measure time
- if args.profile:
- torch.cuda.synchronize()
- start_time = time.time()
-
- video_files, frames, video_lengths, img_size_list, texts = batch
-
- # encode the batch of inputs
- with Timer() as encode_timer:
- samples = []
- for imgs, imgs_size, input_ids in zip(frames, img_size_list, texts):
- imgs = imgs.cuda()
- input_ids = input_ids.cuda()
- _, _, _, _, inputs_embeds, _ = model.prepare_inputs_labels_for_multimodal(
- input_ids, None, None, None, None, images=imgs, image_sizes=imgs_size
- )
- samples.append(inputs_embeds)
-
- # padding
- max_len = max([sample.shape[1] for sample in samples])
- attention_mask = torch.tensor(
- [[0] * (max_len - samples[i].shape[1]) + [1] * samples[i].shape[1] for i in range(len(samples))]
- ).to(model.device)
- inputs_embeds = [
- torch.cat(
- [
- torch.zeros(
- (1, max_len - samples[i].shape[1], samples[i].shape[-1]),
- device=model.device,
- dtype=torch.float16,
- ),
- samples[i],
- ],
- dim=1,
- )
- for i in range(len(samples))
- ]
- inputs_embeds = torch.cat(inputs_embeds, dim=0)
-
- # generate outputs
- with Timer() as generate_timer:
- output_ids = super(type(model), model).generate(
- inputs_embeds=inputs_embeds,
- attention_mask=attention_mask,
- do_sample=False, # sampling is not deterministic and may cause TP to hang
- max_new_tokens=args.max_tokens,
- use_cache=True,
- )
-
- # skip warmup and add profiling data
- if args.profile and i >= args.profile_warmup:
- output_length.append(output_ids.size(0) * output_ids.size(1))
-
- outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
- outputs = [output.replace("\n", " ").strip() for output in outputs]
-
- # skip warmup and add profiling data
- if args.profile and i >= args.profile_warmup:
- # measure time
- torch.cuda.synchronize()
- time_taken = time.time() - start_time
-
- total_time.append(time_taken)
- encode_time.append(encode_timer.time_taken)
- generate_time.append(generate_timer.time_taken)
-
- # save results
- if has_dp_writter:
- result = list(zip(video_files, outputs, video_lengths))
- for t in result:
- dp_writer.writerow(t)
-
- # display profiling info
- if args.profile:
- print(output_length)
- num_samples_after_warmup = total_num_videos - args.bs * args.profile_warmup * dp_size
- print(f"throughput (samples/s): {num_samples_after_warmup / sum(total_time)}")
- print(f"average encode time per sample: {sum(encode_time) / num_samples_after_warmup}")
- print(f"average generate time per sample: {sum(generate_time) / num_samples_after_warmup}")
- print(f"average number of tokens characters per sample: {sum(output_length) / num_samples_after_warmup}")
- print(f"Max GPU allocated / GB: {torch.cuda.max_memory_allocated() / 1024**3}")
- print(f"Max GPU reserved / GB: {torch.cuda.max_memory_reserved() / 1024**3}")
-
- # ======================================================
- # 6. shutdown
- # ======================================================
- # close file writing
- if has_dp_writter:
- dp_file.close()
- dist.barrier()
-
- # terminate distributed env
- dist.destroy_process_group()
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("input", type=str, help="Path to the input CSV file")
- parser.add_argument("--model-path", type=str, default="liuhaotian/llava-v1.6-34b")
- parser.add_argument("--prompt", type=str, default="video-f1-detail-3ex")
- parser.add_argument("--resize", type=int, default=336)
- parser.add_argument("--num-frames", type=int, default=1)
- parser.add_argument("--max-tokens", type=int, default=300)
- # speed related
- parser.add_argument("--bs", type=int, default=16)
- parser.add_argument("--tp-size", type=int, default=2)
- parser.add_argument("--dp-size", type=int, default=4)
- parser.add_argument("--num-workers", type=int, default=8)
- parser.add_argument("--prefetch-factor", type=int, default=8, help="Prefetch factor")
- parser.add_argument(
- "--flash-attention",
- action="store_true",
- help="Whether to use flash attention. You can turn on this flag for llama model and off for mistral model.",
- )
- # debug related
- parser.add_argument("--profile", action="store_true")
- parser.add_argument("--profile-warmup", type=int, default=1)
-
- args = parser.parse_args()
- main(args)
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/utils.py b/PyTorch/built-in/mm/OpenSora1.1/tools/caption/utils.py
deleted file mode 100644
index f50cdc271c036e8fdcb2c69f80fb39634ea94675..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/caption/utils.py
+++ /dev/null
@@ -1,140 +0,0 @@
-import time
-
-import pandas as pd
-import torch
-import torchvision.transforms as transforms
-from torchvision.datasets.folder import pil_loader
-
-from tools.datasets.utils import extract_frames, is_video
-
-PROMPTS = {
- "image": {
- "text": "Describe this image and its style to generate a succinct yet informative description. Pay attention to all objects in the image. The description should be useful for AI to re-generate the image. The description should be no more than five sentences. Remember do not exceed 5 sentences.",
- "type": "image",
- },
- "image-text": {
- "text": "Describe this image and its style in a very detailed manner. Pay attention to all objects in the image. The description should be useful for AI to re-generate the image. The description should be no more than six sentences. Some information about the image is '{}'.",
- "type": "image",
- },
- "image-3ex": {
- "text": "An image is given. Describe this image and its style to generate a succinct yet informative description. Pay attention to all objects in the image. The description should be useful for AI to re-generate the video. The description should be no more than five sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick and walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
- "type": "image",
- },
- "video": {
- "text": "Describe this video and its style in a very detailed manner. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences.",
- "type": "video",
- },
- "video-text": {
- "text": "Describe this video and its style in a very detailed manner. Some information about the image is '{}'. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences.",
- "type": "video",
- },
- "video-f1-detail-3ex": {
- "text": "A video is given by providing the middle frame. Describe this video and its style to generate a description. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
- "type": "video",
- },
- "video-f1-detail-2ex-text": {
- "text": "A video is given by providing the middle frame. Some information about the image is '{}'. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.",
- "type": "video",
- },
- "video-f3-detail-3ex": {
- "text": "A video is given by providing three frames in chronological order. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
- "type": "video",
- },
- "video-f3-detail-2ex-text": {
- "text": "A video is given by providing three frames in chronological order. Some information about the image is '{}'. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.",
- "type": "video",
- },
-}
-
-
-NUM_FRAMES_POINTS = {
- 1: (0.5,),
- 2: (0.25, 0.75),
- 3: (0.1, 0.5, 0.9),
-}
-
-
-def read_file(input_path):
- if input_path.endswith(".csv"):
- return pd.read_csv(input_path)
- elif input_path.endswith(".parquet"):
- return pd.read_parquet(input_path)
- else:
- raise NotImplementedError(f"Unsupported file format: {input_path}")
-
-
-class VideoTextDataset(torch.utils.data.Dataset):
- def __init__(self, csv_path, transform=None, num_frames=3, get_text_input_ids=None, resize=None):
- self.csv_path = csv_path
- self.transform = transform
- self.data = read_file(csv_path)
- self.points = NUM_FRAMES_POINTS[num_frames]
- self.get_text_input_ids = get_text_input_ids
- self.use_text = False
- self.resize_size = resize
- self.resize = transforms.Resize(resize, transforms.InterpolationMode.BICUBIC) if resize is not None else None
- if "text" in self.data.columns:
- self.use_text = True
-
- def getitem(self, index):
- sample = self.data.iloc[index]
- path = sample["path"]
- if not is_video(path):
- images = [pil_loader(path)]
- length = 1
- else:
- images, length = extract_frames(sample["path"], points=self.points, backend="opencv", return_length=True)
- if self.resize_size is not None:
- images_r = []
- for img in images:
- if img.size[0] > self.resize_size or img.size[1] > self.resize_size:
- img = self.resize(img)
- images_r.append(img)
- images = images_r
- imgs_size = [img.size for img in images]
- if self.transform is not None:
- images = self.transform(images)
-
- # we put images into a list as the pytorch dataloader does not accept PIL images directly
- out = dict(path=path, image=images, length=length, img_size=imgs_size)
- if self.get_text_input_ids is not None:
- if self.use_text:
- out["text"] = self.get_text_input_ids(sample["text"])
- else:
- out["text"] = self.get_text_input_ids()
- else:
- if self.use_text:
- out["text"] = sample["text"]
- else:
- out["text"] = ""
- return out
-
- def __len__(self):
- return len(self.data)
-
- def __getitem__(self, index):
- return self.getitem(index)
-
-
-def collate_fn(batch):
- paths = [item["path"] for item in batch]
- images = [item["image"] for item in batch]
- lengths = [item["length"] for item in batch]
- img_sizes = [item["img_size"] for item in batch]
- texts = [item["text"] for item in batch]
- return paths, images, lengths, img_sizes, texts
-
-
-class Timer:
- def __init__(self):
- self.time_taken = 0
- self.start_time = 0
- self.end_time = 0
-
- def __enter__(self):
- self.start_time = time.time()
- return self
-
- def __exit__(self, exc_type, exc_value, exc_tb):
- self.end_time = time.time()
- self.time_taken = self.end_time - self.start_time
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/README.md b/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/README.md
deleted file mode 100644
index ac14f3337696b8134428bd193502fd5933d23773..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/README.md
+++ /dev/null
@@ -1,281 +0,0 @@
-# Dataset Management
-
-- [Dataset Management](#dataset-management)
- - [Dataset Format](#dataset-format)
- - [Dataset to CSV](#dataset-to-csv)
- - [Manage datasets](#manage-datasets)
- - [Requirement](#requirement)
- - [Basic Usage](#basic-usage)
- - [Score filtering](#score-filtering)
- - [Documentation](#documentation)
- - [Transform datasets](#transform-datasets)
- - [Resize](#resize)
- - [Frame extraction](#frame-extraction)
- - [Crop Midjourney 4 grid](#crop-midjourney-4-grid)
- - [Analyze datasets](#analyze-datasets)
- - [Data Process Pipeline](#data-process-pipeline)
-
-After preparing the raw dataset according to the [instructions](/docs/datasets.md), you can use the following commands to manage the dataset.
-
-## Dataset Format
-
-All datasets should be provided as a `.csv` file (or `parquet.gzip` to save space), which is used for both training and data preprocessing. The columns should follow the definitions below:
-
-- `path`: the relative/absolute path or url to the image or video file. Required.
-- `text`: the caption or description of the image or video. Required for training.
-- `num_frames`: the number of frames in the video. Required for training.
-- `width`: the width of the video frame. Required for dynamic bucket.
-- `height`: the height of the video frame. Required for dynamic bucket.
-- `aspect_ratio`: the aspect ratio of the video frame (height / width). Required for dynamic bucket.
-- `resolution`: height x width. For analysis.
-- `text_len`: the number of tokens in the text. For analysis.
-- `aes`: aesthetic score calculated by [aesthetic scorer](/tools/aesthetic/README.md). For filtering.
-- `flow`: optical flow score calculated by [UniMatch](/tools/scoring/README.md). For filtering.
-- `match`: matching score of an image-text/video-text pair calculated by [CLIP](/tools/scoring/README.md). For filtering.
-- `fps`: the frame rate of the video. Optional.
-- `cmotion`: the camera motion.
-
-An example ready for training:
-
-```csv
-path, text, num_frames, width, height, aspect_ratio
-/absolute/path/to/image1.jpg, caption, 1, 720, 1280, 0.5625
-/absolute/path/to/video1.mp4, caption, 120, 720, 1280, 0.5625
-/absolute/path/to/video2.mp4, caption, 20, 256, 256, 1
-```
-
-We use pandas to manage the `.csv` or `.parquet` files. The following code is for reading and writing files:
-
-```python
-df = pd.read_csv(input_path)
-df.to_csv(output_path, index=False)
-# or use parquet, which is smaller
-df = pd.read_parquet(input_path)
-df.to_parquet(output_path, index=False)
-```
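-
-To save additional disk space, the same metadata can be written as gzip-compressed parquet (a minimal sketch using standard pandas options; the file name is illustrative):
-
-```python
-df.to_parquet("meta.parquet.gzip", compression="gzip", index=False)
-df = pd.read_parquet("meta.parquet.gzip")
-```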
-
-## Dataset to CSV
-
-As a starting point, `convert.py` converts a dataset to a CSV file. You can use the following commands:
-
-```bash
-python -m tools.datasets.convert DATASET-TYPE DATA_FOLDER
-
-# general video folder
-python -m tools.datasets.convert video VIDEO_FOLDER --output video.csv
-# general image folder
-python -m tools.datasets.convert image IMAGE_FOLDER --output image.csv
-# imagenet
-python -m tools.datasets.convert imagenet IMAGENET_FOLDER --split train
-# ucf101
-python -m tools.datasets.convert ucf101 UCF101_FOLDER --split videos
-# vidprom
-python -m tools.datasets.convert vidprom VIDPROM_FOLDER --info VidProM_semantic_unique.csv
-```
-
-## Manage datasets
-
-Use `datautil` to manage the dataset.
-
-### Requirement
-
-To accelerate processing speed, you can install [pandarallel](https://github.com/nalepae/pandarallel):
-
-```bash
-pip install pandarallel
-```
-
-To get image and video information, you need to install [opencv-python](https://github.com/opencv/opencv-python):
-
-```bash
-pip install opencv-python
-# If your videos are in av1 codec instead of h264, you need to
-# - install ffmpeg first
-# - install via conda to support av1 codec
-conda install -c conda-forge opencv
-```
-
-Or to get video information, you can install ffmpeg and ffmpeg-python:
-
-```bash
-pip install ffmpeg-python
-```
-
-To filter a specific language, you need to install [lingua](https://github.com/pemistahl/lingua-py):
-
-```bash
-pip install lingua-language-detector
-```
-
-### Basic Usage
-
-You can use the following commands to process the `csv` or `parquet` files. The output file is saved in the same directory as the input, with a suffix appended to the filename indicating the processing that was applied.
-
-```bash
-# datautil takes multiple CSV files as input and merges them into one CSV file
-# output: DATA1+DATA2.csv
-python -m tools.datasets.datautil DATA1.csv DATA2.csv
-
-# shard CSV files into multiple CSV files
-# output: DATA1_0.csv, DATA1_1.csv, ...
-python -m tools.datasets.datautil DATA1.csv --shard 10
-
-# keep samples whose number of frames is between 128 and 256
-# output: DATA_fmin128_fmax256.csv
-python -m tools.datasets.datautil DATA.csv --fmin 128 --fmax 256
-
-# Disable parallel processing
-python -m tools.datasets.datautil DATA.csv --fmin 128 --fmax 256 --disable-parallel
-
-# Compute num_frames, height, width, fps, aspect_ratio for videos or images
-# output: IMG_DATA+VID_DATA_vinfo.csv
-python -m tools.datasets.datautil IMG_DATA.csv VID_DATA.csv --video-info
-
-# You can run multiple operations at the same time.
-python -m tools.datasets.datautil DATA.csv --video-info --remove-empty-caption --remove-url --lang en
-```
-
-### Score filtering
-
-To examine and filter the quality of the dataset by aesthetic score and clip score, you can use the following commands:
-
-```bash
-# sort the dataset by aesthetic score
-# output: DATA_sort.csv
-python -m tools.datasets.datautil DATA.csv --sort aes
-# View examples of high aesthetic score
-head -n 10 DATA_sort.csv
-# View examples of low aesthetic score
-tail -n 10 DATA_sort.csv
-
-# sort the dataset by clip score
-# output: DATA_sort.csv
-python -m tools.datasets.datautil DATA.csv --sort match
-
-# filter the dataset by aesthetic score
-# output: DATA_aesmin0.5.csv
-python -m tools.datasets.datautil DATA.csv --aesmin 0.5
-# filter the dataset by clip score
-# output: DATA_matchmin0.5.csv
-python -m tools.datasets.datautil DATA.csv --matchmin 0.5
-```
-
-### Documentation
-
-You can also use `python -m tools.datasets.datautil --help` to see usage.
-
-| Args | File suffix | Description |
-| --------------------------- | -------------- | ------------------------------------------------------------- |
-| `--output OUTPUT` | | Output path |
-| `--format FORMAT` | | Output format (csv, parquet, parquet.gzip) |
-| `--disable-parallel` | | Disable `pandarallel` |
-| `--seed SEED` | | Random seed |
-| `--shard SHARD` | `_0`,`_1`, ... | Shard the dataset |
-| `--sort KEY`                | `_sort`        | Sort the dataset by KEY in descending order                    |
-| `--sort-ascending KEY`      | `_sort`        | Sort the dataset by KEY in ascending order                     |
-| `--difference DATA.csv` | | Remove the paths in DATA.csv from the dataset |
-| `--intersection DATA.csv` | | Keep the paths in DATA.csv from the dataset and merge columns |
-| `--info` | `_info` | Get the basic information of each video and image (cv2) |
-| `--ext` | `_ext` | Remove rows if the file does not exist |
-| `--relpath` | `_relpath` | Modify the path to relative path by root given |
-| `--abspath` | `_abspath` | Modify the path to absolute path by root given |
-| `--remove-empty-caption` | `_noempty` | Remove rows with empty caption |
-| `--remove-url` | `_nourl` | Remove rows with url in caption |
-| `--lang LANG` | `_lang` | Remove rows with other language |
-| `--remove-path-duplication` | `_noduppath` | Remove rows with duplicated path |
-| `--remove-text-duplication` | `_noduptext` | Remove rows with duplicated caption |
-| `--refine-llm-caption` | `_llm` | Modify the caption generated by LLM |
-| `--clean-caption`           | `_clean`       | Clean the caption following the T5 pipeline to suit training  |
-| `--unescape` | `_unescape` | Unescape the caption |
-| `--merge-cmotion` | `_cmotion` | Merge the camera motion to the caption |
-| `--count-num-token` | `_ntoken` | Count the number of tokens in the caption |
-| `--load-caption EXT` | `_load` | Load the caption from the file |
-| `--fmin FMIN` | `_fmin` | Filter the dataset by minimum number of frames |
-| `--fmax FMAX` | `_fmax` | Filter the dataset by maximum number of frames |
-| `--hwmax HWMAX` | `_hwmax` | Filter the dataset by maximum height x width |
-| `--aesmin AESMIN` | `_aesmin` | Filter the dataset by minimum aesthetic score |
-| `--matchmin MATCHMIN` | `_matchmin` | Filter the dataset by minimum clip score |
-| `--flowmin FLOWMIN` | `_flowmin` | Filter the dataset by minimum optical flow score |
-
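-Suffixes from multiple operations are appended in a fixed order, so combining flags composes the output file name. A quick sketch (the input name and thresholds are illustrative):
-
-```bash
-# output: DATA_info_noempty_fmin16.csv
-python -m tools.datasets.datautil DATA.csv --info --remove-empty-caption --fmin 16
-```
-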
-## Transform datasets
-
-The `tools.datasets.transform` module provides a set of tools to transform the dataset. The general usage is as follows:
-
-```bash
-python -m tools.datasets.transform TRANSFORM_TYPE META.csv ORIGINAL_DATA_FOLDER DATA_FOLDER_TO_SAVE_RESULTS --additional-args
-```
-
-### Resize
-
-Sometimes you may need to resize the images or videos to a specific resolution. You can use the following commands to resize the dataset:
-
-```bash
-python -m tools.datasets.transform meta.csv /path/to/raw/data /path/to/new/data --length 2160
-```
-
-### Frame extraction
-
-To extract frames from videos, you can use the following commands:
-
-```bash
-python -m tools.datasets.transform vid_frame_extract meta.csv /path/to/raw/data /path/to/new/data --points 0.1 0.5 0.9
-```
-
-### Crop Midjourney 4 grid
-
-Randomly select one of the four images from the 4-grid generated by Midjourney.
-
-```bash
-python -m tools.datasets.transform img_rand_crop meta.csv /path/to/raw/data /path/to/new/data
-```
-
-## Analyze datasets
-
-You can easily get basic information about a `.csv` dataset by using the following commands:
-
-```bash
-# examine the first 10 rows of the CSV file
-head -n 10 DATA1.csv
-# count the number of samples in the CSV file (approximately)
-wc -l DATA1.csv
-```
-
-For a dataset provided as a `.csv` or `.parquet` file, you can analyze it with the following command. Plots are saved automatically.
-
-```bash
-python -m tools.datasets.analyze DATA_info.csv
-```
-
-## Data Process Pipeline
-
-```bash
-# Suppose videos and images are under ~/dataset/
-# 1. Convert dataset to CSV
-python -m tools.datasets.convert video ~/dataset --output meta.csv
-
-# 2. Get video information
-python -m tools.datasets.datautil meta.csv --info --fmin 1
-
-# 3. Get caption
-# 3.1. generate caption
-torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava meta_info_fmin1.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video
-# merge generated results
-python -m tools.datasets.datautil meta_info_fmin1_caption_part*.csv --output meta_caption.csv
-# merge caption and info
-python -m tools.datasets.datautil meta_info_fmin1.csv --intersection meta_caption.csv --output meta_caption_info.csv
-# clean caption
-python -m tools.datasets.datautil meta_caption_info.csv --clean-caption --refine-llm-caption --remove-empty-caption --output meta_caption_processed.csv
-# 3.2. extract caption
-python -m tools.datasets.datautil meta_info_fmin1.csv --load-caption json --remove-empty-caption --clean-caption
-
-# 4. Scoring
-# aesthetic scoring
-torchrun --standalone --nproc_per_node 8 -m tools.scoring.aesthetic.inference meta_caption_processed.csv
-python -m tools.datasets.datautil meta_caption_processed_part*.csv --output meta_caption_processed_aes.csv
-# optical flow scoring
-torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference meta_caption_processed.csv
-# matching scoring
-torchrun --standalone --nproc_per_node 8 -m tools.scoring.matching.inference meta_caption_processed.csv
-# camera motion
-python -m tools.caption.camera_motion_detect meta_caption_processed.csv
-```
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/analyze.py b/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/analyze.py
deleted file mode 100644
index 7151689a4d309e5516f1a461fe4bec47dbff97e2..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/analyze.py
+++ /dev/null
@@ -1,96 +0,0 @@
-import argparse
-import os
-
-import matplotlib.pyplot as plt
-import pandas as pd
-
-
-def read_file(input_path):
- if input_path.endswith(".csv"):
- return pd.read_csv(input_path)
- elif input_path.endswith(".parquet"):
- return pd.read_parquet(input_path)
- else:
- raise NotImplementedError(f"Unsupported file format: {input_path}")
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument("input", type=str, help="Path to the input dataset")
- parser.add_argument("--save-img", type=str, default="samples/infos/", help="Path to save the image")
- return parser.parse_args()
-
-
-def plot_data(data, column, bins, name):
- plt.clf()
- data.hist(column=column, bins=bins)
- os.makedirs(os.path.dirname(name), exist_ok=True)
- plt.savefig(name)
- print(f"Saved {name}")
-
-
-def plot_categorical_data(data, column, name):
- plt.clf()
- data[column].value_counts().plot(kind="bar")
- os.makedirs(os.path.dirname(name), exist_ok=True)
- plt.savefig(name)
- print(f"Saved {name}")
-
-
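-# maps each column to its histogram bin count; None marks a categorical column plotted as a bar chart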
-COLUMNS = {
- "num_frames": 100,
- "resolution": 100,
- "text_len": 100,
- "aes": 100,
- "match": 100,
- "flow": 100,
- "cmotion": None,
-}
-
-
-def main(args):
- data = read_file(args.input)
-
- # === Image Data Info ===
- image_index = data["num_frames"] == 1
- if image_index.sum() > 0:
- print("=== Image Data Info ===")
- img_data = data[image_index]
- print(f"Number of images: {len(img_data)}")
- print(img_data.head())
- print(img_data.describe())
- if args.save_img:
- for column in COLUMNS:
- if column in img_data.columns and column not in ["num_frames", "cmotion"]:
- if COLUMNS[column] is None:
- plot_categorical_data(img_data, column, os.path.join(args.save_img, f"image_{column}.png"))
- else:
- plot_data(img_data, column, COLUMNS[column], os.path.join(args.save_img, f"image_{column}.png"))
-
- # === Video Data Info ===
- if not image_index.all():
- print("=== Video Data Info ===")
- video_data = data[~image_index]
- print(f"Number of videos: {len(video_data)}")
- if "num_frames" in video_data.columns:
- total_num_frames = video_data["num_frames"].sum()
- print(f"Number of frames: {total_num_frames}")
- DEFAULT_FPS = 30
- total_hours = total_num_frames / DEFAULT_FPS / 3600
- print(f"Total hours (30 FPS): {int(total_hours)}")
- print(video_data.head())
- print(video_data.describe())
- if args.save_img:
- for column in COLUMNS:
- if column in video_data.columns:
- if COLUMNS[column] is None:
- plot_categorical_data(video_data, column, os.path.join(args.save_img, f"video_{column}.png"))
- else:
- plot_data(
- video_data, column, COLUMNS[column], os.path.join(args.save_img, f"video_{column}.png")
- )
-
-
-if __name__ == "__main__":
- args = parse_args()
- main(args)
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/collate_msr_vtt_dataset.py b/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/collate_msr_vtt_dataset.py
deleted file mode 100644
index 69bd7669e1329725cf250df0f9e87a5a6e99dbaa..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/collate_msr_vtt_dataset.py
+++ /dev/null
@@ -1,183 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-import argparse
-import json
-import multiprocessing
-import os
-import shutil
-import warnings
-from typing import Dict, Tuple
-
-from tqdm import tqdm
-
-DEFAULT_TYPES = ["train", "val", "test"]
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument("-d", "--data-path", type=str, help="The path to the MSR-VTT dataset")
- parser.add_argument("-o", "--output-path", type=str, help="The output to the collated MSR-VTT dataset")
- return parser.parse_args()
-
-
-def get_annotations(root_path: str):
- """
- Get the annotation data from the MSR-VTT dataset. The annotations are in the format of:
-
- {
- "annotations": [
- {
- "image_id": "video1",
- "caption": "some caption text"
- }
- ]
- }
-
- Args:
- root_path (str): The root path to the MSR-VTT dataset
- """
- annotation_json_file = os.path.join(root_path, "annotation/MSR_VTT.json")
- with open(annotation_json_file, "r") as f:
- data = json.load(f)
- return data
-
-
-def get_video_list(root_path: str, dataset_type: str):
- """
- Get the list of videos in the dataset split.
-
- Args:
- root_path (str): The root path to the MSR-VTT dataset
- dataset_type (str): The dataset split type. It should be one of "train", "val", or "test"
- """
- assert dataset_type in DEFAULT_TYPES, f"Expected the dataset type to be in {DEFAULT_TYPES}, but got {dataset_type}"
- dataset_file_path = os.path.join(root_path, f"structured-symlinks/{dataset_type}_list_full.txt")
- with open(dataset_file_path, "r") as f:
- video_list = f.readlines()
- video_list = [x.strip() for x in video_list]
- return video_list
-
-
-def copy_video(video_id: str, root_path: str, output_path: str, dataset_type: str):
- """
- Copy the video from the source path to the destination path.
-
- Args:
- video_id (str): The video id
- root_path (str): The root path to the MSR-VTT dataset
- output_path (str): The output path to the collated MSR-VTT dataset
- dataset_type (str): The dataset split type. It should be one of "train", "val", or "test"
- """
- assert dataset_type in DEFAULT_TYPES, f"Expected the dataset type to be in {DEFAULT_TYPES}, but got {dataset_type}"
- src_file = os.path.join(root_path, f"videos/all/{video_id}.mp4")
- dst_folder = os.path.join(output_path, f"{dataset_type}/videos")
- dst_file = os.path.join(dst_folder, f"{video_id}.mp4")
- os.makedirs(dst_folder, exist_ok=True)
-
- # copy the video file if it has not been copied already
- assert os.path.isfile(src_file), f"Expected the source file {src_file} to exist"
- if not os.path.exists(dst_file):
- shutil.copy(src_file, dst_file)
-
-
-def get_annotation_file_path(output_path: str, dataset_type: str):
- file_path = os.path.join(output_path, f"{dataset_type}/annotations.json")
- return file_path
-
-
-def collate_annotation_files(
- annotations: Dict,
- root_path: str,
- output_path: str,
-):
- """
- Collate the video and caption data into a single folder.
-
- Args:
- annotations (Dict): The annotations data
- root_path (str): The root path to the MSR-VTT dataset
- output_path (str): The output path to the collated MSR-VTT dataset
- """
- # get all video list
- train_video_list = get_video_list(root_path, "train")
- val_video_list = get_video_list(root_path, "val")
- test_video_list = get_video_list(root_path, "test")
-
- # iterate over annotations
- collated_train_data = []
- collated_val_data = []
- collated_test_data = []
-
- print("Collating annotations files")
-
- for anno in tqdm(annotations["annotations"]):
- video_id = anno["image_id"]
- caption = anno["caption"]
-
- obj = {"file": f"{video_id}.mp4", "captions": [caption]}
-
- if video_id in train_video_list:
- collated_train_data.append(obj)
- elif video_id in val_video_list:
- collated_val_data.append(obj)
- elif video_id in test_video_list:
- collated_test_data.append(obj)
- else:
- warnings.warn(f"Video {video_id} not found in any of the dataset splits")
-
- def _save_caption_files(obj, dataset_type):
- dst_file = get_annotation_file_path(output_path, dataset_type)
- os.makedirs(os.path.dirname(dst_file), exist_ok=True)
- with open(dst_file, "w") as f:
- json.dump(obj, f, indent=4)
-
- _save_caption_files(collated_train_data, "train")
- _save_caption_files(collated_val_data, "val")
- _save_caption_files(collated_test_data, "test")
-
-
-def copy_file(path_pair: Tuple[str, str]):
- src_path, dst_path = path_pair
- shutil.copyfile(src_path, dst_path)
-
-
-def copy_videos(root_path: str, output_path: str, num_workers: int = 8):
- """
- Batch copy the video files to the output path.
-
- Args:
- root_path (str): The root path to the MSR-VTT dataset
- output_path (str): The output path to the collated MSR-VTT dataset
- num_workers (int): The number of workers to use for the copy operation
- """
- pool = multiprocessing.Pool(num_workers)
-
- for dataset_type in DEFAULT_TYPES:
- print(f"Copying videos for the {dataset_type} dataset")
- annotation_file_path = get_annotation_file_path(output_path, dataset_type)
- output_video_folder_path = os.path.join(output_path, f"{dataset_type}/videos")
- os.makedirs(output_video_folder_path, exist_ok=True)
-
- with open(annotation_file_path, "r") as f:
- annotation_data = json.load(f)
-
- video_ids = [obj["file"] for obj in annotation_data]
- unique_video_ids = list(set(video_ids))
-
- path_pairs = [
- (os.path.join(root_path, f"videos/all/{video_id}"), os.path.join(output_video_folder_path, video_id))
- for video_id in unique_video_ids
- ]
-
- for _ in tqdm(pool.imap_unordered(copy_file, path_pairs), total=len(path_pairs)):
- pass
-
-
-def main():
- args = parse_args()
- annotations = get_annotations(args.data_path)
- collate_annotation_files(annotations, args.data_path, args.output_path)
- copy_videos(args.data_path, args.output_path)
-
-
-if __name__ == "__main__":
- main()
\ No newline at end of file
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/convert.py b/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/convert.py
deleted file mode 100644
index ef6eee3ffa31cc7e0c2620ae826e69c8e8960631..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/convert.py
+++ /dev/null
@@ -1,135 +0,0 @@
-import argparse
-import os
-import time
-
-import pandas as pd
-from torchvision.datasets import ImageNet
-
-IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
-VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
-
-
-def scan_recursively(root):
- num = 0
- for entry in os.scandir(root):
- if entry.is_file():
- yield entry
- elif entry.is_dir():
- num += 1
- if num % 100 == 0:
- print(f"Scanned {num} directories.")
- yield from scan_recursively(entry.path)
-
-
-def get_filelist(file_path, exts=None):
- filelist = []
- time_start = time.time()
-
- # == OS Walk ==
- # for home, dirs, files in os.walk(file_path):
- # for filename in files:
- # ext = os.path.splitext(filename)[-1].lower()
- # if exts is None or ext in exts:
- # filelist.append(os.path.join(home, filename))
-
- # == Scandir ==
- obj = scan_recursively(file_path)
- for entry in obj:
- if entry.is_file():
- ext = os.path.splitext(entry.name)[-1].lower()
- if exts is None or ext in exts:
- filelist.append(entry.path)
-
- time_end = time.time()
- print(f"Scanned {len(filelist)} files in {time_end - time_start:.2f} seconds.")
- return filelist
-
-
-def split_by_capital(name):
- # BoxingPunchingBag -> Boxing Punching Bag
- new_name = ""
- for i in range(len(name)):
- if name[i].isupper() and i != 0:
- new_name += " "
- new_name += name[i]
- return new_name
-
-
-def process_imagenet(root, split):
- root = os.path.expanduser(root)
- data = ImageNet(root, split=split)
- samples = [(path, data.classes[label][0]) for path, label in data.samples]
- output = f"imagenet_{split}.csv"
-
- df = pd.DataFrame(samples, columns=["path", "text"])
- df.to_csv(output, index=False)
- print(f"Saved {len(samples)} samples to {output}.")
-
-
-def process_ucf101(root, split):
- root = os.path.expanduser(root)
- video_lists = get_filelist(os.path.join(root, split))
- classes = [x.split("/")[-2] for x in video_lists]
- classes = [split_by_capital(x) for x in classes]
- samples = list(zip(video_lists, classes))
- output = f"ucf101_{split}.csv"
-
- df = pd.DataFrame(samples, columns=["path", "text"])
- df.to_csv(output, index=False)
- print(f"Saved {len(samples)} samples to {output}.")
-
-
-def process_vidprom(root, info):
- root = os.path.expanduser(root)
- video_lists = get_filelist(root)
- video_set = set(video_lists)
- # read info csv
- infos = pd.read_csv(info)
- abs_path = infos["uuid"].apply(lambda x: os.path.join(root, f"pika-{x}.mp4"))
- is_exist = abs_path.apply(lambda x: x in video_set)
- df = pd.DataFrame(dict(path=abs_path[is_exist], text=infos["prompt"][is_exist]))
- df.to_csv("vidprom.csv", index=False)
- print(f"Saved {len(df)} samples to vidprom.csv.")
-
-
-def process_general_images(root, output):
- root = os.path.expanduser(root)
- image_lists = get_filelist(root, IMG_EXTENSIONS)
- df = pd.DataFrame(dict(path=image_lists))
- if output is None:
- output = "images.csv"
- df.to_csv(output, index=False)
- print(f"Saved {len(df)} samples to {output}.")
-
-
-def process_general_videos(root, output):
- root = os.path.expanduser(root)
- video_lists = get_filelist(root, VID_EXTENSIONS)
- df = pd.DataFrame(dict(path=video_lists))
- if output is None:
- output = "videos.csv"
- df.to_csv(output, index=False)
- print(f"Saved {len(df)} samples to {output}.")
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("dataset", type=str, choices=["imagenet", "ucf101", "vidprom", "image", "video"])
- parser.add_argument("root", type=str)
- parser.add_argument("--split", type=str, default="train")
- parser.add_argument("--info", type=str, default=None)
- parser.add_argument("--output", type=str, default=None)
- args = parser.parse_args()
-
- if args.dataset == "imagenet":
- process_imagenet(args.root, args.split)
- elif args.dataset == "ucf101":
- process_ucf101(args.root, args.split)
- elif args.dataset == "vidprom":
- process_vidprom(args.root, args.info)
- elif args.dataset == "image":
- process_general_images(args.root, args.output)
- elif args.dataset == "video":
- process_general_videos(args.root, args.output)
- else:
- raise ValueError("Invalid dataset")
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/convert_dataset.py b/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/convert_dataset.py
deleted file mode 100644
index 4ff904fc20cbe58e696c9d606fbfb871561ed45c..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/convert_dataset.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import argparse
-import csv
-import os
-
-from torchvision.datasets import ImageNet
-
-
-def get_filelist(file_path):
- Filelist = []
- for home, dirs, files in os.walk(file_path):
- for filename in files:
- Filelist.append(os.path.join(home, filename))
- return Filelist
-
-
-def split_by_capital(name):
- # BoxingPunchingBag -> Boxing Punching Bag
- new_name = ""
- for i in range(len(name)):
- if name[i].isupper() and i != 0:
- new_name += " "
- new_name += name[i]
- return new_name
-
-
-def process_imagenet(root, split):
- root = os.path.expanduser(root)
- data = ImageNet(root, split=split)
- samples = [(path, data.classes[label][0]) for path, label in data.samples]
- output = f"imagenet_{split}.csv"
-
- with open(output, "w") as f:
- writer = csv.writer(f)
- writer.writerows(samples)
-
- print(f"Saved {len(samples)} samples to {output}.")
-
-
-def process_ucf101(root, split):
- root = os.path.expanduser(root)
- video_lists = get_filelist(os.path.join(root, split))
- classes = [x.split("/")[-2] for x in video_lists]
- classes = [split_by_capital(x) for x in classes]
- samples = list(zip(video_lists, classes))
- output = f"ucf101_{split}.csv"
-
- with open(output, "w") as f:
- writer = csv.writer(f)
- writer.writerows(samples)
-
- print(f"Saved {len(samples)} samples to {output}.")
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("dataset", type=str, choices=["imagenet", "ucf101"])
- parser.add_argument("root", type=str)
- parser.add_argument("--split", type=str, default="train")
- args = parser.parse_args()
-
- if args.dataset == "imagenet":
- process_imagenet(args.root, args.split)
- elif args.dataset == "ucf101":
- process_ucf101(args.root, args.split)
- else:
- raise ValueError("Invalid dataset")
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/csvutil.py b/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/csvutil.py
deleted file mode 100644
index 4bbd22db24962ce2c66656445a043c35fbeed38b..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/csvutil.py
+++ /dev/null
@@ -1,96 +0,0 @@
-import argparse
-import csv
-import os
-
-from tqdm import tqdm
-
-# path, name, #frames
-PREFIX = [
- "The video shows",
- "The video captures",
- "The video features",
- "The video depicts",
- "The video presents",
- "The video features",
- "The video is ",
- "In the video,",
-]
-
-
-def get_video_length(path):
- import cv2
-
- cap = cv2.VideoCapture(path)
- return int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-
-def main(args):
- input_path = args.input
- output_path = args.output
- if output_path is None:
- name = os.path.basename(input_path)
- name, ext = os.path.splitext(name)
- if args.fmin is not None:
- name += f"_fmin_{args.fmin}"
- if args.fmax is not None:
- name += f"_fmax_{args.fmax}"
- if args.remove_empty_caption:
- name += "_rec"
- if args.remove_caption_prefix:
- name += "_rcp"
- if args.root is not None:
- name += "_root"
- if args.relength:
- name += "_relength"
- output_path = os.path.join(os.path.dirname(input_path), name + ext)
-
- with open(input_path, "r") as f:
- reader = csv.reader(f)
- data = list(reader)
- print("Number of videos before filtering:", len(data))
-
- data_new = []
- for i, row in tqdm(enumerate(data)):
- path = row[0]
- caption = row[1]
- n_frames = int(row[2])
- if args.fmin is not None and n_frames < args.fmin:
- continue
- if args.fmax is not None and n_frames > args.fmax:
- continue
- if args.remove_empty_caption and len(caption) == 0:
- continue
- if args.remove_caption_prefix:
- for prefix in PREFIX:
- if caption.startswith(prefix):
- caption = caption[len(prefix) :].strip()
- if caption[0].islower():
- caption = caption[0].upper() + caption[1:]
- row[1] = caption
- break
- if args.root is not None:
- row[0] = os.path.join(args.root, path)
- if args.relength:
- n_frames = get_video_length(row[0])
- row[2] = n_frames
- data_new.append(row)
-
- print("Number of videos after filtering:", len(data_new))
- with open(output_path, "w") as f:
- writer = csv.writer(f)
- writer.writerows(data_new)
- print("Output saved to", output_path)
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("input", type=str)
- parser.add_argument("--output", type=str, default=None)
- parser.add_argument("--fmin", type=int, default=None)
- parser.add_argument("--fmax", type=int, default=None)
- parser.add_argument("--root", type=str, default=None)
- parser.add_argument("--remove-empty-caption", action="store_true")
- parser.add_argument("--remove-caption-prefix", action="store_true")
- parser.add_argument("--relength", action="store_true")
- args = parser.parse_args()
- main(args)
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/datautil.py b/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/datautil.py
deleted file mode 100644
index 475b847258e384c8c82af65a9e41fe15641d26ad..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/datautil.py
+++ /dev/null
@@ -1,695 +0,0 @@
-import argparse
-import html
-import json
-import os
-import random
-import re
-from functools import partial
-from glob import glob
-
-import cv2
-import numpy as np
-import pandas as pd
-import torchvision
-from tqdm import tqdm
-
-from .utils import IMG_EXTENSIONS
-
-tqdm.pandas()
-
-try:
- from pandarallel import pandarallel
-
- PANDA_USE_PARALLEL = True
-except ImportError:
- PANDA_USE_PARALLEL = False
-
-
-def apply(df, func, **kwargs):
- if PANDA_USE_PARALLEL:
- return df.parallel_apply(func, **kwargs)
- return df.progress_apply(func, **kwargs)
-
-
-TRAIN_COLUMNS = ["path", "text", "num_frames", "fps", "height", "width", "aspect_ratio", "resolution", "text_len"]
-
-# ======================================================
-# --info
-# ======================================================
-
-
-def get_video_length(cap, method="header"):
- assert method in ["header", "set"]
- if method == "header":
- length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
- else:
- cap.set(cv2.CAP_PROP_POS_AVI_RATIO, 1)
- length = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
- return length
-
-
-def get_info(path):
- try:
- ext = os.path.splitext(path)[1].lower()
- if ext in IMG_EXTENSIONS:
- im = cv2.imread(path)
- if im is None:
- return 0, 0, 0, np.nan, np.nan, np.nan
- height, width = im.shape[:2]
- num_frames, fps = 1, np.nan
- else:
- cap = cv2.VideoCapture(path)
- num_frames, height, width, fps = (
- get_video_length(cap, method="header"),
- int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
- int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
- float(cap.get(cv2.CAP_PROP_FPS)),
- )
- hw = height * width
- aspect_ratio = height / width if width > 0 else np.nan
- return num_frames, height, width, aspect_ratio, fps, hw
- except:
- return 0, 0, 0, np.nan, np.nan, np.nan
-
-
-def get_video_info(path):
- try:
- vframes, _, _ = torchvision.io.read_video(filename=path, pts_unit="sec", output_format="TCHW")
- num_frames, height, width = vframes.shape[0], vframes.shape[2], vframes.shape[3]
- aspect_ratio = height / width
- fps = np.nan
- resolution = height * width
- return num_frames, height, width, aspect_ratio, fps, resolution
- except:
- return 0, 0, 0, np.nan, np.nan, np.nan
-
-
-# ======================================================
-# --refine-llm-caption
-# ======================================================
-
-LLAVA_PREFIX = [
- "The video shows",
- "The video captures",
- "The video features",
- "The video depicts",
- "The video presents",
- "The video features",
- "The video is ",
- "In the video,",
- "The image shows",
- "The image captures",
- "The image features",
- "The image depicts",
- "The image presents",
- "The image features",
- "The image is ",
- "The image portrays",
- "In the image,",
-]
-
-
-def remove_caption_prefix(caption):
- for prefix in LLAVA_PREFIX:
- if caption.startswith(prefix) or caption.startswith(prefix.lower()):
- caption = caption[len(prefix) :].strip()
- if caption[0].islower():
- caption = caption[0].upper() + caption[1:]
- return caption
- return caption
-
-
-# ======================================================
-# --merge-cmotion
-# ======================================================
-
-CMOTION_TEXT = {
- "static": "The camera is static.",
- "dynamic": "The camera is moving.",
- "unknown": None,
- "zoom in": "The camera is zooming in.",
- "zoom out": "The camera is zooming out.",
- "pan left": "The camera is panning left.",
- "pan right": "The camera is panning right.",
- "tilt up": "The camera is tilting up.",
- "tilt down": "The camera is tilting down.",
- "pan/tilt": "The camera is panning.",
-}
-CMOTION_PROBS = {
- # hard-coded probabilities
- "static": 1.0,
- "dynamic": 1.0,
- "unknown": 0.0,
- "zoom in": 1.0,
- "zoom out": 1.0,
- "pan left": 1.0,
- "pan right": 1.0,
- "tilt up": 1.0,
- "tilt down": 1.0,
- "pan/tilt": 1.0,
-}
-
-
-def merge_cmotion(caption, cmotion):
- text = CMOTION_TEXT[cmotion]
- prob = CMOTION_PROBS[cmotion]
- if text is not None and random.random() < prob:
- caption = f"{caption} {text}"
- return caption
-
-
-# ======================================================
-# --lang
-# ======================================================
-
-
-def build_lang_detector(lang_to_detect):
- from lingua import Language, LanguageDetectorBuilder
-
- lang_dict = dict(en=Language.ENGLISH)
- assert lang_to_detect in lang_dict
- valid_lang = lang_dict[lang_to_detect]
- detector = LanguageDetectorBuilder.from_all_spoken_languages().with_low_accuracy_mode().build()
-
- def detect_lang(caption):
- confidence_values = detector.compute_language_confidence_values(caption)
- confidence = [x.language for x in confidence_values[:5]]
- if valid_lang not in confidence:
- return False
- return True
-
- return detect_lang
-
-
-# ======================================================
-# --clean-caption
-# ======================================================
-
-
-def basic_clean(text):
- import ftfy
-
- text = ftfy.fix_text(text)
- text = html.unescape(html.unescape(text))
- return text.strip()
-
-
-BAD_PUNCT_REGEX = re.compile(
- r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
-) # noqa
-
-
-def clean_caption(caption):
- import urllib.parse as ul
-
- from bs4 import BeautifulSoup
-
- caption = str(caption)
- caption = ul.unquote_plus(caption)
- caption = caption.strip().lower()
- caption = re.sub("<person>", "person", caption)
- # urls:
- caption = re.sub(
- r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
- "",
- caption,
- ) # regex for urls
- caption = re.sub(
- r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
- "",
- caption,
- ) # regex for urls
- # html:
- caption = BeautifulSoup(caption, features="html.parser").text
-
- # @
- caption = re.sub(r"@[\w\d]+\b", "", caption)
-
- # 31C0—31EF CJK Strokes
- # 31F0—31FF Katakana Phonetic Extensions
- # 3200—32FF Enclosed CJK Letters and Months
- # 3300—33FF CJK Compatibility
- # 3400—4DBF CJK Unified Ideographs Extension A
- # 4DC0—4DFF Yijing Hexagram Symbols
- # 4E00—9FFF CJK Unified Ideographs
- caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
- caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
- caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
- caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
- caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
- caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
- caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
- #######################################################
-
- # все виды тире / all types of dash --> "-"
- caption = re.sub(
- r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
- "-",
- caption,
- )
-
- # кавычки к одному стандарту
- caption = re.sub(r"[`´«»“”¨]", '"', caption)
- caption = re.sub(r"[‘’]", "'", caption)
-
- # &quot;
- caption = re.sub(r"&quot;?", "", caption)
- # &amp
- caption = re.sub(r"&amp", "", caption)
-
- # ip addresses:
- caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
-
- # article ids:
- caption = re.sub(r"\d:\d\d\s+$", "", caption)
-
- # \n
- caption = re.sub(r"\\n", " ", caption)
-
- # "#123"
- caption = re.sub(r"#\d{1,3}\b", "", caption)
- # "#12345.."
- caption = re.sub(r"#\d{5,}\b", "", caption)
- # "123456.."
- caption = re.sub(r"\b\d{6,}\b", "", caption)
- # filenames:
- caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
-
- #
- caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
- caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
-
- caption = re.sub(BAD_PUNCT_REGEX, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
- caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
-
- # this-is-my-cute-cat / this_is_my_cute_cat
- regex2 = re.compile(r"(?:\-|\_)")
- if len(re.findall(regex2, caption)) > 3:
- caption = re.sub(regex2, " ", caption)
-
- caption = basic_clean(caption)
-
- caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640
- caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc
- caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231
-
- caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
- caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
- caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
- caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
- caption = re.sub(r"\bpage\s+\d+\b", "", caption)
-
- caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
-
- caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
-
- caption = re.sub(r"\b\s+\:\s+", r": ", caption)
- caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
- caption = re.sub(r"\s+", " ", caption)
-
- caption = caption.strip()
-
- caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
- caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
- caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
- caption = re.sub(r"^\.\S+$", "", caption)
-
- return caption.strip()
-
-
-def text_preprocessing(text, use_text_preprocessing: bool = True):
- if use_text_preprocessing:
- # The exact text cleaning as was in the training stage:
- text = clean_caption(text)
- text = clean_caption(text)
- return text
- else:
- return text.lower().strip()
-
-
-# ======================================================
-# load caption
-# ======================================================
-
-
-def load_caption(path, ext):
- try:
- assert ext in ["json"]
- json_path = path.split(".")[0] + ".json"
- with open(json_path, "r") as f:
- data = json.load(f)
- caption = data["caption"]
- return caption
- except:
- return ""
-
-
-# ======================================================
-# read & write
-# ======================================================
-
-
-def read_file(input_path):
- if input_path.endswith(".csv"):
- return pd.read_csv(input_path)
- elif input_path.endswith(".parquet"):
- return pd.read_parquet(input_path)
- else:
- raise NotImplementedError(f"Unsupported file format: {input_path}")
-
-
-def save_file(data, output_path):
- output_dir = os.path.dirname(output_path)
- if not os.path.exists(output_dir) and output_dir != "":
- os.makedirs(output_dir)
- if output_path.endswith(".csv"):
- return data.to_csv(output_path, index=False)
- elif output_path.endswith(".parquet"):
- return data.to_parquet(output_path, index=False)
- else:
- raise NotImplementedError(f"Unsupported file format: {output_path}")
-
-
-def read_data(input_paths):
- data = []
- input_name = ""
- input_list = []
- for input_path in input_paths:
- input_list.extend(glob(input_path))
- print("Input files:", input_list)
- for i, input_path in enumerate(input_list):
- assert os.path.exists(input_path)
- data.append(read_file(input_path))
- input_name += os.path.basename(input_path).split(".")[0]
- if i != len(input_list) - 1:
- input_name += "+"
- print(f"Loaded {len(data[-1])} samples from {input_path}.")
- data = pd.concat(data, ignore_index=True, sort=False)
- print(f"Total number of samples: {len(data)}.")
- return data, input_name
-
-
-# ======================================================
-# main
-# ======================================================
-# To add a new method, register it in the main, parse_args, and get_output_path functions, and update the doc at /tools/datasets/README.md#documentation
-
-
-def main(args):
- # reading data
- data, input_name = read_data(args.input)
-
- # make difference
- if args.difference is not None:
- data_diff = pd.read_csv(args.difference)
- print(f"Difference csv contains {len(data_diff)} samples.")
- data = data[~data["path"].isin(data_diff["path"])]
- input_name += f"-{os.path.basename(args.difference).split('.')[0]}"
- print(f"Filtered number of samples: {len(data)}.")
-
- # make intersection
- if args.intersection is not None:
- data_new = pd.read_csv(args.intersection)
- print(f"Intersection csv contains {len(data_new)} samples.")
- cols_to_use = data_new.columns.difference(data.columns)
- cols_to_use = cols_to_use.insert(0, "path")
- data = pd.merge(data, data_new[cols_to_use], on="path", how="inner")
- print(f"Intersection number of samples: {len(data)}.")
-
- # train columns
- if args.train_column:
- all_columns = data.columns
- columns_to_drop = all_columns.difference(TRAIN_COLUMNS)
- data = data.drop(columns=columns_to_drop)
-
- # get output path
- output_path = get_output_path(args, input_name)
-
- # preparation
- if args.lang is not None:
- detect_lang = build_lang_detector(args.lang)
- if args.count_num_token == "t5":
- from transformers import AutoTokenizer
-
- tokenizer = AutoTokenizer.from_pretrained("DeepFloyd/t5-v1_1-xxl")
-
- # IO-related
- if args.load_caption is not None:
- assert "path" in data.columns
- data["text"] = apply(data["path"], load_caption, ext=args.load_caption)
- if args.info:
- info = apply(data["path"], get_info)
- (
- data["num_frames"],
- data["height"],
- data["width"],
- data["aspect_ratio"],
- data["fps"],
- data["resolution"],
- ) = zip(*info)
- if args.video_info:
- info = apply(data["path"], get_video_info)
- (
- data["num_frames"],
- data["height"],
- data["width"],
- data["aspect_ratio"],
- data["fps"],
- data["resolution"],
- ) = zip(*info)
- if args.ext:
- assert "path" in data.columns
- data = data[apply(data["path"], os.path.exists)]
-
- # filtering
- if args.remove_url:
- assert "text" in data.columns
- data = data[~data["text"].str.contains(r"(?P<url>https?://[^\s]+)", regex=True)]
- if args.lang is not None:
- assert "text" in data.columns
- data = data[data["text"].progress_apply(detect_lang)] # cannot parallelize
- if args.remove_empty_caption:
- assert "text" in data.columns
- data = data[data["text"].str.len() > 0]
- data = data[~data["text"].isna()]
- if args.remove_path_duplication:
- assert "path" in data.columns
- data = data.drop_duplicates(subset=["path"])
-
- # processing
- if args.relpath is not None:
- data["path"] = apply(data["path"], lambda x: os.path.relpath(x, args.relpath))
- if args.abspath is not None:
- data["path"] = apply(data["path"], lambda x: os.path.join(args.abspath, x))
- if args.merge_cmotion:
- data["text"] = apply(data, lambda x: merge_cmotion(x["text"], x["cmotion"]), axis=1)
- if args.refine_llm_caption:
- assert "text" in data.columns
- data["text"] = apply(data["text"], remove_caption_prefix)
- if args.clean_caption:
- assert "text" in data.columns
- data["text"] = apply(
- data["text"],
- partial(text_preprocessing, use_text_preprocessing=True),
- )
-
- if args.count_num_token is not None:
- assert "text" in data.columns
- data["text_len"] = apply(data["text"], lambda x: len(tokenizer(x)["input_ids"]))
-
- # sort
- if args.sort is not None:
- data = data.sort_values(by=args.sort, ascending=False)
- if args.sort_ascending is not None:
- data = data.sort_values(by=args.sort_ascending, ascending=True)
-
- # filtering
- if args.remove_empty_caption:
- assert "text" in data.columns
- data = data[data["text"].str.len() > 0]
- data = data[~data["text"].isna()]
- if args.fmin is not None:
- assert "num_frames" in data.columns
- data = data[data["num_frames"] >= args.fmin]
- if args.fmax is not None:
- assert "num_frames" in data.columns
- data = data[data["num_frames"] <= args.fmax]
- if args.hwmax is not None:
- if "resolution" not in data.columns:
- height = data["height"]
- width = data["width"]
- data["resolution"] = height * width
- data = data[data["resolution"] <= args.hwmax]
- if args.aesmin is not None:
- assert "aes" in data.columns
- data = data[data["aes"] >= args.aesmin]
- if args.matchmin is not None:
- assert "match" in data.columns
- data = data[data["match"] >= args.matchmin]
- if args.flowmin is not None:
- assert "flow" in data.columns
- data = data[data["flow"] >= args.flowmin]
- if args.remove_text_duplication:
- data = data.drop_duplicates(subset=["text"], keep="first")
- print(f"Filtered number of samples: {len(data)}.")
-
- # shard data
- if args.shard is not None:
- sharded_data = np.array_split(data, args.shard)
- for i in range(args.shard):
- output_path_part = output_path.split(".")
- output_path_s = ".".join(output_path_part[:-1]) + f"_{i}." + output_path_part[-1]
- save_file(sharded_data[i], output_path_s)
- print(f"Saved {len(sharded_data[i])} samples to {output_path_s}.")
- else:
- save_file(data, output_path)
- print(f"Saved {len(data)} samples to {output_path}.")
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument("input", type=str, nargs="+", help="path to the input dataset")
- parser.add_argument("--output", type=str, default=None, help="output path")
- parser.add_argument("--format", type=str, default="csv", help="output format", choices=["csv", "parquet"])
- parser.add_argument("--disable-parallel", action="store_true", help="disable parallel processing")
- parser.add_argument("--num-workers", type=int, default=None, help="number of workers")
- parser.add_argument("--seed", type=int, default=None, help="random seed")
-
- # special case
- parser.add_argument("--shard", type=int, default=None, help="shard the dataset")
- parser.add_argument("--sort", type=str, default=None, help="sort by column")
- parser.add_argument("--sort-ascending", type=str, default=None, help="sort by column (ascending order)")
- parser.add_argument("--difference", type=str, default=None, help="get difference from the dataset")
- parser.add_argument(
- "--intersection", type=str, default=None, help="keep the paths in csv from the dataset and merge columns"
- )
- parser.add_argument("--train-column", action="store_true", help="only keep the train column")
-
- # IO-related
- parser.add_argument("--info", action="store_true", help="get the basic information of each video and image")
- parser.add_argument("--video-info", action="store_true", help="get the basic information of each video")
- parser.add_argument("--ext", action="store_true", help="check if the file exists")
- parser.add_argument(
- "--load-caption", type=str, default=None, choices=["json", "txt"], help="load the caption from json or txt"
- )
-
- # path processing
- parser.add_argument("--relpath", type=str, default=None, help="modify the path to relative path by root given")
- parser.add_argument("--abspath", type=str, default=None, help="modify the path to absolute path by root given")
-
- # caption filtering
- parser.add_argument(
- "--remove-empty-caption",
- action="store_true",
- help="remove rows with empty caption",
- )
- parser.add_argument("--remove-url", action="store_true", help="remove rows with url in caption")
- parser.add_argument("--lang", type=str, default=None, help="remove rows with other language")
- parser.add_argument("--remove-path-duplication", action="store_true", help="remove rows with duplicated path")
- parser.add_argument("--remove-text-duplication", action="store_true", help="remove rows with duplicated caption")
-
- # caption processing
- parser.add_argument("--refine-llm-caption", action="store_true", help="modify the caption generated by LLM")
- parser.add_argument(
- "--clean-caption", action="store_true", help="modify the caption according to T5 pipeline to suit training"
- )
- parser.add_argument("--merge-cmotion", action="store_true", help="merge the camera motion to the caption")
- parser.add_argument(
- "--count-num-token", type=str, choices=["t5"], default=None, help="Count the number of tokens in the caption"
- )
-
- # score filtering
- parser.add_argument("--fmin", type=int, default=None, help="filter the dataset by minimum number of frames")
- parser.add_argument("--fmax", type=int, default=None, help="filter the dataset by maximum number of frames")
- parser.add_argument("--hwmax", type=int, default=None, help="filter the dataset by maximum resolution")
- parser.add_argument("--aesmin", type=float, default=None, help="filter the dataset by minimum aes score")
- parser.add_argument("--matchmin", type=float, default=None, help="filter the dataset by minimum match score")
- parser.add_argument("--flowmin", type=float, default=None, help="filter the dataset by minimum flow score")
-
- return parser.parse_args()
-
-
-def get_output_path(args, input_name):
- if args.output is not None:
- return args.output
- name = input_name
- dir_path = os.path.dirname(args.input[0])
-
- # sort
- if args.sort is not None:
- assert args.sort_ascending is None
- name += "_sort"
- if args.sort_ascending is not None:
- assert args.sort is None
- name += "_sort"
-
- # IO-related
- # for IO-related, the function must be wrapped in try-except
- if args.info:
- name += "_info"
- if args.video_info:
- name += "_vinfo"
- if args.ext:
- name += "_ext"
- if args.load_caption:
- name += f"_load{args.load_caption}"
-
- # path processing
- if args.relpath is not None:
- name += "_relpath"
- if args.abspath is not None:
- name += "_abspath"
-
- # caption filtering
- if args.remove_empty_caption:
- name += "_noempty"
- if args.remove_url:
- name += "_nourl"
- if args.lang is not None:
- name += f"_{args.lang}"
- if args.remove_path_duplication:
- name += "_noduppath"
- if args.remove_text_duplication:
- name += "_noduptext"
-
- # caption processing
- if args.refine_llm_caption:
- name += "_llm"
- if args.clean_caption:
- name += "_clean"
- if args.merge_cmotion:
- name += "_cmcaption"
- if args.count_num_token:
- name += "_ntoken"
-
- # score filtering
- if args.fmin is not None:
- name += f"_fmin{args.fmin}"
- if args.fmax is not None:
- name += f"_fmax{args.fmax}"
- if args.hwmax is not None:
- name += f"_hwmax{args.hwmax}"
- if args.aesmin is not None:
- name += f"_aesmin{args.aesmin}"
- if args.matchmin is not None:
- name += f"_matchmin{args.matchmin}"
- if args.flowmin is not None:
- name += f"_flowmin{args.flowmin}"
-
- output_path = os.path.join(dir_path, f"{name}.{args.format}")
- return output_path
-
-
-if __name__ == "__main__":
- args = parse_args()
- if args.disable_parallel:
- PANDA_USE_PARALLEL = False
- if PANDA_USE_PARALLEL:
- if args.num_workers is not None:
- pandarallel.initialize(nb_workers=args.num_workers, progress_bar=True)
- else:
- pandarallel.initialize(progress_bar=True)
- if args.seed is not None:
- random.seed(args.seed)
- np.random.seed(args.seed)
- main(args)
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/filter_panda10m.py b/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/filter_panda10m.py
deleted file mode 100644
index 0544f2c3613be04bcec8f0892e4abac5b0ced9a6..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/filter_panda10m.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# TODO: remove this file before releasing
-
-import argparse
-import os
-import pandas as pd
-import json
-import html
-from tqdm import tqdm
-import re
-
-tqdm.pandas()
-
-try:
- from pandarallel import pandarallel
-
- pandarallel.initialize(progress_bar=True)
- pandas_has_parallel = True
-except ImportError:
- pandas_has_parallel = False
-
-
-def apply(df, func, **kwargs):
- if pandas_has_parallel:
- return df.parallel_apply(func, **kwargs)
- return df.progress_apply(func, **kwargs)
-
-
-def basic_clean(text):
- import ftfy
-
- text = ftfy.fix_text(text)
- text = html.unescape(html.unescape(text))
- return text.strip()
-
-
-BAD_PUNCT_REGEX = re.compile(
- r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
-) # noqa
-
-
-def clean_caption(caption):
- import urllib.parse as ul
-
- from bs4 import BeautifulSoup
-
- caption = str(caption)
- caption = ul.unquote_plus(caption)
- caption = caption.strip().lower()
- caption = re.sub("<person>", "person", caption)
- # urls:
- caption = re.sub(
- r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
- "",
- caption,
- ) # regex for urls
- caption = re.sub(
- r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
- "",
- caption,
- ) # regex for urls
- # html:
- caption = BeautifulSoup(caption, features="html.parser").text
-
- # @
- caption = re.sub(r"@[\w\d]+\b", "", caption)
-
- # 31C0—31EF CJK Strokes
- # 31F0—31FF Katakana Phonetic Extensions
- # 3200—32FF Enclosed CJK Letters and Months
- # 3300—33FF CJK Compatibility
- # 3400—4DBF CJK Unified Ideographs Extension A
- # 4DC0—4DFF Yijing Hexagram Symbols
- # 4E00—9FFF CJK Unified Ideographs
- caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
- caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
- caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
- caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
- caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
- caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
- caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
- #######################################################
-
- # все виды тире / all types of dash --> "-"
- caption = re.sub(
- r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
- "-",
- caption,
- )
-
- # кавычки к одному стандарту
- caption = re.sub(r"[`´«»“”¨]", '"', caption)
- caption = re.sub(r"[‘’]", "'", caption)
-
- # &quot;
- caption = re.sub(r"&quot;?", "", caption)
- # &amp
- caption = re.sub(r"&amp", "", caption)
-
- # ip addresses:
- caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
-
- # article ids:
- caption = re.sub(r"\d:\d\d\s+$", "", caption)
-
- # \n
- caption = re.sub(r"\\n", " ", caption)
-
- # "#123"
- caption = re.sub(r"#\d{1,3}\b", "", caption)
- # "#12345.."
- caption = re.sub(r"#\d{5,}\b", "", caption)
- # "123456.."
- caption = re.sub(r"\b\d{6,}\b", "", caption)
- # filenames:
- caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
-
- #
- caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
- caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
-
- caption = re.sub(BAD_PUNCT_REGEX, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
- caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
-
- # this-is-my-cute-cat / this_is_my_cute_cat
- regex2 = re.compile(r"(?:\-|\_)")
- if len(re.findall(regex2, caption)) > 3:
- caption = re.sub(regex2, " ", caption)
-
- caption = basic_clean(caption)
-
- caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640
- caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc
- caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231
-
- caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
- caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
- caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
- caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
- caption = re.sub(r"\bpage\s+\d+\b", "", caption)
-
- caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
-
- caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
-
- caption = re.sub(r"\b\s+\:\s+", r": ", caption)
- caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
- caption = re.sub(r"\s+", " ", caption)
-
-    caption = caption.strip()
-
- caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
- caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
- caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
- caption = re.sub(r"^\.\S+$", "", caption)
-
- return caption.strip()
-
-
-def get_10m_set():
- meta_path_10m = '/mnt/hdd/data/Panda-70M/raw/meta/train/panda70m_training_10m.csv'
- meta_10m = pd.read_csv(meta_path_10m)
-
- def process_single_caption(row):
- text_list = eval(row['caption'])
- clean_list = [clean_caption(x) for x in text_list]
- return str(clean_list)
-
- ret = apply(meta_10m, process_single_caption, axis=1)
- # ret = meta_10m.progress_apply(process_single_caption, axis=1)
- print('==> text processed.')
-
- text_list = []
- for x in ret:
- text_list += eval(x)
- # text_set = text_set.union(set(eval(x)))
- text_set = set(text_list)
- # meta_10m['caption_new'] = ret
- # meta_10m.to_csv('/mnt/hdd/data/Panda-70M/raw/meta/train/panda70m_training_10m_new-cap.csv')
-
- # video_id_set = set(meta_10m['videoID'])
- # id2t = {}
- # for idx, row in tqdm(meta_10m.iterrows(), total=len(meta_10m)):
- # video_id = row['videoID']
- # text_list = eval(row['caption'])
- # id2t[video_id] = set(text_list)
-
- print(f"==> Loaded meta_10m from '{meta_path_10m}'")
- return text_set
-
-
-def filter_panda10m_text(meta_path, text_set):
- def process_single_row(row):
- # path = row['path']
- t = row['text']
- # fname = os.path.basename(path)
- # video_id = fname[:fname.rindex('_')]
- if t not in text_set:
- return False
- return True
-
- meta = pd.read_csv(meta_path)
- ret = apply(meta, process_single_row, axis=1)
- # ret = meta.progress_apply(process_single_row, axis=1)
-
- meta = meta[ret]
- wo_ext, ext = os.path.splitext(meta_path)
- out_path = f"{wo_ext}_filter-10m{ext}"
- meta.to_csv(out_path, index=False)
- print(f"New meta (shape={meta.shape}) saved to '{out_path}'.")
-
-
-def filter_panda10m_timestamp(meta_path):
- meta_path_10m = '/mnt/hdd/data/Panda-70M/raw/meta/train/panda70m_training_10m.csv'
- meta_10m = pd.read_csv(meta_path_10m)
-
- id2t = {}
- for idx, row in tqdm(meta_10m.iterrows(), total=len(meta_10m)):
- video_id = row['videoID']
- timestamp = eval(row['timestamp'])
- timestamp = [str(tuple(x)) for x in timestamp]
- id2t[video_id] = timestamp
-
- # video_id_set_10m = set(meta_10m['videoID'])
- print(f"==> Loaded meta_10m from '{meta_path_10m}'")
-
- def process_single_row(row):
- path = row['path']
- t = row['timestamp']
- fname = os.path.basename(path)
- video_id = fname[:fname.rindex('_')]
- if video_id not in id2t:
- return False
- if t not in id2t[video_id]:
- return False
- return True
- # return video_id in video_id_set_10m
-
- meta = pd.read_csv(meta_path)
- ret = apply(meta, process_single_row, axis=1)
-
- meta = meta[ret]
- wo_ext, ext = os.path.splitext(meta_path)
- out_path = f"{wo_ext}_filter-10m{ext}"
- meta.to_csv(out_path, index=False)
- print(f"New meta (shape={meta.shape}) saved to '{out_path}'.")
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument('--meta_path', type=str, nargs='+')
- parser.add_argument('--num_workers', default=5, type=int)
-
- args = parser.parse_args()
- return args
-
-
-if __name__ == '__main__':
- args = parse_args()
-
- text_set = get_10m_set()
- for x in args.meta_path:
- filter_panda10m_text(x, text_set)
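For reference, a minimal usage sketch of the filter above (CSV names are placeholders; the Panda-70M 10M metadata path is hard-coded in `get_10m_set`, and each input CSV is expected to carry a `text` column):

```python
# Hypothetical invocation from the repo root:
#   python -m tools.datasets.filter_panda10m --meta_path part0.csv part1.csv
#
# Programmatically, this mirrors the __main__ block: rows whose caption is not in
# the cleaned Panda-70M 10M caption set are dropped, and the result is written
# next to the input as "<name>_filter-10m.csv".
text_set = get_10m_set()                          # builds the cleaned-caption set once
filter_panda10m_text("part0.csv", text_set)       # writes part0_filter-10m.csv
```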
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/preprocess_msrvtt.py b/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/preprocess_msrvtt.py
deleted file mode 100644
index 65bdac485f37c8c14de8bd2ff686ee29dcad3ce4..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/preprocess_msrvtt.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-# coding=utf-8
-import csv
-import json
-import argparse
-
-
-def trans(path):
- jsonfile = path
- csvfile = path.replace(".json", ".csv")
- with open(jsonfile, "r") as f:
- json_obj = json.loads(f.read())
-
- with open(csvfile, "w") as f:
- writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_ALL)
- for row in json_obj:
- writer.writerow(list(row.values()))
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
-
- # model config
- parser.add_argument("--data_path", help="data annotation file path")
-
- return parser.parse_args()
-
-
-if __name__ == "__main__":
- args = parse_args()
- trans(args.data_path)
\ No newline at end of file
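A small, hedged sketch of what `trans` does (the record fields below are made up; the function simply writes the values of each JSON record as one quoted CSV row):

```python
import json

# Hypothetical MSR-VTT style annotation: a JSON list of records.
records = [
    {"path": "video0.mp4", "text": "a man is singing"},
    {"path": "video1.mp4", "text": "a cat plays piano"},
]
with open("msrvtt_demo.json", "w") as f:
    json.dump(records, f)

trans("msrvtt_demo.json")   # writes msrvtt_demo.csv, one row of values per record
```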
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/split.py b/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/split.py
deleted file mode 100644
index 4e312b2bd55adcbe834e8897affd7e3099fd2d42..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/split.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import argparse
-from typing import List
-
-import pandas as pd
-from mmengine.config import Config
-
-from opensora.datasets.bucket import Bucket
-
-
-def split_by_bucket(
- bucket: Bucket,
- input_files: List[str],
- output_path: str,
- limit: int,
- frame_interval: int,
-):
- print(f"Split {len(input_files)} files into {len(bucket)} buckets")
- total_limit = len(bucket) * limit
- bucket_cnt = {}
- # get all bucket id
- for hw_id, d in bucket.ar_criteria.items():
- for t_id, v in d.items():
- for ar_id in v.keys():
- bucket_id = (hw_id, t_id, ar_id)
- bucket_cnt[bucket_id] = 0
- output_df = None
- # split files
- for path in input_files:
- df = pd.read_csv(path)
- if output_df is None:
- output_df = pd.DataFrame(columns=df.columns)
- for i in range(len(df)):
- row = df.iloc[i]
- t, h, w = row["num_frames"], row["height"], row["width"]
- bucket_id = bucket.get_bucket_id(t, h, w, frame_interval)
- if bucket_id is None:
- continue
- if bucket_cnt[bucket_id] < limit:
- bucket_cnt[bucket_id] += 1
- output_df = pd.concat([output_df, pd.DataFrame([row])], ignore_index=True)
- if len(output_df) >= total_limit:
- break
- if len(output_df) >= total_limit:
- break
- assert len(output_df) <= total_limit
- if len(output_df) == total_limit:
- print(f"All buckets are full ({total_limit} samples)")
- else:
- print(f"Only {len(output_df)} files are used")
- output_df.to_csv(output_path, index=False)
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("input", type=str, nargs="+")
- parser.add_argument("-o", "--output", required=True)
- parser.add_argument("-c", "--config", required=True)
- parser.add_argument("-l", "--limit", default=200, type=int)
- args = parser.parse_args()
- assert args.limit > 0
-
- cfg = Config.fromfile(args.config)
- bucket_config = cfg.bucket_config
- # rewrite bucket_config
- for ar, d in bucket_config.items():
- for frames, t in d.items():
- p, bs = t
- if p > 0.0:
- p = 1.0
- d[frames] = (p, bs)
- bucket = Bucket(bucket_config)
- split_by_bucket(bucket, args.input, args.output, args.limit, cfg.dataset.frame_interval)
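A hedged usage sketch for this bucket-based splitter (paths are placeholders; the config passed with `-c` must define `bucket_config` and `dataset.frame_interval`):

```python
# Hypothetical invocation from the repo root:
#   python -m tools.datasets.split part0.csv part1.csv -o split.csv -c path/to/train_config.py -l 200
#
# The snippet below mirrors the probability rewrite in __main__: every bucket with a
# non-zero keep probability is forced to probability 1.0 before splitting.
bucket_config = {"240p": {16: (0.5, 4), 32: (0.0, 2)}}   # toy example of the expected structure
for ar, d in bucket_config.items():
    for frames, (p, bs) in d.items():
        d[frames] = (1.0 if p > 0.0 else p, bs)
print(bucket_config)   # {'240p': {16: (1.0, 4), 32: (0.0, 2)}}
```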
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/transform.py b/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/transform.py
deleted file mode 100644
index 94a42766d7178e5dec5a0906a3252cde7af8d815..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/transform.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import argparse
-import os
-import random
-
-import cv2
-import numpy as np
-import pandas as pd
-from tqdm import tqdm
-
-from .utils import IMG_EXTENSIONS, extract_frames
-
-tqdm.pandas()
-
-try:
- from pandarallel import pandarallel
-
- pandarallel.initialize(progress_bar=True)
- pandas_has_parallel = True
-except ImportError:
- pandas_has_parallel = False
-
-
-def apply(df, func, **kwargs):
- if pandas_has_parallel:
- return df.parallel_apply(func, **kwargs)
- return df.progress_apply(func, **kwargs)
-
-
-def get_new_path(path, input_dir, output):
- path_new = os.path.join(output, os.path.relpath(path, input_dir))
- os.makedirs(os.path.dirname(path_new), exist_ok=True)
- return path_new
-
-
-def resize(path, length, input_dir, output):
- path_new = get_new_path(path, input_dir, output)
- ext = os.path.splitext(path)[1].lower()
- assert ext in IMG_EXTENSIONS
- img = cv2.imread(path)
- h, w = img.shape[:2]
- if min(h, w) > length:
- if h > w:
- new_h = length
- new_w = int(w * new_h / h)
- else:
- new_w = length
- new_h = int(h * new_w / w)
- img = cv2.resize(img, (new_w, new_h))
- cv2.imwrite(path_new, img)
- return path_new
-
-
-def rand_crop(path, input_dir, output):
- ext = os.path.splitext(path)[1].lower()
- path_new = get_new_path(path, input_dir, output)
- assert ext in IMG_EXTENSIONS
- img = cv2.imread(path)
-    h, w = img.shape[:2]
-    # img.shape is (rows, cols, channels); crop one random quadrant
-    pos = random.randint(0, 3)
-    if pos == 0:
-        img_cropped = img[: h // 2, : w // 2]
-    elif pos == 1:
-        img_cropped = img[h // 2 :, : w // 2]
-    elif pos == 2:
-        img_cropped = img[: h // 2, w // 2 :]
-    else:
-        img_cropped = img[h // 2 :, w // 2 :]
- cv2.imwrite(path_new, img_cropped)
- return path_new
-
-
-def main(args):
- data = pd.read_csv(args.input)
- if args.method == "img_rand_crop":
- data["path"] = apply(data["path"], lambda x: rand_crop(x, args.input_dir, args.output))
- elif args.method == "img_resize":
- data["path"] = apply(data["path"], lambda x: resize(x, args.length, args.input_dir, args.output))
- elif args.method == "vid_frame_extract":
- points = args.points if args.points is not None else args.points_index
- data = pd.DataFrame(np.repeat(data.values, 3, axis=0), columns=data.columns)
- num_points = len(points)
- data["point"] = np.nan
- for i, point in enumerate(points):
- if isinstance(point, int):
- data.loc[i::num_points, "point"] = point
- else:
- data.loc[i::num_points, "point"] = data.loc[i::num_points, "num_frames"] * point
- data["path"] = apply(data, lambda x: extract_frames(x["path"], args.input_dir, args.output, x["point"]), axis=1)
-
- output_csv = args.input.replace(".csv", f"_resized{args.length}.csv")
- data.to_csv(output_csv, index=False)
- print(f"Saved to {output_csv}")
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument("method", type=str, choices=["img_resize", "img_rand_crop", "vid_frame_extract"])
- parser.add_argument("input", type=str)
- parser.add_argument("input_dir", type=str)
- parser.add_argument("output", type=str)
- parser.add_argument("--disable-parallel", action="store_true")
- parser.add_argument("--length", type=int, default=2160)
- parser.add_argument("--seed", type=int, default=42, help="seed for random")
- parser.add_argument("--points", nargs="+", type=float, default=None)
- parser.add_argument("--points_index", nargs="+", type=int, default=None)
- args = parser.parse_args()
- return args
-
-
-if __name__ == "__main__":
- args = parse_args()
- random.seed(args.seed)
- if args.disable_parallel:
- pandas_has_parallel = False
- main(args)
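A hedged usage sketch for the transforms above (paths are placeholders):

```python
# Hypothetical invocations from the repo root:
#   python -m tools.datasets.transform img_resize meta.csv /data/raw /data/resized --length 2160
#   python -m tools.datasets.transform img_rand_crop meta.csv /data/raw /data/cropped
#
# get_new_path mirrors the input directory tree under the output directory:
print(get_new_path("/data/raw/clips/000/img.jpg", "/data/raw", "/data/resized"))
# -> /data/resized/clips/000/img.jpg  (intermediate directories are created if missing)
```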
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/utils.py b/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/utils.py
deleted file mode 100644
index c91691b0a988d820de099fdb400f5e956b58562b..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/datasets/utils.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import os
-
-import cv2
-import numpy as np
-from PIL import Image
-
-IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
-VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
-
-
-def is_video(filename):
- ext = os.path.splitext(filename)[-1].lower()
- return ext in VID_EXTENSIONS
-
-
-def extract_frames(
- video_path,
- frame_inds=None,
- points=None,
- backend="opencv",
- return_length=False,
- num_frames=None,
-):
- """
- Args:
- video_path (str): path to video
- frame_inds (List[int]): indices of frames to extract
- points (List[float]): values within [0, 1); multiply #frames to get frame indices
- Return:
- List[PIL.Image]
- """
- assert backend in ["av", "opencv", "decord"]
- assert (frame_inds is None) or (points is None)
-
- if backend == "av":
- import av
-
- container = av.open(video_path)
- if num_frames is not None:
- total_frames = num_frames
- else:
- total_frames = container.streams.video[0].frames
-
- if points is not None:
- frame_inds = [int(p * total_frames) for p in points]
-
- frames = []
- for idx in frame_inds:
- if idx >= total_frames:
- idx = total_frames - 1
- target_timestamp = int(idx * av.time_base / container.streams.video[0].average_rate)
- container.seek(target_timestamp)
- frame = next(container.decode(video=0)).to_image()
- frames.append(frame)
-
- if return_length:
- return frames, total_frames
- return frames
-
- elif backend == "decord":
- import decord
-
- container = decord.VideoReader(video_path, num_threads=1)
- if num_frames is not None:
- total_frames = num_frames
- else:
- total_frames = len(container)
-
- if points is not None:
- frame_inds = [int(p * total_frames) for p in points]
-
- frame_inds = np.array(frame_inds).astype(np.int32)
- frame_inds[frame_inds >= total_frames] = total_frames - 1
- frames = container.get_batch(frame_inds).asnumpy() # [N, H, W, C]
- frames = [Image.fromarray(x) for x in frames]
-
- if return_length:
- return frames, total_frames
- return frames
-
- elif backend == "opencv":
- cap = cv2.VideoCapture(video_path)
- if num_frames is not None:
- total_frames = num_frames
- else:
- total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
- if points is not None:
- frame_inds = [int(p * total_frames) for p in points]
-
- frames = []
- for idx in frame_inds:
- if idx >= total_frames:
- idx = total_frames - 1
-
- cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
- # HACK: sometimes OpenCV fails to read frames, return a black frame instead
- try:
- ret, frame = cap.read()
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
- frame = Image.fromarray(frame)
- except Exception as e:
- print(f"Error reading frame {video_path}: {e}")
- height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
- width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
- frame = Image.new("RGB", (width, height), (0, 0, 0))
- # HACK: if height or width is 0, return a black frame instead
- if frame.height == 0 or frame.width == 0:
- height = width = 256
- frame = Image.new("RGB", (width, height), (0, 0, 0))
-
- frames.append(frame)
-
- if return_length:
- return frames, total_frames
- return frames
- else:
- raise ValueError
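A short sketch of `extract_frames` (the video path is hypothetical): grab frames at 10%, 50% and 90% of the clip with the OpenCV backend.

```python
frames, total = extract_frames(
    "clips/sample.mp4",          # hypothetical path
    points=[0.1, 0.5, 0.9],
    backend="opencv",
    return_length=True,
)
print(total, [f.size for f in frames])   # frame count and (width, height) of each PIL image
```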
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/README.md b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/README.md
deleted file mode 100644
index 8418e6679834459f1c63a425206bcfcd97667d53..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/README.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# Frame Interpolation
-
-For the current version, we sample 1 frame out of every 3 frames in the video. Although we plan to use a VAE to avoid this frame loss, for now we provide a frame interpolation tool to restore the frame rate. The frame interpolation tool is based on [AMT](https://github.com/MCG-NKU/AMT).
-
-Interpolation can be useful for scenery videos, but it may not be suitable for videos with fast motion.
-
-## Requirement
-
-```bash
-conda install -c conda-forge opencv
-pip install imageio
-```
-
-## Model
-
-We use **AMT** as our frame interpolation model. After sampling, you can use the frame interpolation model to smooth the sampled video.
-
-## Usage
-
-The checkpoint file is automatically downloaded to the path given by `--ckpt` (default `./pretrained_models/amt-g.pth`). You can apply frame interpolation to a single video file or to a folder of videos.
-
-1. Process a single video file
-
-```bash
-python -m tools.frame_interpolation.interpolation your_video.mp4
-```
-
-2. Process all video files in the target directory
-
-```bash
-python -m tools.frame_interpolation.interpolation your_video_dir --output_path samples/interpolation
-```
-
-The output video is stored at `output_path`, and its duration equals `the total number of frames after frame interpolation / the frame rate`.
-
-### Command Line Arguments
-
-* `input`: Path of the input video. **Video path** or **folder path** (used with `--folder`).
-* `--ckpt`: Pretrained model of [AMT](https://github.com/MCG-NKU/AMT). Default path: `./pretrained_models/amt-g.pth`.
-* `--niters`: Iterations of interpolation. With $m$ input frames, `--niters` $=n$ corresponds to $2^n\times (m-1)+1$ output frames; see the worked example below.
-* `--fps`: Frame rate of the input video. (Default: 8)
-* `--output_path`: **Folder path** of the output video.
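Worked example (the file name is a placeholder): running `python -m tools.frame_interpolation.interpolation your_video.mp4 --niters 2 --fps 8` doubles the frame count twice, so a 25-frame clip yields $2^2\times(25-1)+1=97$ frames, and the output video is written at $8\times2^2=32$ fps.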
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/interpolation.py b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/interpolation.py
deleted file mode 100644
index 0cd822b6638bcc6f4d9e94b0a779be1e51cc23e4..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/interpolation.py
+++ /dev/null
@@ -1,219 +0,0 @@
-# this script is modified from https://github.com/MCG-NKU/AMT/blob/main/demos/demo_2x.py
-import argparse
-import os
-import os.path as osp
-
-import cv2
-import numpy as np
-import torch
-
-from opensora.utils.ckpt_utils import download_model
-
-from .networks.amt_g import Model
-from .utils.utils import InputPadder, img2tensor, tensor2img
-
-hf_endpoint = os.environ.get("HF_ENDPOINT")
-if hf_endpoint is None:
- hf_endpoint = "https://huggingface.co"
-VID_EXT = [".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".webm"]
-network_cfg = {
- "params": {
- "corr_radius": 3,
- "corr_lvls": 4,
- "num_flows": 5,
- },
-}
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-
-def init():
- """
- initialize the device and the anchor resolution.
- """
-
- if device == "cuda":
- anchor_resolution = 1024 * 512
- anchor_memory = 1500 * 1024**2
- anchor_memory_bias = 2500 * 1024**2
- vram_avail = torch.cuda.get_device_properties(device).total_memory
- print("VRAM available: {:.1f} MB".format(vram_avail / 1024**2))
- else:
- # Do not resize in cpu mode
- anchor_resolution = 8192 * 8192
- anchor_memory = 1
- anchor_memory_bias = 0
- vram_avail = 1
-
- return anchor_resolution, anchor_memory, anchor_memory_bias, vram_avail
-
-
-def get_input_video_from_path(input_path):
- """
- Get the input video from the input_path.
-
- params:
- input_path: str, the path of the input video.
- returns:
- inputs: list, the list of the input frames.
- scale: float, the scale of the input frames.
- padder: InputPadder, the padder to pad the input frames.
- """
-
- anchor_resolution, anchor_memory, anchor_memory_bias, vram_avail = init()
-
- if osp.splitext(input_path)[-1].lower() in VID_EXT:
- vcap = cv2.VideoCapture(input_path)
-
- inputs = []
- w = int(vcap.get(cv2.CAP_PROP_FRAME_WIDTH))
- h = int(vcap.get(cv2.CAP_PROP_FRAME_HEIGHT))
- scale = anchor_resolution / (h * w) * np.sqrt((vram_avail - anchor_memory_bias) / anchor_memory)
- scale = 1 if scale > 1 else scale
- scale = 1 / np.floor(1 / np.sqrt(scale) * 16) * 16
- if scale < 1:
- print(f"Due to the limited VRAM, the video will be scaled by {scale:.2f}")
- padding = int(16 / scale)
- padder = InputPadder((h, w), padding)
- while True:
- ret, frame = vcap.read()
- if ret is False:
- break
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
- frame_t = img2tensor(frame).to(device)
- frame_t = padder.pad(frame_t)
- inputs.append(frame_t)
- print(f"Loading the [video] from {input_path}, the number of frames [{len(inputs)}]")
- else:
- raise TypeError("Input should be a video.")
-
- return inputs, scale, padder
-
-
-def load_model(ckpt):
- """
- load the frame interpolation model.
- """
- params = network_cfg.get("params", {})
- model = Model(**params)
- model.load_state_dict(ckpt["state_dict"])
- model = model.to(device)
- model.eval()
- return model
-
-
-def interpolater(model, inputs, scale, padder, iters=1):
- """
- interpolating with the interpolation model.
-
- params:
- model: nn.Module, the frame interpolation model.
- inputs: list, the list of the input frames.
- scale: float, the scale of the input frames.
-        iters: int, the number of interpolation iterations. With m input frames, the model generates 2 ** iters * (m - 1) + 1 output frames.
- returns:
- outputs: list, the list of the output frames.
- """
-
- print("Start frame interpolation:")
- embt = torch.tensor(1 / 2).float().view(1, 1, 1, 1).to(device)
-
- for i in range(iters):
- print(f"Iter {i+1}. input_frames={len(inputs)} output_frames={2*len(inputs)-1}")
- outputs = [inputs[0]]
- for in_0, in_1 in zip(inputs[:-1], inputs[1:]):
- in_0 = in_0.to(device)
- in_1 = in_1.to(device)
- with torch.no_grad():
- imgt_pred = model(in_0, in_1, embt, scale_factor=scale, eval=True)["imgt_pred"]
- outputs += [imgt_pred.cpu(), in_1.cpu()]
- inputs = outputs
-
- outputs = padder.unpad(*outputs)
- return outputs
-
-
-def write(outputs, input_path, output_path, fps=30):
- """
- write results to the output_path.
- """
-
- if osp.exists(output_path) is False:
- os.makedirs(output_path)
-
- size = outputs[0].shape[2:][::-1]
-
- _, file_name_with_extension = os.path.split(input_path)
- file_name, _ = os.path.splitext(file_name_with_extension)
-
- save_video_path = f"{output_path}/fps{fps}_{file_name}.mp4"
- fourcc = cv2.VideoWriter_fourcc(*"avc1")
- writer = cv2.VideoWriter(save_video_path, fourcc, fps, size)
-
-    for imgt_pred in outputs:
-        imgt_pred = tensor2img(imgt_pred)
-        imgt_pred = cv2.cvtColor(imgt_pred, cv2.COLOR_RGB2BGR)
-        writer.write(imgt_pred)
-
-    writer.release()
-    print(f"Demo video is saved to [{save_video_path}]")
-
-
-def process(
- model,
- image_path,
- output_path,
- fps,
- iters,
-):
- inputs, scale, padder = get_input_video_from_path(image_path)
- outputs = interpolater(model, inputs, scale, padder, iters)
- write(outputs, image_path, output_path, fps)
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument("input", help="Input video.")
- parser.add_argument("--ckpt", type=str, default="./pretrained_models/amt-g.pth", help="The pretrained model.")
- parser.add_argument(
- "--niters",
- type=int,
- default=1,
- help="Iter of Interpolation. The number of frames will be double after per iter.",
- )
- parser.add_argument("--output_path", type=str, default="samples", help="Output path.")
- parser.add_argument("--fps", type=int, default=8, help="Frames rate of the output video.")
- parser.add_argument("--folder", action="store_true", help="If the input is a folder, set this flag.")
- args = parser.parse_args()
-
- times_frame = 2**args.niters
- old_fps = args.fps
- args.fps = args.fps * times_frame
- print(f"Interpolation will turn {old_fps}fps video to {args.fps}fps video.")
- args.input = os.path.expanduser(args.input)
- args.ckpt = os.path.expanduser(args.ckpt)
- args.folder = osp.splitext(args.input)[-1].lower() not in VID_EXT
- args.ckpt = download_model(local_path=args.ckpt, url=hf_endpoint + "/lalala125/AMT/resolve/main/amt-g.pth")
- return args
-
-
-if __name__ == "__main__":
- args = parse_args()
- ckpt_path = args.ckpt
- input_path = args.input
- output_path = args.output_path
- iters = int(args.niters)
- fps = int(args.fps)
-
-    ckpt = torch.load(ckpt_path, map_location=device)
-    model = load_model(ckpt)
-
- if args.folder:
- for file in os.listdir(input_path):
- if osp.splitext(file)[-1].lower() in VID_EXT:
- vid_path = os.path.join(input_path, file)
- process(model, vid_path, output_path, fps, iters)
- else:
- process(model, input_path, output_path, fps, iters)
-
- print("Interpolation is done.")
- print(f"Output path: {output_path}")
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/__init__.py
deleted file mode 100644
index 4db0516c70c506c454be74855adffa9ba686e0fe..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .amt_g import Model
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/amt_g.py b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/amt_g.py
deleted file mode 100644
index 84b28cbfabfd469be5ff47815babc49cd7ddbe12..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/amt_g.py
+++ /dev/null
@@ -1,156 +0,0 @@
-import torch
-import torch.nn as nn
-
-from .blocks.feat_enc import LargeEncoder
-from .blocks.ifrnet import Encoder, InitDecoder, IntermediateDecoder, resize
-from .blocks.multi_flow import MultiFlowDecoder, multi_flow_combine
-from .blocks.raft import BasicUpdateBlock, BidirCorrBlock, coords_grid
-
-
-class Model(nn.Module):
- def __init__(self, corr_radius=3, corr_lvls=4, num_flows=5, channels=[84, 96, 112, 128], skip_channels=84):
- super(Model, self).__init__()
- self.radius = corr_radius
- self.corr_levels = corr_lvls
- self.num_flows = num_flows
-
- self.feat_encoder = LargeEncoder(output_dim=128, norm_fn="instance", dropout=0.0)
- self.encoder = Encoder(channels, large=True)
- self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
- self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
- self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
- self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)
-
- self.update4 = self._get_updateblock(112, None)
- self.update3_low = self._get_updateblock(96, 2.0)
- self.update2_low = self._get_updateblock(84, 4.0)
-
- self.update3_high = self._get_updateblock(96, None)
- self.update2_high = self._get_updateblock(84, None)
-
- self.comb_block = nn.Sequential(
- nn.Conv2d(3 * self.num_flows, 6 * self.num_flows, 7, 1, 3),
- nn.PReLU(6 * self.num_flows),
- nn.Conv2d(6 * self.num_flows, 3, 7, 1, 3),
- )
-
- def _get_updateblock(self, cdim, scale_factor=None):
- return BasicUpdateBlock(
- cdim=cdim,
- hidden_dim=192,
- flow_dim=64,
- corr_dim=256,
- corr_dim2=192,
- fc_dim=188,
- scale_factor=scale_factor,
- corr_levels=self.corr_levels,
- radius=self.radius,
- )
-
- def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
- # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
- # based on linear assumption
- t1_scale = 1.0 / embt
- t0_scale = 1.0 / (1.0 - embt)
- if downsample != 1:
- inv = 1 / downsample
- flow0 = inv * resize(flow0, scale_factor=inv)
- flow1 = inv * resize(flow1, scale_factor=inv)
-
- corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
- corr = torch.cat([corr0, corr1], dim=1)
- flow = torch.cat([flow0, flow1], dim=1)
- return corr, flow
-
- def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
- mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
- img0 = img0 - mean_
- img1 = img1 - mean_
- img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0
- img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
- b, _, h, w = img0_.shape
- coord = coords_grid(b, h // 8, w // 8, img0.device)
-
- fmap0, fmap1 = self.feat_encoder([img0_, img1_]) # [1, 128, H//8, W//8]
- corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)
-
- # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
- # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
- f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
- f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)
-
- ######################################### the 4th decoder #########################################
- up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
- corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord, up_flow0_4, up_flow1_4, embt, downsample=1)
-
- # residue update with lookup corr
- delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
- delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
- up_flow0_4 = up_flow0_4 + delta_flow0_4
- up_flow1_4 = up_flow1_4 + delta_flow1_4
- ft_3_ = ft_3_ + delta_ft_3_
-
- ######################################### the 3rd decoder #########################################
- up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
- corr_3, flow_3 = self._corr_scale_lookup(corr_fn, coord, up_flow0_3, up_flow1_3, embt, downsample=2)
-
- # residue update with lookup corr
- delta_ft_2_, delta_flow_3 = self.update3_low(ft_2_, flow_3, corr_3)
- delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
- up_flow0_3 = up_flow0_3 + delta_flow0_3
- up_flow1_3 = up_flow1_3 + delta_flow1_3
- ft_2_ = ft_2_ + delta_ft_2_
-
- # residue update with lookup corr (hr)
- corr_3 = resize(corr_3, scale_factor=2.0)
- up_flow_3 = torch.cat([up_flow0_3, up_flow1_3], dim=1)
- delta_ft_2_, delta_up_flow_3 = self.update3_high(ft_2_, up_flow_3, corr_3)
- ft_2_ += delta_ft_2_
- up_flow0_3 += delta_up_flow_3[:, 0:2]
- up_flow1_3 += delta_up_flow_3[:, 2:4]
-
- ######################################### the 2nd decoder #########################################
- up_flow0_2, up_flow1_2, ft_1_ = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
- corr_2, flow_2 = self._corr_scale_lookup(corr_fn, coord, up_flow0_2, up_flow1_2, embt, downsample=4)
-
- # residue update with lookup corr
- delta_ft_1_, delta_flow_2 = self.update2_low(ft_1_, flow_2, corr_2)
- delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
- up_flow0_2 = up_flow0_2 + delta_flow0_2
- up_flow1_2 = up_flow1_2 + delta_flow1_2
- ft_1_ = ft_1_ + delta_ft_1_
-
- # residue update with lookup corr (hr)
- corr_2 = resize(corr_2, scale_factor=4.0)
- up_flow_2 = torch.cat([up_flow0_2, up_flow1_2], dim=1)
- delta_ft_1_, delta_up_flow_2 = self.update2_high(ft_1_, up_flow_2, corr_2)
- ft_1_ += delta_ft_1_
- up_flow0_2 += delta_up_flow_2[:, 0:2]
- up_flow1_2 += delta_up_flow_2[:, 2:4]
-
- ######################################### the 1st decoder #########################################
- up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)
-
- if scale_factor != 1.0:
- up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0 / scale_factor)) * (1.0 / scale_factor)
- up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0 / scale_factor)) * (1.0 / scale_factor)
- mask = resize(mask, scale_factor=(1.0 / scale_factor))
- img_res = resize(img_res, scale_factor=(1.0 / scale_factor))
-
- # Merge multiple predictions
- imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1, mask, img_res, mean_)
- imgt_pred = torch.clamp(imgt_pred, 0, 1)
-
- if eval:
- return {
- "imgt_pred": imgt_pred,
- }
- else:
- up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, h, w)
- up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, h, w)
- return {
- "imgt_pred": imgt_pred,
- "flow0_pred": [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
- "flow1_pred": [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
- "ft_pred": [ft_1_, ft_2_, ft_3_],
- }
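A hedged sanity check of the model interface with randomly initialised weights, just to illustrate the expected shapes (inputs should have spatial sizes divisible by 16, which the interpolation script guarantees via `InputPadder`):

```python
import torch

model = Model(corr_radius=3, corr_lvls=4, num_flows=5).eval()
img0 = torch.rand(1, 3, 256, 256)
img1 = torch.rand(1, 3, 256, 256)
embt = torch.tensor(0.5).view(1, 1, 1, 1)   # ask for the midpoint frame
with torch.no_grad():
    out = model(img0, img1, embt, scale_factor=1.0, eval=True)
print(out["imgt_pred"].shape)                # torch.Size([1, 3, 256, 256])
```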
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/feat_enc.py b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/feat_enc.py
deleted file mode 100644
index 479833824b8b2da7e9e3ba05c84b0359b8c79c37..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/feat_enc.py
+++ /dev/null
@@ -1,335 +0,0 @@
-import torch
-import torch.nn as nn
-
-
-class BottleneckBlock(nn.Module):
- def __init__(self, in_planes, planes, norm_fn="group", stride=1):
- super(BottleneckBlock, self).__init__()
-
- self.conv1 = nn.Conv2d(in_planes, planes // 4, kernel_size=1, padding=0)
- self.conv2 = nn.Conv2d(planes // 4, planes // 4, kernel_size=3, padding=1, stride=stride)
- self.conv3 = nn.Conv2d(planes // 4, planes, kernel_size=1, padding=0)
- self.relu = nn.ReLU(inplace=True)
-
- num_groups = planes // 8
-
- if norm_fn == "group":
- self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes // 4)
- self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes // 4)
- self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
- if not stride == 1:
- self.norm4 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
-
- elif norm_fn == "batch":
- self.norm1 = nn.BatchNorm2d(planes // 4)
- self.norm2 = nn.BatchNorm2d(planes // 4)
- self.norm3 = nn.BatchNorm2d(planes)
- if not stride == 1:
- self.norm4 = nn.BatchNorm2d(planes)
-
- elif norm_fn == "instance":
- self.norm1 = nn.InstanceNorm2d(planes // 4)
- self.norm2 = nn.InstanceNorm2d(planes // 4)
- self.norm3 = nn.InstanceNorm2d(planes)
- if not stride == 1:
- self.norm4 = nn.InstanceNorm2d(planes)
-
- elif norm_fn == "none":
- self.norm1 = nn.Sequential()
- self.norm2 = nn.Sequential()
- self.norm3 = nn.Sequential()
- if not stride == 1:
- self.norm4 = nn.Sequential()
-
- if stride == 1:
- self.downsample = None
-
- else:
- self.downsample = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4)
-
- def forward(self, x):
- y = x
- y = self.relu(self.norm1(self.conv1(y)))
- y = self.relu(self.norm2(self.conv2(y)))
- y = self.relu(self.norm3(self.conv3(y)))
-
- if self.downsample is not None:
- x = self.downsample(x)
-
- return self.relu(x + y)
-
-
-class ResidualBlock(nn.Module):
- def __init__(self, in_planes, planes, norm_fn="group", stride=1):
- super(ResidualBlock, self).__init__()
-
- self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride)
- self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
- self.relu = nn.ReLU(inplace=True)
-
- num_groups = planes // 8
-
- if norm_fn == "group":
- self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
- self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
- if not stride == 1:
- self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
-
- elif norm_fn == "batch":
- self.norm1 = nn.BatchNorm2d(planes)
- self.norm2 = nn.BatchNorm2d(planes)
- if not stride == 1:
- self.norm3 = nn.BatchNorm2d(planes)
-
- elif norm_fn == "instance":
- self.norm1 = nn.InstanceNorm2d(planes)
- self.norm2 = nn.InstanceNorm2d(planes)
- if not stride == 1:
- self.norm3 = nn.InstanceNorm2d(planes)
-
- elif norm_fn == "none":
- self.norm1 = nn.Sequential()
- self.norm2 = nn.Sequential()
- if not stride == 1:
- self.norm3 = nn.Sequential()
-
- if stride == 1:
- self.downsample = None
-
- else:
- self.downsample = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)
-
- def forward(self, x):
- y = x
- y = self.relu(self.norm1(self.conv1(y)))
- y = self.relu(self.norm2(self.conv2(y)))
-
- if self.downsample is not None:
- x = self.downsample(x)
-
- return self.relu(x + y)
-
-
-class SmallEncoder(nn.Module):
- def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
- super(SmallEncoder, self).__init__()
- self.norm_fn = norm_fn
-
- if self.norm_fn == "group":
- self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32)
-
- elif self.norm_fn == "batch":
- self.norm1 = nn.BatchNorm2d(32)
-
- elif self.norm_fn == "instance":
- self.norm1 = nn.InstanceNorm2d(32)
-
- elif self.norm_fn == "none":
- self.norm1 = nn.Sequential()
-
- self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3)
- self.relu1 = nn.ReLU(inplace=True)
-
- self.in_planes = 32
- self.layer1 = self._make_layer(32, stride=1)
- self.layer2 = self._make_layer(64, stride=2)
- self.layer3 = self._make_layer(96, stride=2)
-
- self.dropout = None
- if dropout > 0:
- self.dropout = nn.Dropout2d(p=dropout)
-
- self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1)
-
- for m in self.modules():
- if isinstance(m, nn.Conv2d):
- nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
- elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
- if m.weight is not None:
- nn.init.constant_(m.weight, 1)
- if m.bias is not None:
- nn.init.constant_(m.bias, 0)
-
- def _make_layer(self, dim, stride=1):
- layer1 = BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride)
- layer2 = BottleneckBlock(dim, dim, self.norm_fn, stride=1)
- layers = (layer1, layer2)
-
- self.in_planes = dim
- return nn.Sequential(*layers)
-
- def forward(self, x):
- # if input is list, combine batch dimension
- is_list = isinstance(x, tuple) or isinstance(x, list)
- if is_list:
- batch_dim = x[0].shape[0]
- x = torch.cat(x, dim=0)
-
- x = self.conv1(x)
- x = self.norm1(x)
- x = self.relu1(x)
-
- x = self.layer1(x)
- x = self.layer2(x)
- x = self.layer3(x)
- x = self.conv2(x)
-
- if self.training and self.dropout is not None:
- x = self.dropout(x)
-
- if is_list:
- x = torch.split(x, [batch_dim, batch_dim], dim=0)
-
- return x
-
-
-class BasicEncoder(nn.Module):
- def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
- super(BasicEncoder, self).__init__()
- self.norm_fn = norm_fn
-
- if self.norm_fn == "group":
- self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)
-
- elif self.norm_fn == "batch":
- self.norm1 = nn.BatchNorm2d(64)
-
- elif self.norm_fn == "instance":
- self.norm1 = nn.InstanceNorm2d(64)
-
- elif self.norm_fn == "none":
- self.norm1 = nn.Sequential()
-
- self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
- self.relu1 = nn.ReLU(inplace=True)
-
- self.in_planes = 64
- self.layer1 = self._make_layer(64, stride=1)
- self.layer2 = self._make_layer(72, stride=2)
- self.layer3 = self._make_layer(128, stride=2)
-
- # output convolution
- self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)
-
- self.dropout = None
- if dropout > 0:
- self.dropout = nn.Dropout2d(p=dropout)
-
- for m in self.modules():
- if isinstance(m, nn.Conv2d):
- nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
- elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
- if m.weight is not None:
- nn.init.constant_(m.weight, 1)
- if m.bias is not None:
- nn.init.constant_(m.bias, 0)
-
- def _make_layer(self, dim, stride=1):
- layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
- layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
- layers = (layer1, layer2)
-
- self.in_planes = dim
- return nn.Sequential(*layers)
-
- def forward(self, x):
- # if input is list, combine batch dimension
- is_list = isinstance(x, tuple) or isinstance(x, list)
- if is_list:
- batch_dim = x[0].shape[0]
- x = torch.cat(x, dim=0)
-
- x = self.conv1(x)
- x = self.norm1(x)
- x = self.relu1(x)
-
- x = self.layer1(x)
- x = self.layer2(x)
- x = self.layer3(x)
-
- x = self.conv2(x)
-
- if self.training and self.dropout is not None:
- x = self.dropout(x)
-
- if is_list:
- x = torch.split(x, [batch_dim, batch_dim], dim=0)
-
- return x
-
-
-class LargeEncoder(nn.Module):
- def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
- super(LargeEncoder, self).__init__()
- self.norm_fn = norm_fn
-
- if self.norm_fn == "group":
- self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)
-
- elif self.norm_fn == "batch":
- self.norm1 = nn.BatchNorm2d(64)
-
- elif self.norm_fn == "instance":
- self.norm1 = nn.InstanceNorm2d(64)
-
- elif self.norm_fn == "none":
- self.norm1 = nn.Sequential()
-
- self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
- self.relu1 = nn.ReLU(inplace=True)
-
- self.in_planes = 64
- self.layer1 = self._make_layer(64, stride=1)
- self.layer2 = self._make_layer(112, stride=2)
- self.layer3 = self._make_layer(160, stride=2)
- self.layer3_2 = self._make_layer(160, stride=1)
-
- # output convolution
- self.conv2 = nn.Conv2d(self.in_planes, output_dim, kernel_size=1)
-
- self.dropout = None
- if dropout > 0:
- self.dropout = nn.Dropout2d(p=dropout)
-
- for m in self.modules():
- if isinstance(m, nn.Conv2d):
- nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
- elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
- if m.weight is not None:
- nn.init.constant_(m.weight, 1)
- if m.bias is not None:
- nn.init.constant_(m.bias, 0)
-
- def _make_layer(self, dim, stride=1):
- layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
- layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
- layers = (layer1, layer2)
-
- self.in_planes = dim
- return nn.Sequential(*layers)
-
- def forward(self, x):
- # if input is list, combine batch dimension
- is_list = isinstance(x, tuple) or isinstance(x, list)
- if is_list:
- batch_dim = x[0].shape[0]
- x = torch.cat(x, dim=0)
-
- x = self.conv1(x)
- x = self.norm1(x)
- x = self.relu1(x)
-
- x = self.layer1(x)
- x = self.layer2(x)
- x = self.layer3(x)
- x = self.layer3_2(x)
-
- x = self.conv2(x)
-
- if self.training and self.dropout is not None:
- x = self.dropout(x)
-
- if is_list:
- x = torch.split(x, [batch_dim, batch_dim], dim=0)
-
- return x
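A hedged shape check for the correlation feature encoder, matching how `amt_g.py` instantiates it:

```python
import torch

enc = LargeEncoder(output_dim=128, norm_fn="instance", dropout=0.0).eval()
f0, f1 = enc([torch.rand(1, 3, 256, 256), torch.rand(1, 3, 256, 256)])
print(f0.shape, f1.shape)   # both torch.Size([1, 128, 32, 32]), i.e. 1/8 resolution
```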
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/ifrnet.py b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/ifrnet.py
deleted file mode 100644
index 5719a040e102c36a417925e78f5acb4cf4402725..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/ifrnet.py
+++ /dev/null
@@ -1,115 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from tools.frame_interpolation.utils.flow_utils import warp
-
-
-def resize(x, scale_factor):
- return F.interpolate(x, scale_factor=scale_factor, mode="bilinear", align_corners=False)
-
-
-def convrelu(in_channels, out_channels, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, bias=True):
- return nn.Sequential(
- nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias=bias),
- nn.PReLU(out_channels),
- )
-
-
-class ResBlock(nn.Module):
- def __init__(self, in_channels, side_channels, bias=True):
- super(ResBlock, self).__init__()
- self.side_channels = side_channels
- self.conv1 = nn.Sequential(
- nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=bias), nn.PReLU(in_channels)
- )
- self.conv2 = nn.Sequential(
- nn.Conv2d(side_channels, side_channels, kernel_size=3, stride=1, padding=1, bias=bias),
- nn.PReLU(side_channels),
- )
- self.conv3 = nn.Sequential(
- nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=bias), nn.PReLU(in_channels)
- )
- self.conv4 = nn.Sequential(
- nn.Conv2d(side_channels, side_channels, kernel_size=3, stride=1, padding=1, bias=bias),
- nn.PReLU(side_channels),
- )
- self.conv5 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=bias)
- self.prelu = nn.PReLU(in_channels)
-
- def forward(self, x):
- out = self.conv1(x)
-
- res_feat = out[:, : -self.side_channels, ...]
- side_feat = out[:, -self.side_channels :, :, :]
- side_feat = self.conv2(side_feat)
- out = self.conv3(torch.cat([res_feat, side_feat], 1))
-
- res_feat = out[:, : -self.side_channels, ...]
- side_feat = out[:, -self.side_channels :, :, :]
- side_feat = self.conv4(side_feat)
- out = self.conv5(torch.cat([res_feat, side_feat], 1))
-
- out = self.prelu(x + out)
- return out
-
-
-class Encoder(nn.Module):
- def __init__(self, channels, large=False):
- super(Encoder, self).__init__()
- self.channels = channels
- prev_ch = 3
- for idx, ch in enumerate(channels, 1):
- k = 7 if large and idx == 1 else 3
- p = 3 if k == 7 else 1
- self.register_module(
- f"pyramid{idx}", nn.Sequential(convrelu(prev_ch, ch, k, 2, p), convrelu(ch, ch, 3, 1, 1))
- )
- prev_ch = ch
-
- def forward(self, in_x):
- fs = []
- for idx in range(len(self.channels)):
- out_x = getattr(self, f"pyramid{idx+1}")(in_x)
- fs.append(out_x)
- in_x = out_x
- return fs
-
-
-class InitDecoder(nn.Module):
- def __init__(self, in_ch, out_ch, skip_ch) -> None:
- super().__init__()
- self.convblock = nn.Sequential(
- convrelu(in_ch * 2 + 1, in_ch * 2),
- ResBlock(in_ch * 2, skip_ch),
- nn.ConvTranspose2d(in_ch * 2, out_ch + 4, 4, 2, 1, bias=True),
- )
-
- def forward(self, f0, f1, embt):
- h, w = f0.shape[2:]
- embt = embt.repeat(1, 1, h, w)
- out = self.convblock(torch.cat([f0, f1, embt], 1))
- flow0, flow1 = torch.chunk(out[:, :4, ...], 2, 1)
- ft_ = out[:, 4:, ...]
- return flow0, flow1, ft_
-
-
-class IntermediateDecoder(nn.Module):
- def __init__(self, in_ch, out_ch, skip_ch) -> None:
- super().__init__()
- self.convblock = nn.Sequential(
- convrelu(in_ch * 3 + 4, in_ch * 3),
- ResBlock(in_ch * 3, skip_ch),
- nn.ConvTranspose2d(in_ch * 3, out_ch + 4, 4, 2, 1, bias=True),
- )
-
- def forward(self, ft_, f0, f1, flow0_in, flow1_in):
- f0_warp = warp(f0, flow0_in)
- f1_warp = warp(f1, flow1_in)
- f_in = torch.cat([ft_, f0_warp, f1_warp, flow0_in, flow1_in], 1)
- out = self.convblock(f_in)
- flow0, flow1 = torch.chunk(out[:, :4, ...], 2, 1)
- ft_ = out[:, 4:, ...]
- flow0 = flow0 + 2.0 * resize(flow0_in, scale_factor=2.0)
- flow1 = flow1 + 2.0 * resize(flow1_in, scale_factor=2.0)
- return flow0, flow1, ft_
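A hedged shape check for the feature pyramid encoder, using the channel widths from `amt_g.py`:

```python
import torch

enc = Encoder([84, 96, 112, 128], large=True)
for f in enc(torch.rand(1, 3, 256, 256)):
    print(f.shape)
# torch.Size([1, 84, 128, 128]) ... torch.Size([1, 128, 16, 16]), i.e. strides 2, 4, 8, 16
```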
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/multi_flow.py b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/multi_flow.py
deleted file mode 100644
index cbb96a9ef6bcee99627e7c844e45987bfb2d9308..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/multi_flow.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import torch
-import torch.nn as nn
-
-from tools.frame_interpolation.utils.flow_utils import warp
-
-from .ifrnet import ResBlock, convrelu, resize
-
-
-def multi_flow_combine(comb_block, img0, img1, flow0, flow1, mask=None, img_res=None, mean=None):
- """
- A parallel implementation of multiple flow field warping
- comb_block: An nn.Seqential object.
- img shape: [b, c, h, w]
- flow shape: [b, 2*num_flows, h, w]
- mask (opt):
- If 'mask' is None, the function conduct a simple average.
- img_res (opt):
- If 'img_res' is None, the function adds zero instead.
- mean (opt):
- If 'mean' is None, the function adds zero instead.
- """
- b, c, h, w = flow0.shape
- num_flows = c // 2
- flow0 = flow0.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)
- flow1 = flow1.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)
-
- mask = mask.reshape(b, num_flows, 1, h, w).reshape(-1, 1, h, w) if mask is not None else None
- img_res = img_res.reshape(b, num_flows, 3, h, w).reshape(-1, 3, h, w) if img_res is not None else 0
- img0 = torch.stack([img0] * num_flows, 1).reshape(-1, 3, h, w)
- img1 = torch.stack([img1] * num_flows, 1).reshape(-1, 3, h, w)
- mean = torch.stack([mean] * num_flows, 1).reshape(-1, 1, 1, 1) if mean is not None else 0
-
- img0_warp = warp(img0, flow0)
- img1_warp = warp(img1, flow1)
- img_warps = mask * img0_warp + (1 - mask) * img1_warp + mean + img_res
- img_warps = img_warps.reshape(b, num_flows, 3, h, w)
- imgt_pred = img_warps.mean(1) + comb_block(img_warps.view(b, -1, h, w))
- return imgt_pred
-
-
-class MultiFlowDecoder(nn.Module):
- def __init__(self, in_ch, skip_ch, num_flows=3):
- super(MultiFlowDecoder, self).__init__()
- self.num_flows = num_flows
- self.convblock = nn.Sequential(
- convrelu(in_ch * 3 + 4, in_ch * 3),
- ResBlock(in_ch * 3, skip_ch),
- nn.ConvTranspose2d(in_ch * 3, 8 * num_flows, 4, 2, 1, bias=True),
- )
-
- def forward(self, ft_, f0, f1, flow0, flow1):
- n = self.num_flows
- f0_warp = warp(f0, flow0)
- f1_warp = warp(f1, flow1)
- out = self.convblock(torch.cat([ft_, f0_warp, f1_warp, flow0, flow1], 1))
- delta_flow0, delta_flow1, mask, img_res = torch.split(out, [2 * n, 2 * n, n, 3 * n], 1)
- mask = torch.sigmoid(mask)
-
- flow0 = delta_flow0 + 2.0 * resize(flow0, scale_factor=2.0).repeat(1, self.num_flows, 1, 1)
- flow1 = delta_flow1 + 2.0 * resize(flow1, scale_factor=2.0).repeat(1, self.num_flows, 1, 1)
-
- return flow0, flow1, mask, img_res
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/raft.py b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/raft.py
deleted file mode 100644
index 1576889201c49614224450c9a223b871e8031f2d..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/networks/blocks/raft.py
+++ /dev/null
@@ -1,213 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-def resize(x, scale_factor):
- return F.interpolate(x, scale_factor=scale_factor, mode="bilinear", align_corners=False)
-
-
-def bilinear_sampler(img, coords, mask=False):
- """Wrapper for grid_sample, uses pixel coordinates"""
- H, W = img.shape[-2:]
- xgrid, ygrid = coords.split([1, 1], dim=-1)
- xgrid = 2 * xgrid / (W - 1) - 1
- ygrid = 2 * ygrid / (H - 1) - 1
-
- grid = torch.cat([xgrid, ygrid], dim=-1)
- img = F.grid_sample(img, grid, align_corners=True)
-
- if mask:
- mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
- return img, mask.float()
-
- return img
-
-
-def coords_grid(batch, ht, wd, device):
- coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device), indexing="ij")
- coords = torch.stack(coords[::-1], dim=0).float()
- return coords[None].repeat(batch, 1, 1, 1)
-
-
-class SmallUpdateBlock(nn.Module):
- def __init__(self, cdim, hidden_dim, flow_dim, corr_dim, fc_dim, corr_levels=4, radius=3, scale_factor=None):
- super(SmallUpdateBlock, self).__init__()
- cor_planes = corr_levels * (2 * radius + 1) ** 2
- self.scale_factor = scale_factor
-
- self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
- self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
- self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
- self.conv = nn.Conv2d(corr_dim + flow_dim, fc_dim, 3, padding=1)
-
- self.gru = nn.Sequential(
- nn.Conv2d(fc_dim + 4 + cdim, hidden_dim, 3, padding=1),
- nn.LeakyReLU(negative_slope=0.1, inplace=True),
- nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
- )
-
- self.feat_head = nn.Sequential(
- nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
- nn.LeakyReLU(negative_slope=0.1, inplace=True),
- nn.Conv2d(hidden_dim, cdim, 3, padding=1),
- )
-
- self.flow_head = nn.Sequential(
- nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
- nn.LeakyReLU(negative_slope=0.1, inplace=True),
- nn.Conv2d(hidden_dim, 4, 3, padding=1),
- )
-
- self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
-
- def forward(self, net, flow, corr):
- net = resize(net, 1 / self.scale_factor) if self.scale_factor is not None else net
- cor = self.lrelu(self.convc1(corr))
- flo = self.lrelu(self.convf1(flow))
- flo = self.lrelu(self.convf2(flo))
- cor_flo = torch.cat([cor, flo], dim=1)
- inp = self.lrelu(self.conv(cor_flo))
- inp = torch.cat([inp, flow, net], dim=1)
-
- out = self.gru(inp)
- delta_net = self.feat_head(out)
- delta_flow = self.flow_head(out)
-
- if self.scale_factor is not None:
- delta_net = resize(delta_net, scale_factor=self.scale_factor)
- delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)
-
- return delta_net, delta_flow
-
-
-class BasicUpdateBlock(nn.Module):
- def __init__(
- self,
- cdim,
- hidden_dim,
- flow_dim,
- corr_dim,
- corr_dim2,
- fc_dim,
- corr_levels=4,
- radius=3,
- scale_factor=None,
- out_num=1,
- ):
- super(BasicUpdateBlock, self).__init__()
- cor_planes = corr_levels * (2 * radius + 1) ** 2
-
- self.scale_factor = scale_factor
- self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
- self.convc2 = nn.Conv2d(corr_dim, corr_dim2, 3, padding=1)
- self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
- self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
- self.conv = nn.Conv2d(flow_dim + corr_dim2, fc_dim, 3, padding=1)
-
- self.gru = nn.Sequential(
- nn.Conv2d(fc_dim + 4 + cdim, hidden_dim, 3, padding=1),
- nn.LeakyReLU(negative_slope=0.1, inplace=True),
- nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
- )
-
- self.feat_head = nn.Sequential(
- nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
- nn.LeakyReLU(negative_slope=0.1, inplace=True),
- nn.Conv2d(hidden_dim, cdim, 3, padding=1),
- )
-
- self.flow_head = nn.Sequential(
- nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
- nn.LeakyReLU(negative_slope=0.1, inplace=True),
- nn.Conv2d(hidden_dim, 4 * out_num, 3, padding=1),
- )
-
- self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
-
- def forward(self, net, flow, corr):
- net = resize(net, 1 / self.scale_factor) if self.scale_factor is not None else net
- cor = self.lrelu(self.convc1(corr))
- cor = self.lrelu(self.convc2(cor))
- flo = self.lrelu(self.convf1(flow))
- flo = self.lrelu(self.convf2(flo))
- cor_flo = torch.cat([cor, flo], dim=1)
- inp = self.lrelu(self.conv(cor_flo))
- inp = torch.cat([inp, flow, net], dim=1)
-
- out = self.gru(inp)
- delta_net = self.feat_head(out)
- delta_flow = self.flow_head(out)
-
- if self.scale_factor is not None:
- delta_net = resize(delta_net, scale_factor=self.scale_factor)
- delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)
- return delta_net, delta_flow
-
-
-class BidirCorrBlock:
- def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
- self.num_levels = num_levels
- self.radius = radius
- self.corr_pyramid = []
- self.corr_pyramid_T = []
-
- corr = BidirCorrBlock.corr(fmap1, fmap2)
- batch, h1, w1, dim, h2, w2 = corr.shape
- corr_T = corr.clone().permute(0, 4, 5, 3, 1, 2)
-
- corr = corr.reshape(batch * h1 * w1, dim, h2, w2)
- corr_T = corr_T.reshape(batch * h2 * w2, dim, h1, w1)
-
- self.corr_pyramid.append(corr)
- self.corr_pyramid_T.append(corr_T)
-
- for _ in range(self.num_levels - 1):
- corr = F.avg_pool2d(corr, 2, stride=2)
- corr_T = F.avg_pool2d(corr_T, 2, stride=2)
- self.corr_pyramid.append(corr)
- self.corr_pyramid_T.append(corr_T)
-
- def __call__(self, coords0, coords1):
- r = self.radius
- coords0 = coords0.permute(0, 2, 3, 1)
- coords1 = coords1.permute(0, 2, 3, 1)
- assert coords0.shape == coords1.shape, f"coords0 shape: [{coords0.shape}] is not equal to [{coords1.shape}]"
- batch, h1, w1, _ = coords0.shape
-
- out_pyramid = []
- out_pyramid_T = []
- for i in range(self.num_levels):
- corr = self.corr_pyramid[i]
- corr_T = self.corr_pyramid_T[i]
-
- dx = torch.linspace(-r, r, 2 * r + 1, device=coords0.device)
- dy = torch.linspace(-r, r, 2 * r + 1, device=coords0.device)
- delta = torch.stack(torch.meshgrid(dy, dx, indexing="ij"), axis=-1)
- delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2)
-
- centroid_lvl_0 = coords0.reshape(batch * h1 * w1, 1, 1, 2) / 2**i
- centroid_lvl_1 = coords1.reshape(batch * h1 * w1, 1, 1, 2) / 2**i
- coords_lvl_0 = centroid_lvl_0 + delta_lvl
- coords_lvl_1 = centroid_lvl_1 + delta_lvl
-
- corr = bilinear_sampler(corr, coords_lvl_0)
- corr_T = bilinear_sampler(corr_T, coords_lvl_1)
- corr = corr.view(batch, h1, w1, -1)
- corr_T = corr_T.view(batch, h1, w1, -1)
- out_pyramid.append(corr)
- out_pyramid_T.append(corr_T)
-
- out = torch.cat(out_pyramid, dim=-1)
- out_T = torch.cat(out_pyramid_T, dim=-1)
- return out.permute(0, 3, 1, 2).contiguous().float(), out_T.permute(0, 3, 1, 2).contiguous().float()
-
- @staticmethod
- def corr(fmap1, fmap2):
- batch, dim, ht, wd = fmap1.shape
- fmap1 = fmap1.view(batch, dim, ht * wd)
- fmap2 = fmap2.view(batch, dim, ht * wd)
-
- corr = torch.matmul(fmap1.transpose(1, 2), fmap2)
- corr = corr.view(batch, ht, wd, 1, ht, wd)
- return corr / torch.sqrt(torch.tensor(dim).float())
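A small sketch of `coords_grid`, which builds the pixel-coordinate grid used as the correlation lookup centres:

```python
import torch

coord = coords_grid(1, 4, 5, device="cpu")
print(coord.shape)   # torch.Size([1, 2, 4, 5]); channel 0 holds x (column), channel 1 holds y (row)
```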
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/utils/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/utils/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/utils/dist_utils.py b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/utils/dist_utils.py
deleted file mode 100644
index d754d4fc7a6ed1a9bae246b2f895456218d815ea..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/utils/dist_utils.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import os
-
-import torch
-
-
-def get_world_size():
- """Find OMPI world size without calling mpi functions
- :rtype: int
- """
- if os.environ.get("PMI_SIZE") is not None:
- return int(os.environ.get("PMI_SIZE") or 1)
- elif os.environ.get("OMPI_COMM_WORLD_SIZE") is not None:
- return int(os.environ.get("OMPI_COMM_WORLD_SIZE") or 1)
- else:
- return torch.cuda.device_count()
-
-
-def get_global_rank():
- """Find OMPI world rank without calling mpi functions
- :rtype: int
- """
- if os.environ.get("PMI_RANK") is not None:
- return int(os.environ.get("PMI_RANK") or 0)
- elif os.environ.get("OMPI_COMM_WORLD_RANK") is not None:
- return int(os.environ.get("OMPI_COMM_WORLD_RANK") or 0)
- else:
- return 0
-
-
-def get_local_rank():
- """Find OMPI local rank without calling mpi functions
- :rtype: int
- """
- if os.environ.get("MPI_LOCALRANKID") is not None:
- return int(os.environ.get("MPI_LOCALRANKID") or 0)
- elif os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") is not None:
- return int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") or 0)
- else:
- return 0
-
-
-def get_master_ip():
- if os.environ.get("AZ_BATCH_MASTER_NODE") is not None:
- return os.environ.get("AZ_BATCH_MASTER_NODE").split(":")[0]
- elif os.environ.get("AZ_BATCHAI_MPI_MASTER_NODE") is not None:
- return os.environ.get("AZ_BATCHAI_MPI_MASTER_NODE")
- else:
- return "127.0.0.1"
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/utils/flow_utils.py b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/utils/flow_utils.py
deleted file mode 100644
index 4edee465ab5e16459358c3c4c2a1ac20b468d90e..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/utils/flow_utils.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import numpy as np
-import torch
-import torch.nn.functional as F
-from PIL import ImageFile
-
-ImageFile.LOAD_TRUNCATED_IMAGES = True
-
-
-def warp(img, flow):
- B, _, H, W = flow.shape
- xx = torch.linspace(-1.0, 1.0, W).view(1, 1, 1, W).expand(B, -1, H, -1)
- yy = torch.linspace(-1.0, 1.0, H).view(1, 1, H, 1).expand(B, -1, -1, W)
- grid = torch.cat([xx, yy], 1).to(img)
- flow_ = torch.cat([flow[:, 0:1, :, :] / ((W - 1.0) / 2.0), flow[:, 1:2, :, :] / ((H - 1.0) / 2.0)], 1)
- grid_ = (grid + flow_).permute(0, 2, 3, 1)
- output = F.grid_sample(input=img, grid=grid_, mode="bilinear", padding_mode="border", align_corners=True)
- return output
-
-
-def make_colorwheel():
- """
- Generates a color wheel for optical flow visualization as presented in:
- Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
- URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf
- Code follows the original C++ source code of Daniel Scharstein.
- Code follows the Matlab source code of Deqing Sun.
- Returns:
- np.ndarray: Color wheel
- """
-
- RY = 15
- YG = 6
- GC = 4
- CB = 11
- BM = 13
- MR = 6
-
- ncols = RY + YG + GC + CB + BM + MR
- colorwheel = np.zeros((ncols, 3))
- col = 0
-
- # RY
- colorwheel[0:RY, 0] = 255
- colorwheel[0:RY, 1] = np.floor(255 * np.arange(0, RY) / RY)
- col = col + RY
- # YG
- colorwheel[col : col + YG, 0] = 255 - np.floor(255 * np.arange(0, YG) / YG)
- colorwheel[col : col + YG, 1] = 255
- col = col + YG
- # GC
- colorwheel[col : col + GC, 1] = 255
- colorwheel[col : col + GC, 2] = np.floor(255 * np.arange(0, GC) / GC)
- col = col + GC
- # CB
- colorwheel[col : col + CB, 1] = 255 - np.floor(255 * np.arange(CB) / CB)
- colorwheel[col : col + CB, 2] = 255
- col = col + CB
- # BM
- colorwheel[col : col + BM, 2] = 255
- colorwheel[col : col + BM, 0] = np.floor(255 * np.arange(0, BM) / BM)
- col = col + BM
- # MR
- colorwheel[col : col + MR, 2] = 255 - np.floor(255 * np.arange(MR) / MR)
- colorwheel[col : col + MR, 0] = 255
- return colorwheel
-
-
-def flow_uv_to_colors(u, v, convert_to_bgr=False):
- """
- Applies the flow color wheel to (possibly clipped) flow components u and v.
- According to the C++ source code of Daniel Scharstein
- According to the Matlab source code of Deqing Sun
- Args:
- u (np.ndarray): Input horizontal flow of shape [H,W]
- v (np.ndarray): Input vertical flow of shape [H,W]
- convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
- Returns:
- np.ndarray: Flow visualization image of shape [H,W,3]
- """
- flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8)
- colorwheel = make_colorwheel() # shape [55x3]
- ncols = colorwheel.shape[0]
- rad = np.sqrt(np.square(u) + np.square(v))
- a = np.arctan2(-v, -u) / np.pi
- fk = (a + 1) / 2 * (ncols - 1)
- k0 = np.floor(fk).astype(np.int32)
- k1 = k0 + 1
- k1[k1 == ncols] = 0
- f = fk - k0
- for i in range(colorwheel.shape[1]):
- tmp = colorwheel[:, i]
- col0 = tmp[k0] / 255.0
- col1 = tmp[k1] / 255.0
- col = (1 - f) * col0 + f * col1
- idx = rad <= 1
- col[idx] = 1 - rad[idx] * (1 - col[idx])
- col[~idx] = col[~idx] * 0.75 # out of range
- # Note the 2-i => BGR instead of RGB
- ch_idx = 2 - i if convert_to_bgr else i
- flow_image[:, :, ch_idx] = np.floor(255 * col)
- return flow_image
-
-
-def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
- """
- Expects a two-dimensional flow image of shape [H,W,2].
- Args:
- flow_uv (np.ndarray): Flow UV image of shape [H,W,2]
- clip_flow (float, optional): Clip maximum of flow values. Defaults to None.
- convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
- Returns:
- np.ndarray: Flow visualization image of shape [H,W,3]
- """
- assert flow_uv.ndim == 3, "input flow must have three dimensions"
- assert flow_uv.shape[2] == 2, "input flow must have shape [H,W,2]"
- if clip_flow is not None:
- flow_uv = np.clip(flow_uv, 0, clip_flow)
- u = flow_uv[:, :, 0]
- v = flow_uv[:, :, 1]
- rad = np.sqrt(np.square(u) + np.square(v))
- rad_max = np.max(rad)
- epsilon = 1e-5
- u = u / (rad_max + epsilon)
- v = v / (rad_max + epsilon)
- return flow_uv_to_colors(u, v, convert_to_bgr)
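-
-
-# A minimal usage sketch for the helpers above (shapes and values are illustrative):
-# `warp` expects NCHW tensors, while `flow_to_image` expects an HxWx2 numpy array.
-#
-#   flow_t = torch.randn(1, 2, 240, 320)   # [B, 2, H, W] flow in pixels
-#   img_t = torch.rand(1, 3, 240, 320)     # [B, 3, H, W] image in [0, 1]
-#   warped = warp(img_t, flow_t)           # img_t warped by flow_t
-#   flow_rgb = flow_to_image(flow_t[0].permute(1, 2, 0).numpy())  # [H, W, 3] uint8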
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/utils/utils.py b/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/utils/utils.py
deleted file mode 100644
index 285a65fd454e034ce672dcea82d1449bc77ef953..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/frame_interpolation/utils/utils.py
+++ /dev/null
@@ -1,314 +0,0 @@
-import random
-import re
-import sys
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-from imageio import imread, imwrite
-from PIL import ImageFile
-
-ImageFile.LOAD_TRUNCATED_IMAGES = True
-
-
-class AverageMeter:
- def __init__(self):
- self.reset()
-
- def reset(self):
- self.val = 0.0
- self.avg = 0.0
- self.sum = 0.0
- self.count = 0
-
- def update(self, val, n=1):
- self.val = val
- self.sum += val * n
- self.count += n
- self.avg = self.sum / self.count
-
-
-class AverageMeterGroups:
- def __init__(self) -> None:
- self.meter_dict = dict()
-
- def update(self, dict, n=1):
- for name, val in dict.items():
- if self.meter_dict.get(name) is None:
- self.meter_dict[name] = AverageMeter()
- self.meter_dict[name].update(val, n)
-
- def reset(self, name=None):
- if name is None:
- for v in self.meter_dict.values():
- v.reset()
- else:
- meter = self.meter_dict.get(name)
- if meter is not None:
- meter.reset()
-
- def avg(self, name):
- meter = self.meter_dict.get(name)
- if meter is not None:
- return meter.avg
-
-
-class InputPadder:
- """Pads images such that dimensions are divisible by divisor"""
-
- def __init__(self, dims, divisor=16):
- self.ht, self.wd = dims[-2:]
- pad_ht = (((self.ht // divisor) + 1) * divisor - self.ht) % divisor
- pad_wd = (((self.wd // divisor) + 1) * divisor - self.wd) % divisor
- self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, pad_ht - pad_ht // 2]
-
- def pad(self, *inputs):
- if len(inputs) == 1:
- return F.pad(inputs[0], self._pad, mode="replicate")
- else:
- return [F.pad(x, self._pad, mode="replicate") for x in inputs]
-
- def unpad(self, *inputs):
- if len(inputs) == 1:
- return self._unpad(inputs[0])
- else:
- return [self._unpad(x) for x in inputs]
-
- def _unpad(self, x):
- ht, wd = x.shape[-2:]
- c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]]
- return x[..., c[0] : c[1], c[2] : c[3]]
-
-
-def img2tensor(img):
- if img.shape[-1] > 3:
- img = img[:, :, :3]
- return torch.tensor(img).permute(2, 0, 1).unsqueeze(0) / 255.0
-
-
-def tensor2img(img_t):
- return (img_t * 255.0).detach().squeeze(0).permute(1, 2, 0).cpu().numpy().clip(0, 255).astype(np.uint8)
-
-
-def seed_all(seed):
- random.seed(seed)
- np.random.seed(seed)
- torch.manual_seed(seed)
- torch.cuda.manual_seed_all(seed)
-
-
-def read(file):
- if file.endswith(".float3"):
- return readFloat(file)
- elif file.endswith(".flo"):
- return readFlow(file)
- elif file.endswith(".ppm"):
- return readImage(file)
- elif file.endswith(".pgm"):
- return readImage(file)
- elif file.endswith(".png"):
- return readImage(file)
- elif file.endswith(".jpg"):
- return readImage(file)
- elif file.endswith(".pfm"):
- return readPFM(file)[0]
- else:
- raise Exception("don't know how to read %s" % file)
-
-
-def write(file, data):
- if file.endswith(".float3"):
- return writeFloat(file, data)
- elif file.endswith(".flo"):
- return writeFlow(file, data)
- elif file.endswith(".ppm"):
- return writeImage(file, data)
- elif file.endswith(".pgm"):
- return writeImage(file, data)
- elif file.endswith(".png"):
- return writeImage(file, data)
- elif file.endswith(".jpg"):
- return writeImage(file, data)
- elif file.endswith(".pfm"):
- return writePFM(file, data)
- else:
- raise Exception("don't know how to write %s" % file)
-
-
-def readPFM(file):
- file = open(file, "rb")
-
- color = None
- width = None
- height = None
- scale = None
- endian = None
-
- header = file.readline().rstrip()
- if header.decode("ascii") == "PF":
- color = True
- elif header.decode("ascii") == "Pf":
- color = False
- else:
- raise Exception("Not a PFM file.")
-
- dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
- if dim_match:
- width, height = list(map(int, dim_match.groups()))
- else:
- raise Exception("Malformed PFM header.")
-
- scale = float(file.readline().decode("ascii").rstrip())
- if scale < 0:
- endian = "<"
- scale = -scale
- else:
- endian = ">"
-
- data = np.fromfile(file, endian + "f")
- shape = (height, width, 3) if color else (height, width)
-
- data = np.reshape(data, shape)
- data = np.flipud(data)
- return data, scale
-
-
-def writePFM(file, image, scale=1):
- file = open(file, "wb")
-
- color = None
-
- if image.dtype.name != "float32":
- raise Exception("Image dtype must be float32.")
-
- image = np.flipud(image)
-
- if len(image.shape) == 3 and image.shape[2] == 3:
- color = True
- elif len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1:
- color = False
- else:
- raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")
-
- file.write("PF\n" if color else "Pf\n".encode())
- file.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))
-
- endian = image.dtype.byteorder
-
- if endian == "<" or endian == "=" and sys.byteorder == "little":
- scale = -scale
-
- file.write("%f\n".encode() % scale)
-
- image.tofile(file)
-
-
-def readFlow(name):
- if name.endswith(".pfm") or name.endswith(".PFM"):
- return readPFM(name)[0][:, :, 0:2]
-
- f = open(name, "rb")
-
- header = f.read(4)
- if header.decode("utf-8") != "PIEH":
- raise Exception("Flow file header does not contain PIEH")
-
- width = np.fromfile(f, np.int32, 1).squeeze()
- height = np.fromfile(f, np.int32, 1).squeeze()
-
- flow = np.fromfile(f, np.float32, width * height * 2).reshape((height, width, 2))
-
- return flow.astype(np.float32)
-
-
-def readImage(name):
- if name.endswith(".pfm") or name.endswith(".PFM"):
- data = readPFM(name)[0]
- if len(data.shape) == 3:
- return data[:, :, 0:3]
- else:
- return data
- return imread(name)
-
-
-def writeImage(name, data):
- if name.endswith(".pfm") or name.endswith(".PFM"):
- return writePFM(name, data, 1)
- return imwrite(name, data)
-
-
-def writeFlow(name, flow):
- f = open(name, "wb")
- f.write("PIEH".encode("utf-8"))
- np.array([flow.shape[1], flow.shape[0]], dtype=np.int32).tofile(f)
- flow = flow.astype(np.float32)
- flow.tofile(f)
-
-
-def readFloat(name):
- f = open(name, "rb")
-
- if (f.readline().decode("utf-8")) != "float\n":
- raise Exception("float file %s did not contain keyword" % name)
-
- dim = int(f.readline())
-
- dims = []
- count = 1
- for i in range(0, dim):
- d = int(f.readline())
- dims.append(d)
- count *= d
-
- dims = list(reversed(dims))
-
- data = np.fromfile(f, np.float32, count).reshape(dims)
- if dim > 2:
- data = np.transpose(data, (2, 1, 0))
- data = np.transpose(data, (1, 0, 2))
-
- return data
-
-
-def writeFloat(name, data):
- f = open(name, "wb")
-
- dim = len(data.shape)
- if dim > 3:
- raise Exception("bad float file dimension: %d" % dim)
-
- f.write(("float\n").encode("ascii"))
- f.write(("%d\n" % dim).encode("ascii"))
-
- if dim == 1:
- f.write(("%d\n" % data.shape[0]).encode("ascii"))
- else:
- f.write(("%d\n" % data.shape[1]).encode("ascii"))
- f.write(("%d\n" % data.shape[0]).encode("ascii"))
- for i in range(2, dim):
- f.write(("%d\n" % data.shape[i]).encode("ascii"))
-
- data = data.astype(np.float32)
- if dim == 2:
- data.tofile(f)
-
- else:
- np.transpose(data, (2, 0, 1)).tofile(f)
-
-
-def check_dim_and_resize(tensor_list):
- shape_list = []
- for t in tensor_list:
- shape_list.append(t.shape[2:])
-
- if len(set(shape_list)) > 1:
- desired_shape = shape_list[0]
- print(f"Inconsistent size of input video frames. All frames will be resized to {desired_shape}")
-
- resize_tensor_list = []
- for t in tensor_list:
- resize_tensor_list.append(torch.nn.functional.interpolate(t, size=tuple(desired_shape), mode="bilinear"))
-
- tensor_list = resize_tensor_list
-
- return tensor_list
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/intepolate/README.md b/PyTorch/built-in/mm/OpenSora1.1/tools/intepolate/README.md
deleted file mode 100644
index cd406e140267824d8f30405e09e7dfcb591eb207..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/intepolate/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# To be added
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/README.md b/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/README.md
deleted file mode 100644
index bb8c254384291ad8e6a1cadc1688f8ec22056a83..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/README.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Scene Detection and Video Splitting
-
-- [Scene Detection and Video Splitting](#scene-detection-and-video-splitting)
- - [Prepare Meta Files](#prepare-meta-files)
- - [Scene Detection](#scene-detection)
- - [Video Splitting](#video-splitting)
-
-In many cases, raw videos contain several scenes and are too long for training. Thus, it is essential to split them into shorter
-clips based on scenes. Here, we provide code for scene detection and video splitting.
-
-## Prepare Meta Files
-At this step, you should have a raw video dataset prepared. A meta file of the dataset information is needed for data processing. To create a meta file from a folder, run:
-
-```bash
-python -m tools.datasets.convert video /path/to/video/folder --output /path/to/save/meta.csv
-```
-This should output a `.csv` file with column `path`.
-
-If you already have a meta file for the videos and want to keep its information, you can add the paths to it instead.
-**Make sure** the meta file has a column `id` containing the ID of each video, and that each video is named `{id}.mp4`.
-The following command adds a new column `path` to the meta file.
-
-```bash
-python tools/scene_cut/convert_id_to_path.py /path/to/meta.csv --folder_path /path/to/video/folder
-```
-This should output
-- `{prefix}_path-filtered.csv` with column `path` (broken videos filtered out)
-- `{prefix}_path_intact.csv` with columns `path` and `intact` (`intact` indicates whether a video is intact)
-
-
-## Scene Detection
-The next step is to detect scenes in a video.
-We use [`PySceneDetect`](https://github.com/Breakthrough/PySceneDetect) for this job.
-**Make sure** the input meta file has column `path`, which is the path of a video.
-
-```bash
-python tools/scene_cut/scene_detect.py /path/to/meta.csv
-```
-The output is `{prefix}_timestamp.csv` with column `timestamp`. Each cell in column `timestamp` is a list of tuples,
-with each tuple indicating the start and end timestamp of a scene
-(e.g., `[('00:00:01.234', '00:00:02.345'), ('00:00:03.456', '00:00:04.567')]`).
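-
-If you need to consume these timestamps programmatically, each cell can be parsed back into a list of `(start, end)` pairs. A minimal sketch (the meta path below is illustrative):
-
-```python
-import ast
-
-import pandas as pd
-
-meta = pd.read_csv("/path/to/meta_timestamp.csv")
-# each cell stores the string form of a list of (start, end) timecode tuples
-scenes = ast.literal_eval(meta.loc[0, "timestamp"])
-for start, end in scenes:
-    print(f"scene from {start} to {end}")
-```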
-
-## Video Splitting
-After obtaining timestamps for scenes, we conduct video splitting (cutting).
-**Make sure** the meta file contains column `timestamp`.
-
-```bash
-python tools/scene_cut/cut.py /path/to/meta.csv --save_dir /path/to/output/dir
-```
-
-This will save video clips to `/path/to/output/dir`. The video clips are named `{video_id}_scene-{scene_id}.mp4`.
-
-To create a new meta file for the generated clips, run:
-```bash
-python -m tools.datasets.convert video /path/to/video/folder --output /path/to/save/meta.csv
-```
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/convert_id_to_path.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/convert_id_to_path.py
deleted file mode 100644
index eb7b1cb4e27e1de54738545e880544fac55a889f..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/convert_id_to_path.py
+++ /dev/null
@@ -1,128 +0,0 @@
-import os
-
-import argparse
-import json
-from functools import partial
-
-import numpy as np
-import pandas as pd
-from pandarallel import pandarallel
-import cv2
-from mmengine.logging import print_log
-from moviepy.editor import VideoFileClip
-from tqdm import tqdm
-
-tqdm.pandas()
-
-
-def is_intact_video(video_path, mode="moviepy", verbose=False, logger=None):
- if not os.path.exists(video_path):
- if verbose:
- print_log(f"Could not find '{video_path}'", logger=logger)
- return False
-
- if mode == "moviepy":
- try:
- VideoFileClip(video_path)
- if verbose:
- print_log(f"The video file '{video_path}' is intact.", logger=logger)
- return True
- except Exception as e:
- if verbose:
- print_log(f"Error: {e}", logger=logger)
- print_log(f"The video file '{video_path}' is not intact.", logger=logger)
- return False
- elif mode == "cv2":
- try:
- cap = cv2.VideoCapture(video_path)
- if cap.isOpened():
- if verbose:
- print_log(f"The video file '{video_path}' is intact.", logger=logger)
- return True
- except Exception as e:
- if verbose:
- print_log(f"Error: {e}", logger=logger)
- print_log(f"The video file '{video_path}' is not intact.", logger=logger)
- return False
- else:
- raise ValueError
-
-
-def has_downloaded_success(json_path):
- if not os.path.exists(json_path):
- return False
-
- try:
- with open(json_path, "r") as f:
- data = json.load(f)
- if "success" not in data or isinstance(data["success"], bool) is False or data["success"] is False:
- return False
- except Exception:
- return False
-
- return True
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument("meta_path", type=str)
- parser.add_argument("--folder_path", type=str, required=True)
- parser.add_argument("--mode", type=str, default=None)
-
- args = parser.parse_args()
- return args
-
-
-def main():
- args = parse_args()
-
- meta_path = args.meta_path
- folder_path = args.folder_path
- mode = args.mode
-
- def is_intact(row, mode=None):
- video_id = row["id"]
- video_path = os.path.join(folder_path, f"{video_id}.mp4")
- row["path"] = video_path
-
- if mode == ".mp4":
- if is_intact_video(video_path):
- return True, video_path
- return False, video_path
- elif mode == ".json":
- # json_path = os.path.join(root_raw, f"data/{split}/{video_id}.json")
- json_path = os.path.join(folder_path, f"{video_id}.json")
- if has_downloaded_success(json_path):
- return True, video_path
- return False, video_path
- elif mode is None:
- return True, video_path
- else:
- raise ValueError
-
- meta_dirpath = os.path.dirname(meta_path)
- meta_fname = os.path.basename(meta_path)
- wo_ext, ext = os.path.splitext(meta_fname)
-
- pandarallel.initialize(progress_bar=True)
- is_intact_partial = partial(is_intact, mode=mode)
-
- meta = pd.read_csv(meta_path)
- ret = meta.parallel_apply(is_intact_partial, axis=1)
- intact, paths = list(zip(*ret))
-
- meta["intact"] = intact
- meta["path"] = paths
- out_path = os.path.join(meta_dirpath, f"{wo_ext}_path_intact.csv")
- meta.to_csv(out_path, index=False)
- print(f"New meta (shape={meta.shape}) with intact info saved to '{out_path}'")
-
- meta_format = meta[np.array(intact)]
- meta_format.drop("intact", axis=1, inplace=True)
- out_path = os.path.join(meta_dirpath, f"{wo_ext}_path-filtered.csv")
- meta_format.to_csv(out_path, index=False)
- print(f"New meta (shape={meta_format.shape}) with format info saved to '{out_path}'")
-
-
-if __name__ == "__main__":
- main()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/cut.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/cut.py
deleted file mode 100644
index b3ecbe00b87c905d269c629bbd1b4fe7ada1e5ae..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/cut.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import argparse
-import ast
-import os
-import subprocess
-import time
-from functools import partial
-
-import pandas as pd
-from imageio_ffmpeg import get_ffmpeg_exe
-from mmengine.logging import MMLogger, print_log
-from pandarallel import pandarallel
-from scenedetect import FrameTimecode
-from tqdm import tqdm
-
-tqdm.pandas()
-
-
-def process_single_row(row, args, log_name=None):
- video_path = row["path"]
-
- logger = None
- if log_name is not None:
- logger = MMLogger.get_instance(log_name)
-
- # check mp4 integrity
- # if not is_intact_video(video_path, logger=logger):
- # return False
-
- timestamp = row["timestamp"]
- if not (timestamp.startswith("[") and timestamp.endswith("]")):
- return False
- scene_list = ast.literal_eval(timestamp)  # the cell stores the string form of a list of (start, end) tuples
- scene_list = [(FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) for s, t in scene_list]
- split_video(
- video_path,
- scene_list,
- save_dir=args.save_dir,
- min_seconds=args.min_seconds,
- max_seconds=args.max_seconds,
- target_fps=args.target_fps,
- shorter_size=args.shorter_size,
- logger=logger,
- )
-
-
-def split_video(
- video_path,
- scene_list,
- save_dir,
- min_seconds=2.0,
- max_seconds=15.0,
- target_fps=30,
- shorter_size=720,
- verbose=False,
- logger=None,
-):
- """
- scenes shorter than min_seconds will be ignored;
- scenes longer than max_seconds will be cut to save the beginning max_seconds.
- Currently, the saved file name pattern is f'{fname}_scene-{idx}.mp4'
-
- Args:
- scene_list (List[Tuple[FrameTimecode, FrameTimecode]]): each element is (s, t): start and end of a scene.
- min_seconds (float | None)
- max_seconds (float | None)
- target_fps (int | None)
- shorter_size (int | None)
- """
- FFMPEG_PATH = get_ffmpeg_exe()
-
- save_path_list = []
- for idx, scene in enumerate(scene_list):
- s, t = scene # FrameTimecode
- if min_seconds is not None:
- if (t - s).get_seconds() < min_seconds:
- continue
-
- duration = t - s
- if max_seconds is not None:
- fps = s.framerate
- max_duration = FrameTimecode(timecode="00:00:00", fps=fps)
- max_duration.frame_num = round(fps * max_seconds)
- duration = min(max_duration, duration)
-
- # save path
- fname = os.path.basename(video_path)
- fname_wo_ext = os.path.splitext(fname)[0]
- # TODO: fname pattern
- save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")
-
- # ffmpeg cmd
- cmd = [FFMPEG_PATH]
-
- # Uncomment the line below to limit ffmpeg output to error messages only.
- # cmd += ['-v', 'error']
-
- # clip to cut
- # -ss after -i is very slow; put -ss before -i
- cmd += ["-nostdin", "-y", "-ss", str(s.get_seconds()), "-i", video_path, "-t", str(duration.get_seconds())]
-
- # target fps
- if target_fps is not None:
- cmd += ["-r", f"{target_fps}"]
-
- # aspect ratio
- if shorter_size is not None:
- cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"]
- # cmd += ['-vf', f"scale='if(gt(iw,ih),{shorter_size},trunc(ow/a/2)*2)':-2"]
-
- cmd += ["-map", "0", save_path]
-
- proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
- stdout, stderr = proc.communicate()
- # stdout = stdout.decode("utf-8")
- # print_log(stdout, logger=logger)
-
- save_path_list.append(save_path)
- if verbose:
- print_log(f"Video clip saved to '{save_path}'", logger=logger)
-
- return save_path_list
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument("meta_path", type=str)
- parser.add_argument("--save_dir", type=str)
- parser.add_argument("--min_seconds", type=float, default=None,
- help='if not None, clip shorter than min_seconds is ignored')
- parser.add_argument("--max_seconds", type=float, default=None,
- help='if not None, clip longer than max_seconds is truncated')
- parser.add_argument("--target_fps", type=int, default=30, help='target fps of clips')
- parser.add_argument("--shorter_size", type=int, default=720, help='resize the shorter size by keeping ratio')
-
- args = parser.parse_args()
- return args
-
-
-def main():
- args = parse_args()
-
- save_dir = args.save_dir
- os.makedirs(save_dir, exist_ok=True)
-
- # create logger
- log_dir = os.path.dirname(save_dir)
- log_name = os.path.basename(save_dir)
- timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
- log_path = os.path.join(log_dir, f"{log_name}_{timestamp}.log")
- logger = MMLogger.get_instance(log_name, log_file=log_path)
- # logger = None
-
- # initialize pandarallel
- pandarallel.initialize(progress_bar=True)
- process_single_row_partial = partial(process_single_row, args=args, log_name=log_name)
-
- # process
- meta = pd.read_csv(args.meta_path)
- meta.parallel_apply(process_single_row_partial, axis=1)
-
-
-if __name__ == "__main__":
- main()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/scene_detect.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/scene_detect.py
deleted file mode 100644
index eb7b003b5ebf932840fa1f038d0c3050273e81b3..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scene_cut/scene_detect.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import argparse
-import os
-
-import numpy as np
-import pandas as pd
-from pandarallel import pandarallel
-from scenedetect import AdaptiveDetector, detect
-from tqdm import tqdm
-
-tqdm.pandas()
-
-
-def process_single_row(row):
- # On Windows, the scenedetect imports may need to be repeated inside this worker function:
- # from scenedetect import detect, ContentDetector, AdaptiveDetector
-
- video_path = row["path"]
-
- detector = AdaptiveDetector(
- adaptive_threshold=3.0,
- # luma_only=True,
- )
- # detector = ContentDetector()
- # TODO: catch error here
- try:
- scene_list = detect(video_path, detector, start_in_scene=True)
- timestamp = [(s.get_timecode(), t.get_timecode()) for s, t in scene_list]
- return True, str(timestamp)
- except Exception as e:
- print(f"Video '{video_path}' with error {e}")
- return False, ""
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument("meta_path", type=str)
-
- args = parser.parse_args()
- return args
-
-
-def main():
- args = parse_args()
- meta_path = args.meta_path
-
- pandarallel.initialize(progress_bar=True)
-
- meta = pd.read_csv(meta_path)
- ret = meta.parallel_apply(process_single_row, axis=1)
-
- succ, timestamps = list(zip(*ret))
- meta["timestamp"] = timestamps
- meta = meta[np.array(succ)]
-
- wo_ext, ext = os.path.splitext(meta_path)
- out_path = f"{wo_ext}_timestamp{ext}"
- meta.to_csv(out_path, index=False)
- print(f"New meta (shape={meta.shape}) with timestamp saved to '{out_path}'.")
-
-
-if __name__ == "__main__":
- main()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scenedetect/README.md b/PyTorch/built-in/mm/OpenSora1.1/tools/scenedetect/README.md
deleted file mode 100644
index 8052733739109237ea620a1983ba31b7473fee92..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scenedetect/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# Scene Detection and Video Split
-
-Raw videos from the Internet may be too long for training.
-Thus, we detect scenes in raw videos and split them into short clips based on the scenes.
-First, install the required video processing packages.
-```bash
-pip install scenedetect moviepy opencv-python
-```
-Then run `scene_detect.py`, which also provides a `multiprocessing` variant for faster processing. Don't forget to specify your own dataset paths inside the script.
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scenedetect/scene_detect.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scenedetect/scene_detect.py
deleted file mode 100644
index c46e59d5abce24575d62a3e3bdffb2aed49efa0b..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scenedetect/scene_detect.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import os
-from multiprocessing import Pool
-
-from mmengine.logging import MMLogger
-from scenedetect import ContentDetector, detect
-from tqdm import tqdm
-
-from opensora.utils.misc import get_timestamp
-
-from .utils import check_mp4_integrity, clone_folder_structure, iterate_files, split_video
-
-# config
-target_fps = 30 # int
-shorter_size = 512 # int
-min_seconds = 1 # float
-max_seconds = 5 # float
-assert max_seconds > min_seconds
-cfg = dict(
- target_fps=target_fps,
- min_seconds=min_seconds,
- max_seconds=max_seconds,
- shorter_size=shorter_size,
-)
-
-
-def process_folder(root_src, root_dst):
- # create logger
- folder_path_log = os.path.dirname(root_dst)
- log_name = os.path.basename(root_dst)
- timestamp = get_timestamp()
- log_path = os.path.join(folder_path_log, f"{log_name}_{timestamp}.log")
- logger = MMLogger.get_instance(log_name, log_file=log_path)
-
- # clone folder structure
- clone_folder_structure(root_src, root_dst)
-
- # all source videos
- mp4_list = [x for x in iterate_files(root_src) if x.endswith(".mp4")]
- mp4_list = sorted(mp4_list)
-
- for idx, sample_path in tqdm(enumerate(mp4_list)):
- folder_src = os.path.dirname(sample_path)
- folder_dst = os.path.join(root_dst, os.path.relpath(folder_src, root_src))
-
- # check src video integrity
- if not check_mp4_integrity(sample_path, logger=logger):
- continue
-
- # detect scenes
- scene_list = detect(sample_path, ContentDetector(), start_in_scene=True)
-
- # split scenes
- save_path_list = split_video(sample_path, scene_list, save_dir=folder_dst, **cfg, logger=logger)
-
- # check integrity of generated clips
- for x in save_path_list:
- check_mp4_integrity(x, logger=logger)
-
-
-def scene_detect():
- """detect & cut scenes using a single process
- Expected dataset structure:
- data/
- your_dataset/
- raw_videos/
- xxx.mp4
- yyy.mp4
-
- This function results in:
- data/
- your_dataset/
- raw_videos/
- xxx.mp4
- yyy.mp4
- clips/
- xxx_scene-0.mp4
- yyy_scene-0.mp4
- yyy_scene-1.mp4
- """
- # TODO: specify your dataset root
- root_src = f"./data/your_dataset/raw_videos"
- root_dst = f"./data/your_dataset/clips"
-
- process_folder(root_src, root_dst)
-
-
-def scene_detect_mp():
- """detect & cut scenes using multiple processes
- Expected dataset structure:
- data/
- your_dataset/
- raw_videos/
- split_0/
- xxx.mp4
- yyy.mp4
- split_1/
- xxx.mp4
- yyy.mp4
-
- This function results in:
- data/
- your_dataset/
- raw_videos/
- split_0/
- xxx.mp4
- yyy.mp4
- split_1/
- xxx.mp4
- yyy.mp4
- clips/
- split_0/
- xxx_scene-0.mp4
- yyy_scene-0.mp4
- split_1/
- xxx_scene-0.mp4
- yyy_scene-0.mp4
- yyy_scene-1.mp4
- """
- # TODO: specify your dataset root
- root_src = f"./data/your_dataset/raw_videos"
- root_dst = f"./data/your_dataset/clips"
-
- # TODO: specify your splits
- splits = ["split_0", "split_1"]
-
- # process folders
- root_src_list = [os.path.join(root_src, x) for x in splits]
- root_dst_list = [os.path.join(root_dst, x) for x in splits]
-
- with Pool(processes=len(splits)) as pool:
- pool.starmap(process_folder, list(zip(root_src_list, root_dst_list)))
-
-
-if __name__ == "__main__":
- # TODO: choose single process or multiprocessing
- scene_detect()
- # scene_detect_mp()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scenedetect/utils.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scenedetect/utils.py
deleted file mode 100644
index 19eae31463bc1464b887877856bcf5c49ccde923..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scenedetect/utils.py
+++ /dev/null
@@ -1,145 +0,0 @@
-import os
-import subprocess
-
-import cv2
-from imageio_ffmpeg import get_ffmpeg_exe
-from mmengine.logging import print_log
-from moviepy.editor import VideoFileClip
-from scenedetect import FrameTimecode
-
-
-def iterate_files(folder_path):
- # os.walk already traverses subdirectories recursively, so yielding the
- # files found at each level covers the whole tree.
- for root, _dirs, files in os.walk(folder_path):
- for file in files:
- yield os.path.join(root, file)
-
-
-def iterate_folders(folder_path):
- # os.walk already recurses, so each (root, dirs, _) pair yields every subfolder once.
- for root, dirs, _files in os.walk(folder_path):
- for subdir in dirs:
- yield os.path.join(root, subdir)
-
-
-def clone_folder_structure(root_src, root_dst, verbose=False):
- src_path_list = iterate_folders(root_src)
- src_relpath_list = [os.path.relpath(x, root_src) for x in src_path_list]
-
- os.makedirs(root_dst, exist_ok=True)
- dst_path_list = [os.path.join(root_dst, x) for x in src_relpath_list]
- for folder_path in dst_path_list:
- os.makedirs(folder_path, exist_ok=True)
- if verbose:
- print(f"Create folder: '{folder_path}'")
-
-
-def count_files(root, suffix=".mp4"):
- files_list = iterate_files(root)
- cnt = len([x for x in files_list if x.endswith(suffix)])
- return cnt
-
-
-def check_mp4_integrity(file_path, verbose=True, logger=None):
- try:
- VideoFileClip(file_path)
- if verbose:
- print_log(f"The MP4 file '{file_path}' is intact.", logger=logger)
- return True
- except Exception as e:
- if verbose:
- print_log(f"Error: {e}", logger=logger)
- print_log(f"The MP4 file '{file_path}' is not intact.", logger=logger)
- return False
-
-
-def count_frames(video_path):
- cap = cv2.VideoCapture(video_path)
-
- if not cap.isOpened():
- print(f"Error: Could not open video file '{video_path}'")
- return
-
- total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
- print(f"Total frames in the video '{video_path}': {total_frames}")
-
- cap.release()
-
-
-def split_video(
- sample_path,
- scene_list,
- save_dir,
- target_fps=30,
- min_seconds=1,
- max_seconds=10,
- shorter_size=512,
- verbose=False,
- logger=None,
-):
- FFMPEG_PATH = get_ffmpeg_exe()
-
- save_path_list = []
- for idx, scene in enumerate(scene_list):
- s, t = scene # FrameTimecode
- fps = s.framerate
- max_duration = FrameTimecode(timecode="00:00:00", fps=fps)
- max_duration.frame_num = round(fps * max_seconds)
- duration = min(max_duration, t - s)
- if duration.get_frames() < round(min_seconds * fps):
- continue
-
- # save path
- fname = os.path.basename(sample_path)
- fname_wo_ext = os.path.splitext(fname)[0]
- # TODO: fname pattern
- save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")
-
- # ffmpeg cmd
- cmd = [FFMPEG_PATH]
-
- # Uncomment the line below to limit ffmpeg output to error messages only.
- # cmd += ['-v', 'error']
-
- # input path
- cmd += ["-i", sample_path]
-
- # clip to cut
- cmd += ["-nostdin", "-y", "-ss", str(s.get_seconds()), "-t", str(duration.get_seconds())]
-
- # target fps
- # cmd += ['-vf', 'select=mod(n\,2)']
- cmd += ["-r", f"{target_fps}"]
-
- # aspect ratio
- cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"]
- # cmd += ['-vf', f"scale='if(gt(iw,ih),{shorter_size},trunc(ow/a/2)*2)':-2"]
-
- cmd += ["-map", "0", save_path]
-
- proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
- stdout, stderr = proc.communicate()
- if verbose:
- stdout = stdout.decode("utf-8")
- print_log(stdout, logger=logger)
-
- save_path_list.append(save_path)
- print_log(f"Video clip saved to '{save_path}'", logger=logger)
-
- return save_path_list
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/README.md b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/README.md
deleted file mode 100644
index a944d5cf79b993245a0b77108f4a83ce5ff2e8ac..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/README.md
+++ /dev/null
@@ -1,100 +0,0 @@
-# Scoring and Filtering
-
-- [Scoring and Filtering](#scoring-and-filtering)
- - [Aesthetic Score](#aesthetic-score)
- - [Optical Flow Score](#optical-flow-score)
- - [OCR](#ocr)
- - [Matching Score](#matching-score)
- - [Filtering](#filtering)
-
-## Aesthetic Score
-
-To evaluate the aesthetic quality of videos, we use the scoring model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs.
-
-The aesthetic score is between 1 and 10, where 5.5 can be considered the threshold for fair aesthetics and 6.5 for high aesthetics. Good text-to-image models can achieve a score of 7.0 or higher.
-
-For videos, we extract the first, middle, and last frames for evaluation. The script also supports images as input.
-The throughput of our code is ~1K videos/s on a single H800 GPU. It also supports running on multiple GPUs for further acceleration.
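-
-For a single image, the score is just the small MLP head applied to a normalized CLIP ViT-L/14 embedding. A minimal sketch, assuming the predictor weights are at `pretrained_models/aesthetic.pth` (downloaded below) and reusing the `MLP` head defined in `tools/scoring/aesthetic/inference.py`; `example.jpg` is a placeholder:
-
-```python
-import clip
-import torch
-import torch.nn.functional as F
-from PIL import Image
-
-from tools.scoring.aesthetic.inference import MLP  # the predictor head used by this repo
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-clip_model, preprocess = clip.load("ViT-L/14", device=device)
-
-mlp = MLP(768)  # 768 = CLIP ViT-L/14 embedding dimension
-mlp.load_state_dict(torch.load("pretrained_models/aesthetic.pth"))
-mlp.to(device).eval()
-
-with torch.no_grad():
-    image = preprocess(Image.open("example.jpg")).unsqueeze(0).to(device)
-    feat = F.normalize(clip_model.encode_image(image), p=2, dim=-1).float()
-    print(float(mlp(feat)))  # roughly 1-10, higher means more aesthetic
-```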
-
-First, install the required packages and download the scoring model to `./pretrained_models/aesthetic.pth`.
-```bash
-# pip install
-pip install git+https://github.com/openai/CLIP.git
-pip install decord
-
-# get pretrained model
-wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
-```
-
-Then, run the following command. **Make sure** the meta file has column `path` (path to the sample).
-```bash
-torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference /path/to/meta.csv --bs 1024 --num_workers 16
-```
-This will generate multiple part files, one per rank. Run `python -m tools.datasets.datautil /path/to/meta_aes_part*.csv --output /path/to/meta_aes.csv` to merge them.
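-
-The part files share the same columns, so they can also be merged with plain pandas if you prefer (the `datautil` command above may do additional cleanup); paths are illustrative:
-
-```python
-from glob import glob
-
-import pandas as pd
-
-parts = sorted(glob("/path/to/meta_aes_part*.csv"))
-merged = pd.concat([pd.read_csv(p) for p in parts], ignore_index=True)
-merged.to_csv("/path/to/meta_aes.csv", index=False)
-```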
-
-## Optical Flow Score
-
-Optical flow scores are used to assess the motion of a video. Higher optical flow scores indicate larger movement.
-We use the [UniMatch](https://github.com/autonomousvision/unimatch) model for this task.
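-
-UniMatch predicts a dense flow field between sampled frames. One plausible way to reduce such a field to a single motion score is its mean magnitude; this is only a sketch of the idea, and the exact reduction used by `tools/scoring/optical_flow/inference.py` may differ:
-
-```python
-import torch
-
-
-def flow_score(flow: torch.Tensor) -> float:
-    """flow: [2, H, W] tensor of per-pixel (dx, dy) displacements in pixels."""
-    magnitude = torch.sqrt(flow[0] ** 2 + flow[1] ** 2)
-    return magnitude.mean().item()
-```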
-
-First, download the pretrained model to `./pretrained_models/unimatch/`.
-```bash
-wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P ./pretrained_models/unimatch/
-```
-
-Then, run the following command. **Make sure** the meta file has column `path` (path to the sample).
-```bash
-torchrun --standalone --nproc_per_node 8 tools/scoring/optical_flow/inference.py /path/to/meta.csv
-```
-
-This should output `/path/to/meta_flow.csv` with column `flow`.
-
-## OCR
-Some videos consist of dense text scenes, such as news broadcasts and advertisements, which are not desired for training.
-We apply Optical Character Recognition (OCR) to detect text and drop samples with dense text. Here, we use
-the [DBNet++](https://arxiv.org/abs/2202.10304) model implemented by [MMOCR](https://github.com/open-mmlab/mmocr/).
-
-First, install [MMOCR](https://mmocr.readthedocs.io/en/dev-1.x/get_started/install.html).
-For reference, we installed the following package versions.
-```
-torch==2.0.1
-mmcv==2.0.1
-mmdet==3.1.0
-mmocr==1.0.1
-```
-
-Then, run the following command. **Make sure** the meta file has column `path` (path to the sample).
-```bash
-torchrun --standalone --nproc_per_node 8 tools/scoring/ocr/inference.py /path/to/meta.csv
-```
-This should output `/path/to/meta_ocr.csv` with column `ocr`, indicating the number of text regions with detection confidence > 0.3.
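-
-The `ocr` value is simply the number of predicted text instances whose detection confidence exceeds 0.3. Given a prediction from MMOCR's `model.test_step` (as used in `tools/scoring/ocr/inference.py`), the count is a one-liner; this is a sketch assuming a standard text-detection result object:
-
-```python
-def count_text_regions(pred, thr: float = 0.3) -> int:
-    # pred.pred_instances.scores holds per-region detection confidences
-    return int((pred.pred_instances.scores > thr).sum().item())
-```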
-
-
-## Matching Score
-
-Matching scores are calculated to evaluate the alignment between an image/video and its caption.
-Here, we use the [CLIP](https://github.com/openai/CLIP) model, which is trained on image-text pairs.
-We simply use the cosine similarity as the matching score.
-For videos, we extract the middle frame and compare it with the caption.
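-
-Concretely, the score is the cosine similarity between the CLIP embedding of that frame and the CLIP embedding of the caption (the inference script additionally multiplies it by CLIP's learned logit scale). A minimal sketch with an illustrative frame file and caption:
-
-```python
-import clip
-import torch
-import torch.nn.functional as F
-from PIL import Image
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model, preprocess = clip.load("ViT-L/14", device=device)
-
-image = preprocess(Image.open("middle_frame.jpg")).unsqueeze(0).to(device)
-text = clip.tokenize(["a cat playing with a ball"], truncate=True).to(device)
-
-with torch.no_grad():
-    feat_img = F.normalize(model.encode_image(image), dim=-1)
-    feat_text = F.normalize(model.encode_text(text), dim=-1)
-    match = (feat_img * feat_text).sum(dim=-1)  # cosine similarity
-print(float(match))
-```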
-
-First, install OpenAI CLIP.
-```bash
-pip install git+https://github.com/openai/CLIP.git
-```
-
-Then, run the following command. **Make sure** the meta file has column `path` (path to the sample) and `text` (caption of the sample).
-
-```bash
-torchrun --standalone --nproc_per_node 8 tools/scoring/matching/inference.py /path/to/meta.csv
-```
-
-This should output `/path/to/meta_match.csv` with column `match`. Higher matching scores indicate better image-text/video-text alignment.
-
-
-## Filtering
-Once scores are obtained, it is simple to filter samples based on them. Here is an example that removes
-samples with an aesthetic score < 5.0.
-```bash
-python -m tools.datasets.datautil /path/to/meta.csv --aesmin 5.0
-```
-This should output `/path/to/meta_aesmin5.0.csv`, keeping only rows with `aes` >= 5.0.
\ No newline at end of file
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/aesthetic/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/aesthetic/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/aesthetic/inference.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/aesthetic/inference.py
deleted file mode 100644
index a527859c3eb238195378297f6c7772851b579b75..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/aesthetic/inference.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# adapted from https://github.com/christophschuhmann/improved-aesthetic-predictor/blob/main/simple_inference.py
-import argparse
-from datetime import timedelta
-
-import clip
-import numpy as np
-import pandas as pd
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-import torch.nn.functional as F
-from colossalai.utils import set_seed
-from einops import rearrange
-from PIL import Image
-from torchvision.datasets.folder import pil_loader
-from tqdm import tqdm
-
-from tools.datasets.utils import extract_frames, is_video
-
-try:
- from torchvision.transforms import InterpolationMode
-
- BICUBIC = InterpolationMode.BICUBIC
-except ImportError:
- BICUBIC = Image.BICUBIC
-
-
-NUM_FRAMES_POINTS = {
- 1: (0.5,),
- 2: (0.25, 0.5),
- 3: (0.1, 0.5, 0.9),
-}
-
-
-class VideoTextDataset(torch.utils.data.Dataset):
- def __init__(self, csv_path, transform=None, num_frames=3):
- self.csv_path = csv_path
- self.data = pd.read_csv(csv_path)
- self.transform = transform
- self.points = NUM_FRAMES_POINTS[num_frames]
-
- def getitem(self, index):
- sample = self.data.iloc[index]
- path = sample["path"]
- if not is_video(path):
- images = [pil_loader(path)]
- else:
- num_frames = None
- if "num_frames" in sample:
- num_frames = sample["num_frames"]
- images = extract_frames(sample["path"], points=self.points, backend="opencv", num_frames=num_frames)
- images = [self.transform(img) for img in images]
- images = torch.stack(images)
- ret = dict(index=index, images=images)
- return ret
-
- def __len__(self):
- return len(self.data)
-
- def __getitem__(self, index):
- return self.getitem(index)
-
-
-class MLP(nn.Module):
- def __init__(self, input_size):
- super().__init__()
- self.input_size = input_size
- self.layers = nn.Sequential(
- nn.Linear(self.input_size, 1024),
- nn.Dropout(0.2),
- nn.Linear(1024, 128),
- nn.Dropout(0.2),
- nn.Linear(128, 64),
- nn.Dropout(0.1),
- nn.Linear(64, 16),
- nn.Linear(16, 1),
- )
-
- def forward(self, x):
- return self.layers(x)
-
-
-class AestheticScorer(nn.Module):
- def __init__(self, input_size, device):
- super().__init__()
- self.mlp = MLP(input_size)
- self.mlp.load_state_dict(torch.load("pretrained_models/aesthetic.pth"))
- self.clip, self.preprocess = clip.load("ViT-L/14", device=device)
-
- self.eval()
- self.to(device)
-
- def forward(self, x):
- image_features = self.clip.encode_image(x)
- image_features = F.normalize(image_features, p=2, dim=-1).float()
- return self.mlp(image_features)
-
-
-@torch.inference_mode()
-def main(args):
- dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
- torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
- set_seed(1024)
- rank = dist.get_rank()
- world_size = dist.get_world_size()
-
- output_file = args.input.replace(".csv", f"_aes_part{rank}.csv")
-
- # build model
- device = "cuda" if torch.cuda.is_available() else "cpu"
- model = AestheticScorer(768, device)
- preprocess = model.preprocess
-
- # build dataset
- dataset = VideoTextDataset(args.input, transform=preprocess, num_frames=args.num_frames)
- sampler = torch.utils.data.distributed.DistributedSampler(
- dataset=dataset, num_replicas=world_size, rank=rank, shuffle=False
- )
-
- dataloader = torch.utils.data.DataLoader(
- dataset,
- sampler=sampler,
- batch_size=args.bs,
- shuffle=False,
- num_workers=args.num_workers,
- pin_memory=True,
- prefetch_factor=args.prefetch_factor if args.num_workers > 0 else None,
- )
-
- # compute aesthetic scores
- dataset.data["aes"] = np.nan
-
- with tqdm(dataloader, position=rank, desc=f"Data Parallel Rank {rank}") as t:
- for idx, batch in enumerate(t):
- image_indices = batch["index"]
- images = batch["images"].to(device, non_blocking=True)
- B = images.shape[0]
- images = rearrange(images, "b p c h w -> (b p) c h w")
-
- # compute score
- scores = model(images)
- scores = rearrange(scores, "(b p) 1 -> b p", b=B)
- scores = scores.mean(dim=1)
- scores_np = scores.to(torch.float32).cpu().numpy()
-
- # assign the score
- dataset.data.loc[image_indices, "aes"] = scores_np
-
- # wait for all ranks to finish data processing
- dist.barrier()
-
- # exclude rows whose aes is nan and save file
- dataset.data = dataset.data[dataset.data["aes"] > 0]
- dataset.data.to_csv(output_file, index=False)
- print(f"New meta with aesthetic scores saved to '{output_file}'.")
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("input", type=str, help="Path to the input CSV file")
- parser.add_argument("--bs", type=int, default=1024, help="Batch size")
- parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
- parser.add_argument("--accumulate", type=int, default=1, help="batch to accumulate")
- parser.add_argument("--prefetch_factor", type=int, default=2, help="Prefetch factor")
- parser.add_argument("--num_frames", type=int, default=3, help="Number of frames to extract")
- args = parser.parse_args()
-
- main(args)
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/matching/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/matching/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/matching/inference.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/matching/inference.py
deleted file mode 100644
index 7bedef1cfe96f4baa95bce7dee109739aaab9a90..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/matching/inference.py
+++ /dev/null
@@ -1,127 +0,0 @@
-import argparse
-import os
-
-import clip
-import colossalai
-import numpy as np
-import pandas as pd
-import torch
-import torch.distributed as dist
-import torch.nn.functional as F
-from torch.utils.data import DataLoader, DistributedSampler
-from torchvision.datasets.folder import pil_loader
-from tqdm import tqdm
-
-from tools.datasets.utils import extract_frames, is_video
-
-
-class VideoTextDataset(torch.utils.data.Dataset):
- def __init__(self, meta_path, transform):
- self.meta_path = meta_path
- self.meta = pd.read_csv(meta_path)
- self.transform = transform
-
- def __getitem__(self, index):
- row = self.meta.iloc[index]
- path = row["path"]
-
- if is_video(path):
- img = extract_frames(path, points=[0.5], backend="opencv")[0]
- else:
- img = pil_loader(path)
-
- img = self.transform(img)
-
- text = row["text"]
- text = clip.tokenize(text, truncate=True).squeeze()
-
- return img, text, index
-
- def __len__(self):
- return len(self.meta)
-
-
-def merge_scores(gathered_list: list, meta: pd.DataFrame):
- # reorder
- indices_list = list(map(lambda x: x[0], gathered_list))
- scores_list = list(map(lambda x: x[1], gathered_list))
- flat_indices = []
- for x in zip(*indices_list):
- flat_indices.extend(x)
- flat_scores = []
- for x in zip(*scores_list):
- flat_scores.extend(x)
- flat_indices = np.array(flat_indices)
- flat_scores = np.array(flat_scores)
- # filter duplicates
- unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)
- meta.loc[unique_indices, "match"] = flat_scores[unique_indices_idx]
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
- parser.add_argument("--bs", type=int, default=16, help="Batch size")
- parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
- args = parser.parse_args()
- return args
-
-
-def main():
- colossalai.launch_from_torch({})
- args = parse_args()
-
- meta_path = args.meta_path
- wo_ext, ext = os.path.splitext(meta_path)
- out_path = f"{wo_ext}_match{ext}"
-
- # build model
- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
- model, preprocess = clip.load("ViT-L/14", device=device)
- logit_scale = model.logit_scale.exp().item()
-
- # build dataset
- dataset = VideoTextDataset(meta_path=meta_path, transform=preprocess)
- dataloader = DataLoader(
- dataset,
- batch_size=args.bs,
- num_workers=args.num_workers,
- sampler=DistributedSampler(
- dataset,
- num_replicas=dist.get_world_size(),
- rank=dist.get_rank(),
- shuffle=False,
- drop_last=False,
- ),
- )
-
- # compute scores
- dataset.meta["match"] = np.nan
- indices_list = []
- scores_list = []
- model.eval()
- for imgs, text, indices in tqdm(dataloader, disable=dist.get_rank() != 0):
- imgs = imgs.to(device)
- text = text.to(device)
-
- with torch.no_grad():
- feat_img = model.encode_image(imgs)
- feat_text = model.encode_text(text)
-
- feat_img = F.normalize(feat_img, dim=1)
- feat_text = F.normalize(feat_text, dim=1)
- clip_scores = logit_scale * (feat_img * feat_text).sum(dim=1)
- clip_scores = clip_scores.cpu().tolist()
- indices_list.extend(indices)
- scores_list.extend(clip_scores)
-
- gathered_list = [None] * dist.get_world_size()
- dist.all_gather_object(gathered_list, (indices_list, scores_list))
- if dist.get_rank() == 0:
- merge_scores(gathered_list, dataset.meta)
- dataset.meta.to_csv(out_path, index=False)
- print(f"New meta with matching scores saved to '{out_path}'.")
-
-
-if __name__ == "__main__":
- main()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/ocr/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/ocr/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/ocr/dbnetpp.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/ocr/dbnetpp.py
deleted file mode 100644
index e313fd4a5fc9ed8c073dd879a849b078966366f0..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/ocr/dbnetpp.py
+++ /dev/null
@@ -1,64 +0,0 @@
-model = dict(
- type='DBNet',
- backbone=dict(
- type='CLIPResNet',
- depth=50,
- num_stages=4,
- out_indices=(0, 1, 2, 3),
- frozen_stages=-1,
- norm_cfg=dict(type='BN', requires_grad=True),
- norm_eval=False,
- style='pytorch',
- dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
- # init_cfg=dict(
- # type='Pretrained',
- # checkpoint='https://download.openmmlab.com/mmocr/backbone/resnet50-oclip-7ba0c533.pth'),
- stage_with_dcn=(False, True, True, True),
- ),
- neck=dict(
- type='FPNC',
- in_channels=[256, 512, 1024, 2048],
- lateral_channels=256,
- asf_cfg=dict(attention_type='ScaleChannelSpatial'),
- ),
- det_head=dict(
- type='DBHead',
- in_channels=256,
- module_loss=dict(type='DBModuleLoss'),
- postprocessor=dict(
- type='DBPostprocessor', text_repr_type='quad',
- epsilon_ratio=0.002,
- ),
- ),
- data_preprocessor=dict(
- type='TextDetDataPreprocessor',
- mean=[123.675, 116.28, 103.53],
- std=[58.395, 57.12, 57.375],
- bgr_to_rgb=True,
- pad_size_divisor=32,
- ),
- init_cfg=dict(
- type='Pretrained',
- checkpoint='https://download.openmmlab.com/mmocr/textdet/dbnetpp/'
- 'dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015/'
- 'dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015_20221101_124139-4ecb39ac.pth',
- )
-)
-
-test_pipeline = [
- # dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
- dict(type='Resize', scale=(4068, 1024), keep_ratio=True),
- dict(
- type='PackTextDetInputs',
- # meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor'),
- meta_keys=('img_shape', 'scale_factor'),
- )
-]
-
-# Visualization
-vis_backends = [dict(type='LocalVisBackend')]
-visualizer = dict(
- type='TextDetLocalVisualizer',
- name='visualizer',
- vis_backends=vis_backends,
-)
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/ocr/inference.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/ocr/inference.py
deleted file mode 100644
index 6ab9cc0e2e18a77dd5c1148a1b310c04ddbd9955..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/ocr/inference.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import argparse
-import os
-
-import numpy as np
-import pandas as pd
-import torch
-import torch.distributed as dist
-import torch.nn.functional as F
-from torchvision.transforms import CenterCrop, Resize  # Compose below comes from mmengine.dataset
-from torch.utils.data import DataLoader, DistributedSampler
-from torchvision.datasets.folder import pil_loader
-from tqdm import tqdm
-
-import colossalai
-from mmengine import Config
-from mmengine.registry import DefaultScope
-from mmengine.dataset import Compose, default_collate
-from mmocr.registry import MODELS
-from mmocr.datasets import PackTextDetInputs
-
-from tools.datasets.utils import extract_frames, is_video
-
-
-def merge_scores(gathered_list: list, meta: pd.DataFrame):
- # reorder
- indices_list = list(map(lambda x: x[0], gathered_list))
- scores_list = list(map(lambda x: x[1], gathered_list))
- flat_indices = []
- for x in zip(*indices_list):
- flat_indices.extend(x)
- flat_scores = []
- for x in zip(*scores_list):
- flat_scores.extend(x)
- flat_indices = np.array(flat_indices)
- flat_scores = np.array(flat_scores)
- # filter duplicates
- unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)
- meta.loc[unique_indices, "ocr"] = flat_scores[unique_indices_idx]
-
-
-class VideoTextDataset(torch.utils.data.Dataset):
- def __init__(self, meta_path, transform):
- self.meta_path = meta_path
- self.meta = pd.read_csv(meta_path)
- # NOTE: the transform passed in (cfg.test_pipeline) is not used here; a fixed
- # Resize + CenterCrop pipeline is applied instead.
- self.transform = Compose([
- Resize(1024),
- CenterCrop(1024),
- ])
- self.formatting = PackTextDetInputs(meta_keys=['scale_factor'])
-
- def __getitem__(self, index):
- row = self.meta.iloc[index]
- path = row["path"]
-
- if is_video(path):
- img = extract_frames(path, frame_inds=[10], backend="opencv")[0]
- else:
- img = pil_loader(path)
-
- img = self.transform(img)
- img_array = np.array(img)[:, :, ::-1].copy() # bgr
- results = {
- 'img': img_array,
- 'scale_factor': 1.0,
- # 'img_shape': img_array.shape[-2],
- # 'ori_shape': img_array.shape[-2],
- }
- results = self.formatting(results)
- results['index'] = index
-
- return results
-
- def __len__(self):
- return len(self.meta)
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
- parser.add_argument("--bs", type=int, default=16, help="Batch size")
- parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
- args = parser.parse_args()
-
- return args
-
-
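-# A sketch of a typical launch (assumption: a torchrun-style launcher, since colossalai.launch_from_torch
-# reads the torch.distributed environment variables), e.g.:
-#   torchrun --standalone --nproc_per_node 8 -m tools.scoring.ocr.inference /path/to/meta.csv --bs 16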
-def main():
- args = parse_args()
- cfg = Config.fromfile('./tools/scoring/ocr/dbnetpp.py')
-
- meta_path = args.meta_path
-
- colossalai.launch_from_torch({})
- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
- DefaultScope.get_instance('ocr', scope_name='mmocr') # use mmocr Registry as default
-
- # build model
- model = MODELS.build(cfg.model)
- model.init_weights()
- model.to(device) # set data_preprocessor._device
- print('==> Model built.')
-
- # build dataset
- transform = Compose(cfg.test_pipeline)
- dataset = VideoTextDataset(meta_path=meta_path, transform=transform)
- dataloader = DataLoader(
- dataset,
- batch_size=args.bs,
- num_workers=args.num_workers,
- sampler=DistributedSampler(
- dataset,
- num_replicas=dist.get_world_size(),
- rank=dist.get_rank(),
- shuffle=False,
- drop_last=False,
- ),
- collate_fn=default_collate,
- )
- print('==> Dataloader built.')
-
- # compute scores
- dataset.meta["ocr"] = np.nan
- indices_list = []
- scores_list = []
- model.eval()
- for data in tqdm(dataloader, disable=dist.get_rank() != 0):
- indices_i = data['index']
- indices_list.extend(indices_i.tolist())
- del data['index']
-
- pred = model.test_step(data) # this line will cast data to device
-
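-        # OCR score for each sample = number of detected text instances with confidence > 0.3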
- num_texts_i = [(x.pred_instances.scores > 0.3).sum().item() for x in pred]
- scores_list.extend(num_texts_i)
-
- gathered_list = [None] * dist.get_world_size()
- dist.all_gather_object(gathered_list, (indices_list, scores_list))
-
- if dist.get_rank() == 0:
- merge_scores(gathered_list, dataset.meta)
-
- wo_ext, ext = os.path.splitext(meta_path)
- out_path = f"{wo_ext}_ocr{ext}"
- dataset.meta.to_csv(out_path, index=False)
- print(f"New meta (shape={dataset.meta.shape}) with ocr results saved to '{out_path}'.")
-
-
-if __name__ == '__main__':
- main()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/inference.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/inference.py
deleted file mode 100644
index 170b0766f582f874cf970ff28b3cee216dcea1cb..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/inference.py
+++ /dev/null
@@ -1,152 +0,0 @@
-import argparse
-import os
-
-import colossalai
-import numpy as np
-import pandas as pd
-import torch
-import torch.distributed as dist
-import torch.nn.functional as F
-from einops import rearrange
-from torch.utils.data import DataLoader, DistributedSampler
-from torchvision.transforms.functional import pil_to_tensor
-from tqdm import tqdm
-
-from tools.datasets.utils import extract_frames
-
-from .unimatch import UniMatch
-
-
-def merge_scores(gathered_list: list, meta: pd.DataFrame):
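-    """Same gather/interleave/deduplicate logic as the OCR script, writing the "flow" column instead."""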
- # reorder
- indices_list = list(map(lambda x: x[0], gathered_list))
- flow_scores_list = list(map(lambda x: x[1], gathered_list))
- flat_indices = []
- for x in zip(*indices_list):
- flat_indices.extend(x)
- flat_flow_scores = []
- for x in zip(*flow_scores_list):
- flat_flow_scores.extend(x)
- flat_indices = np.array(flat_indices)
- flat_flow_scores = np.array(flat_flow_scores)
- # filter duplicates
- unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)
- meta.loc[unique_indices, "flow"] = flat_flow_scores[unique_indices_idx]
-
-
-class VideoTextDataset(torch.utils.data.Dataset):
- def __init__(self, meta_path, frame_inds=[0, 10, 20, 30]):
- self.meta_path = meta_path
- self.meta = pd.read_csv(meta_path)
- self.frame_inds = frame_inds
-
- def __getitem__(self, index):
- row = self.meta.iloc[index]
- images = extract_frames(row["path"], frame_inds=self.frame_inds, backend="opencv")
-
- # transform
- images = torch.stack([pil_to_tensor(x) for x in images]) # shape: [N, C, H, W]; dtype: torch.uint8
- images = images.float()
- H, W = images.shape[-2:]
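-        # transpose portrait clips so the long side is horizontal, then resize every frame to 320x576,
-        # the training resolution encoded in the pretrained checkpoint name (train320x576)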
- if H > W:
- images = rearrange(images, "N C H W -> N C W H")
- images = F.interpolate(images, size=(320, 576), mode="bilinear", align_corners=True)
-
- return images, index
-
- def __len__(self):
- return len(self.meta)
-
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
- parser.add_argument("--bs", type=int, default=4, help="Batch size")
- parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
- args = parser.parse_args()
- return args
-
-
-def main():
- torch.backends.cudnn.deterministic = True
- torch.backends.cudnn.benchmark = False
- colossalai.launch_from_torch({})
- args = parse_args()
-
- meta_path = args.meta_path
- wo_ext, ext = os.path.splitext(meta_path)
- out_path = f"{wo_ext}_flow{ext}"
-
- # build model
- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
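-    # the UniMatch hyperparameters below are chosen to match the gmflow-scale2-regrefine6 checkpoint loaded afterwards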
- model = UniMatch(
- feature_channels=128,
- num_scales=2,
- upsample_factor=4,
- num_head=1,
- ffn_dim_expansion=4,
- num_transformer_layers=6,
- reg_refine=True,
- task="flow",
- ).eval()
- ckpt = torch.load("./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth")
- model.load_state_dict(ckpt["model"])
- model = model.to(device)
- # model = torch.nn.DataParallel(model)
-
- # build dataset
- dataset = VideoTextDataset(meta_path=meta_path, frame_inds=[0, 10, 20, 30])
- dataloader = DataLoader(
- dataset,
- batch_size=args.bs,
- num_workers=args.num_workers,
- sampler=DistributedSampler(
- dataset,
- num_replicas=dist.get_world_size(),
- rank=dist.get_rank(),
- shuffle=False,
- drop_last=False,
- ),
- )
-
- # compute optical flow scores
- dataset.meta["flow"] = np.nan
- indices_list = []
- flow_scores_list = []
- for images, indices in tqdm(dataloader, disable=dist.get_rank() != 0):
- images = images.to(device)
- B = images.shape[0]
-
- batch_0 = rearrange(images[:, :-1], "B N C H W -> (B N) C H W").contiguous()
- batch_1 = rearrange(images[:, 1:], "B N C H W -> (B N) C H W").contiguous()
-
- with torch.no_grad():
- res = model(
- batch_0,
- batch_1,
- attn_type="swin",
- attn_splits_list=[2, 8],
- corr_radius_list=[-1, 4],
- prop_radius_list=[-1, 1],
- num_reg_refine=6,
- task="flow",
- pred_bidir_flow=False,
- )
- flow_maps = res["flow_preds"][-1].cpu() # [B * (N-1), 2, H, W]
- flow_maps = rearrange(flow_maps, "(B N) C H W -> B N H W C", B=B)
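-            # flow score = mean absolute flow over all frame pairs, pixels and both x/y components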
- flow_scores = flow_maps.abs().mean(dim=[1, 2, 3, 4])
- flow_scores = flow_scores.tolist()
-
- indices_list.extend(indices)
- flow_scores_list.extend(flow_scores)
-
- gathered_list = [None] * dist.get_world_size()
- dist.all_gather_object(gathered_list, (indices_list, flow_scores_list))
- if dist.get_rank() == 0:
- merge_scores(gathered_list, dataset.meta)
- dataset.meta.to_csv(out_path, index=False)
- print(f"New meta with optical flow scores saved to '{out_path}'.")
-
-
-if __name__ == "__main__":
- main()
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/__init__.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/__init__.py
deleted file mode 100644
index c1f4eb2f58e4f32026f301c80331f536918fae7a..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .unimatch import UniMatch
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/attention.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/attention.py
deleted file mode 100644
index 23fb9048a07fcbd5228f42de4cca0a0f5ed9b60b..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/attention.py
+++ /dev/null
@@ -1,280 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from .utils import merge_splits, merge_splits_1d, split_feature, split_feature_1d
-
-
-def single_head_full_attention(q, k, v):
- # q, k, v: [B, L, C]
- assert q.dim() == k.dim() == v.dim() == 3
-
- scores = torch.matmul(q, k.permute(0, 2, 1)) / (q.size(2) ** 0.5) # [B, L, L]
- attn = torch.softmax(scores, dim=2) # [B, L, L]
- out = torch.matmul(attn, v) # [B, L, C]
-
- return out
-
-
-def single_head_full_attention_1d(
- q,
- k,
- v,
- h=None,
- w=None,
-):
- # q, k, v: [B, L, C]
-
- assert h is not None and w is not None
- assert q.size(1) == h * w
-
- b, _, c = q.size()
-
- q = q.view(b, h, w, c) # [B, H, W, C]
- k = k.view(b, h, w, c)
- v = v.view(b, h, w, c)
-
- scale_factor = c**0.5
-
- scores = torch.matmul(q, k.permute(0, 1, 3, 2)) / scale_factor # [B, H, W, W]
-
- attn = torch.softmax(scores, dim=-1)
-
- out = torch.matmul(attn, v).view(b, -1, c) # [B, H*W, C]
-
- return out
-
-
-def single_head_split_window_attention(
- q,
- k,
- v,
- num_splits=1,
- with_shift=False,
- h=None,
- w=None,
- attn_mask=None,
-):
- # ref: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
- # q, k, v: [B, L, C]
- assert q.dim() == k.dim() == v.dim() == 3
-
- assert h is not None and w is not None
- assert q.size(1) == h * w
-
- b, _, c = q.size()
-
- b_new = b * num_splits * num_splits
-
- window_size_h = h // num_splits
- window_size_w = w // num_splits
-
- q = q.view(b, h, w, c) # [B, H, W, C]
- k = k.view(b, h, w, c)
- v = v.view(b, h, w, c)
-
- scale_factor = c**0.5
-
- if with_shift:
- assert attn_mask is not None # compute once
- shift_size_h = window_size_h // 2
- shift_size_w = window_size_w // 2
-
- q = torch.roll(q, shifts=(-shift_size_h, -shift_size_w), dims=(1, 2))
- k = torch.roll(k, shifts=(-shift_size_h, -shift_size_w), dims=(1, 2))
- v = torch.roll(v, shifts=(-shift_size_h, -shift_size_w), dims=(1, 2))
-
- q = split_feature(q, num_splits=num_splits, channel_last=True) # [B*K*K, H/K, W/K, C]
- k = split_feature(k, num_splits=num_splits, channel_last=True)
- v = split_feature(v, num_splits=num_splits, channel_last=True)
-
- scores = (
- torch.matmul(q.view(b_new, -1, c), k.view(b_new, -1, c).permute(0, 2, 1)) / scale_factor
- ) # [B*K*K, H/K*W/K, H/K*W/K]
-
- if with_shift:
- scores += attn_mask.repeat(b, 1, 1)
-
- attn = torch.softmax(scores, dim=-1)
-
- out = torch.matmul(attn, v.view(b_new, -1, c)) # [B*K*K, H/K*W/K, C]
-
- out = merge_splits(
- out.view(b_new, h // num_splits, w // num_splits, c), num_splits=num_splits, channel_last=True
- ) # [B, H, W, C]
-
- # shift back
- if with_shift:
- out = torch.roll(out, shifts=(shift_size_h, shift_size_w), dims=(1, 2))
-
- out = out.view(b, -1, c)
-
- return out
-
-
-def single_head_split_window_attention_1d(
- q,
- k,
- v,
- relative_position_bias=None,
- num_splits=1,
- with_shift=False,
- h=None,
- w=None,
- attn_mask=None,
-):
- # q, k, v: [B, L, C]
-
- assert h is not None and w is not None
- assert q.size(1) == h * w
-
- b, _, c = q.size()
-
- b_new = b * num_splits * h
-
- window_size_w = w // num_splits
-
- q = q.view(b * h, w, c) # [B*H, W, C]
- k = k.view(b * h, w, c)
- v = v.view(b * h, w, c)
-
- scale_factor = c**0.5
-
- if with_shift:
- assert attn_mask is not None # compute once
- shift_size_w = window_size_w // 2
-
- q = torch.roll(q, shifts=-shift_size_w, dims=1)
- k = torch.roll(k, shifts=-shift_size_w, dims=1)
- v = torch.roll(v, shifts=-shift_size_w, dims=1)
-
- q = split_feature_1d(q, num_splits=num_splits) # [B*H*K, W/K, C]
- k = split_feature_1d(k, num_splits=num_splits)
- v = split_feature_1d(v, num_splits=num_splits)
-
- scores = (
- torch.matmul(q.view(b_new, -1, c), k.view(b_new, -1, c).permute(0, 2, 1)) / scale_factor
- ) # [B*H*K, W/K, W/K]
-
- if with_shift:
- # attn_mask: [K, W/K, W/K]
- scores += attn_mask.repeat(b * h, 1, 1) # [B*H*K, W/K, W/K]
-
- attn = torch.softmax(scores, dim=-1)
-
- out = torch.matmul(attn, v.view(b_new, -1, c)) # [B*H*K, W/K, C]
-
- out = merge_splits_1d(out, h, num_splits=num_splits) # [B, H, W, C]
-
- # shift back
- if with_shift:
- out = torch.roll(out, shifts=shift_size_w, dims=2)
-
- out = out.view(b, -1, c)
-
- return out
-
-
-class SelfAttnPropagation(nn.Module):
- """
- flow propagation with self-attention on feature
- query: feature0, key: feature0, value: flow
- """
-
- def __init__(
- self,
- in_channels,
- **kwargs,
- ):
- super(SelfAttnPropagation, self).__init__()
-
- self.q_proj = nn.Linear(in_channels, in_channels)
- self.k_proj = nn.Linear(in_channels, in_channels)
-
- for p in self.parameters():
- if p.dim() > 1:
- nn.init.xavier_uniform_(p)
-
- def forward(
- self,
- feature0,
- flow,
- local_window_attn=False,
- local_window_radius=1,
- **kwargs,
- ):
- # q, k: feature [B, C, H, W], v: flow [B, 2, H, W]
- if local_window_attn:
- return self.forward_local_window_attn(feature0, flow, local_window_radius=local_window_radius)
-
- b, c, h, w = feature0.size()
-
- query = feature0.view(b, c, h * w).permute(0, 2, 1) # [B, H*W, C]
-
-        # NOTE: key is projected from the already-projected query below (k_proj(q_proj(x)) rather than
-        # k_proj(x)). This was noticed while cleaning up the code, but it does not affect performance:
-        # composing two linear projections is still a linear projection, so the key's two matrices can be
-        # merged. It is left unchanged to avoid re-training all released models.
-        query = self.q_proj(query)  # [B, H*W, C]
-        key = self.k_proj(query)  # [B, H*W, C]
-
- value = flow.view(b, flow.size(1), h * w).permute(0, 2, 1) # [B, H*W, 2]
-
- scores = torch.matmul(query, key.permute(0, 2, 1)) / (c**0.5) # [B, H*W, H*W]
- prob = torch.softmax(scores, dim=-1)
-
- out = torch.matmul(prob, value) # [B, H*W, 2]
- out = out.view(b, h, w, value.size(-1)).permute(0, 3, 1, 2) # [B, 2, H, W]
-
- return out
-
- def forward_local_window_attn(
- self,
- feature0,
- flow,
- local_window_radius=1,
- ):
- assert flow.size(1) == 2 or flow.size(1) == 1 # flow or disparity or depth
- assert local_window_radius > 0
-
- b, c, h, w = feature0.size()
-
- value_channel = flow.size(1)
-
- feature0_reshape = self.q_proj(feature0.view(b, c, -1).permute(0, 2, 1)).reshape(
- b * h * w, 1, c
- ) # [B*H*W, 1, C]
-
- kernel_size = 2 * local_window_radius + 1
-
- feature0_proj = self.k_proj(feature0.view(b, c, -1).permute(0, 2, 1)).permute(0, 2, 1).reshape(b, c, h, w)
-
- feature0_window = F.unfold(
- feature0_proj, kernel_size=kernel_size, padding=local_window_radius
-        ) # [B, C*(2R+1)^2, H*W]
-
- feature0_window = (
- feature0_window.view(b, c, kernel_size**2, h, w)
- .permute(0, 3, 4, 1, 2)
- .reshape(b * h * w, c, kernel_size**2)
- ) # [B*H*W, C, (2R+1)^2]
-
-        flow_window = F.unfold(flow, kernel_size=kernel_size, padding=local_window_radius) # [B, 2*(2R+1)^2, H*W]
-
- flow_window = (
- flow_window.view(b, value_channel, kernel_size**2, h, w)
- .permute(0, 3, 4, 2, 1)
- .reshape(b * h * w, kernel_size**2, value_channel)
- ) # [B*H*W, (2R+1)^2, 2]
-
- scores = torch.matmul(feature0_reshape, feature0_window) / (c**0.5) # [B*H*W, 1, (2R+1)^2]
-
- prob = torch.softmax(scores, dim=-1)
-
- out = (
- torch.matmul(prob, flow_window).view(b, h, w, value_channel).permute(0, 3, 1, 2).contiguous()
- ) # [B, 2, H, W]
-
- return out
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/backbone.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/backbone.py
deleted file mode 100644
index 5c2cc19f7dae5013da0c6a22d50e4bfabfed8ee6..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/backbone.py
+++ /dev/null
@@ -1,128 +0,0 @@
-import torch.nn as nn
-
-from .trident_conv import MultiScaleTridentConv
-
-
-class ResidualBlock(nn.Module):
- def __init__(
- self,
- in_planes,
- planes,
- norm_layer=nn.InstanceNorm2d,
- stride=1,
- dilation=1,
- ):
- super(ResidualBlock, self).__init__()
-
- self.conv1 = nn.Conv2d(
- in_planes, planes, kernel_size=3, dilation=dilation, padding=dilation, stride=stride, bias=False
- )
- self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, dilation=dilation, padding=dilation, bias=False)
- self.relu = nn.ReLU(inplace=True)
-
- self.norm1 = norm_layer(planes)
- self.norm2 = norm_layer(planes)
-        if stride != 1 or in_planes != planes:
- self.norm3 = norm_layer(planes)
-
- if stride == 1 and in_planes == planes:
- self.downsample = None
- else:
- self.downsample = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)
-
- def forward(self, x):
- y = x
- y = self.relu(self.norm1(self.conv1(y)))
- y = self.relu(self.norm2(self.conv2(y)))
-
- if self.downsample is not None:
- x = self.downsample(x)
-
- return self.relu(x + y)
-
-
-class CNNEncoder(nn.Module):
- def __init__(
- self,
- output_dim=128,
- norm_layer=nn.InstanceNorm2d,
- num_output_scales=1,
- **kwargs,
- ):
- super(CNNEncoder, self).__init__()
- self.num_branch = num_output_scales
-
- feature_dims = [64, 96, 128]
-
- self.conv1 = nn.Conv2d(3, feature_dims[0], kernel_size=7, stride=2, padding=3, bias=False) # 1/2
- self.norm1 = norm_layer(feature_dims[0])
- self.relu1 = nn.ReLU(inplace=True)
-
- self.in_planes = feature_dims[0]
- self.layer1 = self._make_layer(feature_dims[0], stride=1, norm_layer=norm_layer) # 1/2
- self.layer2 = self._make_layer(feature_dims[1], stride=2, norm_layer=norm_layer) # 1/4
-
- # highest resolution 1/4 or 1/8
- stride = 2 if num_output_scales == 1 else 1
- self.layer3 = self._make_layer(
- feature_dims[2],
- stride=stride,
- norm_layer=norm_layer,
- ) # 1/4 or 1/8
-
- self.conv2 = nn.Conv2d(feature_dims[2], output_dim, 1, 1, 0)
-
- if self.num_branch > 1:
- if self.num_branch == 4:
- strides = (1, 2, 4, 8)
- elif self.num_branch == 3:
- strides = (1, 2, 4)
- elif self.num_branch == 2:
- strides = (1, 2)
- else:
- raise ValueError
-
- self.trident_conv = MultiScaleTridentConv(
- output_dim,
- output_dim,
- kernel_size=3,
- strides=strides,
- paddings=1,
- num_branch=self.num_branch,
- )
-
- for m in self.modules():
- if isinstance(m, nn.Conv2d):
- nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
- elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
- if m.weight is not None:
- nn.init.constant_(m.weight, 1)
- if m.bias is not None:
- nn.init.constant_(m.bias, 0)
-
- def _make_layer(self, dim, stride=1, dilation=1, norm_layer=nn.InstanceNorm2d):
- layer1 = ResidualBlock(self.in_planes, dim, norm_layer=norm_layer, stride=stride, dilation=dilation)
- layer2 = ResidualBlock(dim, dim, norm_layer=norm_layer, stride=1, dilation=dilation)
-
- layers = (layer1, layer2)
-
- self.in_planes = dim
- return nn.Sequential(*layers)
-
- def forward(self, x):
- x = self.conv1(x)
- x = self.norm1(x)
- x = self.relu1(x)
-
- x = self.layer1(x) # 1/2
- x = self.layer2(x) # 1/4
- x = self.layer3(x) # 1/8 or 1/4
-
- x = self.conv2(x)
-
- if self.num_branch > 1:
- out = self.trident_conv([x] * self.num_branch) # high to low res
- else:
- out = [x]
-
- return out
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/geometry.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/geometry.py
deleted file mode 100644
index df4d8e38d8afabe7f4e8a69724c75427dec9bd2b..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/geometry.py
+++ /dev/null
@@ -1,200 +0,0 @@
-import torch
-import torch.nn.functional as F
-
-
-def coords_grid(b, h, w, homogeneous=False, device=None):
- y, x = torch.meshgrid(torch.arange(h), torch.arange(w)) # [H, W]
-
- stacks = [x, y]
-
- if homogeneous:
- ones = torch.ones_like(x) # [H, W]
- stacks.append(ones)
-
- grid = torch.stack(stacks, dim=0).float() # [2, H, W] or [3, H, W]
-
- grid = grid[None].repeat(b, 1, 1, 1) # [B, 2, H, W] or [B, 3, H, W]
-
- if device is not None:
- grid = grid.to(device)
-
- return grid
-
-
-def generate_window_grid(h_min, h_max, w_min, w_max, len_h, len_w, device=None):
- assert device is not None
-
- x, y = torch.meshgrid(
- [torch.linspace(w_min, w_max, len_w, device=device), torch.linspace(h_min, h_max, len_h, device=device)],
- )
- grid = torch.stack((x, y), -1).transpose(0, 1).float() # [H, W, 2]
-
- return grid
-
-
-def normalize_coords(coords, h, w):
- # coords: [B, H, W, 2]
- c = torch.Tensor([(w - 1) / 2.0, (h - 1) / 2.0]).float().to(coords.device)
- return (coords - c) / c # [-1, 1]
-
-
-def bilinear_sample(img, sample_coords, mode="bilinear", padding_mode="zeros", return_mask=False):
- # img: [B, C, H, W]
- # sample_coords: [B, 2, H, W] in image scale
- if sample_coords.size(1) != 2: # [B, H, W, 2]
- sample_coords = sample_coords.permute(0, 3, 1, 2)
-
- b, _, h, w = sample_coords.shape
-
- # Normalize to [-1, 1]
- x_grid = 2 * sample_coords[:, 0] / (w - 1) - 1
- y_grid = 2 * sample_coords[:, 1] / (h - 1) - 1
-
- grid = torch.stack([x_grid, y_grid], dim=-1) # [B, H, W, 2]
-
- img = F.grid_sample(img, grid, mode=mode, padding_mode=padding_mode, align_corners=True)
-
- if return_mask:
- mask = (x_grid >= -1) & (y_grid >= -1) & (x_grid <= 1) & (y_grid <= 1) # [B, H, W]
-
- return img, mask
-
- return img
-
-
-def flow_warp(feature, flow, mask=False, padding_mode="zeros"):
- b, c, h, w = feature.size()
- assert flow.size(1) == 2
-
- grid = coords_grid(b, h, w).to(flow.device) + flow # [B, 2, H, W]
-
- return bilinear_sample(feature, grid, padding_mode=padding_mode, return_mask=mask)
-
-
-def forward_backward_consistency_check(fwd_flow, bwd_flow, alpha=0.01, beta=0.5):
- # fwd_flow, bwd_flow: [B, 2, H, W]
- # alpha and beta values are following UnFlow (https://arxiv.org/abs/1711.07837)
- assert fwd_flow.dim() == 4 and bwd_flow.dim() == 4
- assert fwd_flow.size(1) == 2 and bwd_flow.size(1) == 2
- flow_mag = torch.norm(fwd_flow, dim=1) + torch.norm(bwd_flow, dim=1) # [B, H, W]
-
- warped_bwd_flow = flow_warp(bwd_flow, fwd_flow) # [B, 2, H, W]
- warped_fwd_flow = flow_warp(fwd_flow, bwd_flow) # [B, 2, H, W]
-
- diff_fwd = torch.norm(fwd_flow + warped_bwd_flow, dim=1) # [B, H, W]
- diff_bwd = torch.norm(bwd_flow + warped_fwd_flow, dim=1)
-
- threshold = alpha * flow_mag + beta
-
- fwd_occ = (diff_fwd > threshold).float() # [B, H, W]
- bwd_occ = (diff_bwd > threshold).float()
-
- return fwd_occ, bwd_occ
-
-
-def back_project(depth, intrinsics):
- # Back project 2D pixel coords to 3D points
- # depth: [B, H, W]
- # intrinsics: [B, 3, 3]
- b, h, w = depth.shape
- grid = coords_grid(b, h, w, homogeneous=True, device=depth.device) # [B, 3, H, W]
-
- intrinsics_inv = torch.inverse(intrinsics) # [B, 3, 3]
-
- points = intrinsics_inv.bmm(grid.view(b, 3, -1)).view(b, 3, h, w) * depth.unsqueeze(1) # [B, 3, H, W]
-
- return points
-
-
-def camera_transform(points_ref, extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None):
- # Transform 3D points from reference camera to target camera
- # points_ref: [B, 3, H, W]
- # extrinsics_ref: [B, 4, 4]
- # extrinsics_tgt: [B, 4, 4]
- # extrinsics_rel: [B, 4, 4], relative pose transform
- b, _, h, w = points_ref.shape
-
- if extrinsics_rel is None:
- extrinsics_rel = torch.bmm(extrinsics_tgt, torch.inverse(extrinsics_ref)) # [B, 4, 4]
-
- points_tgt = (
- torch.bmm(extrinsics_rel[:, :3, :3], points_ref.view(b, 3, -1)) + extrinsics_rel[:, :3, -1:]
- ) # [B, 3, H*W]
-
- points_tgt = points_tgt.view(b, 3, h, w) # [B, 3, H, W]
-
- return points_tgt
-
-
-def reproject(points_tgt, intrinsics, return_mask=False):
- # reproject to target view
- # points_tgt: [B, 3, H, W]
- # intrinsics: [B, 3, 3]
-
- b, _, h, w = points_tgt.shape
-
- proj_points = torch.bmm(intrinsics, points_tgt.view(b, 3, -1)).view(b, 3, h, w) # [B, 3, H, W]
-
- X = proj_points[:, 0]
- Y = proj_points[:, 1]
- Z = proj_points[:, 2].clamp(min=1e-3)
-
- pixel_coords = torch.stack([X / Z, Y / Z], dim=1).view(b, 2, h, w) # [B, 2, H, W] in image scale
-
- if return_mask:
- # valid mask in pixel space
- mask = (
- (pixel_coords[:, 0] >= 0)
- & (pixel_coords[:, 0] <= (w - 1))
- & (pixel_coords[:, 1] >= 0)
- & (pixel_coords[:, 1] <= (h - 1))
- ) # [B, H, W]
-
- return pixel_coords, mask
-
- return pixel_coords
-
-
-def reproject_coords(
- depth_ref, intrinsics, extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None, return_mask=False
-):
- # Compute reprojection sample coords
- points_ref = back_project(depth_ref, intrinsics) # [B, 3, H, W]
- points_tgt = camera_transform(points_ref, extrinsics_ref, extrinsics_tgt, extrinsics_rel=extrinsics_rel)
-
- if return_mask:
- reproj_coords, mask = reproject(points_tgt, intrinsics, return_mask=return_mask) # [B, 2, H, W] in image scale
-
- return reproj_coords, mask
-
- reproj_coords = reproject(points_tgt, intrinsics, return_mask=return_mask) # [B, 2, H, W] in image scale
-
- return reproj_coords
-
-
-def compute_flow_with_depth_pose(
- depth_ref, intrinsics, extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None, return_mask=False
-):
- b, h, w = depth_ref.shape
- coords_init = coords_grid(b, h, w, device=depth_ref.device) # [B, 2, H, W]
-
- if return_mask:
- reproj_coords, mask = reproject_coords(
- depth_ref,
- intrinsics,
- extrinsics_ref,
- extrinsics_tgt,
- extrinsics_rel=extrinsics_rel,
- return_mask=return_mask,
- ) # [B, 2, H, W]
- rigid_flow = reproj_coords - coords_init
-
- return rigid_flow, mask
-
- reproj_coords = reproject_coords(
- depth_ref, intrinsics, extrinsics_ref, extrinsics_tgt, extrinsics_rel=extrinsics_rel, return_mask=return_mask
- ) # [B, 2, H, W]
-
- rigid_flow = reproj_coords - coords_init
-
- return rigid_flow
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/matching.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/matching.py
deleted file mode 100644
index fe5e103d742b16edff87835a1cd4db45e15775ad..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/matching.py
+++ /dev/null
@@ -1,307 +0,0 @@
-import torch
-import torch.nn.functional as F
-
-from .geometry import coords_grid, generate_window_grid, normalize_coords
-
-
-def global_correlation_softmax(
- feature0,
- feature1,
- pred_bidir_flow=False,
-):
- # global correlation
- b, c, h, w = feature0.shape
- feature0 = feature0.view(b, c, -1).permute(0, 2, 1) # [B, H*W, C]
- feature1 = feature1.view(b, c, -1) # [B, C, H*W]
-
- correlation = torch.matmul(feature0, feature1).view(b, h, w, h, w) / (c**0.5) # [B, H, W, H, W]
-
- # flow from softmax
- init_grid = coords_grid(b, h, w).to(correlation.device) # [B, 2, H, W]
- grid = init_grid.view(b, 2, -1).permute(0, 2, 1) # [B, H*W, 2]
-
- correlation = correlation.view(b, h * w, h * w) # [B, H*W, H*W]
-
- if pred_bidir_flow:
- correlation = torch.cat((correlation, correlation.permute(0, 2, 1)), dim=0) # [2*B, H*W, H*W]
- init_grid = init_grid.repeat(2, 1, 1, 1) # [2*B, 2, H, W]
- grid = grid.repeat(2, 1, 1) # [2*B, H*W, 2]
- b = b * 2
-
- prob = F.softmax(correlation, dim=-1) # [B, H*W, H*W]
-
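-    # differentiable soft-argmax: correspondence = expected target coordinates under the matching distribution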
- correspondence = torch.matmul(prob, grid).view(b, h, w, 2).permute(0, 3, 1, 2) # [B, 2, H, W]
-
- # when predicting bidirectional flow, flow is the concatenation of forward flow and backward flow
- flow = correspondence - init_grid
-
- return flow, prob
-
-
-def local_correlation_softmax(
- feature0,
- feature1,
- local_radius,
- padding_mode="zeros",
-):
- b, c, h, w = feature0.size()
- coords_init = coords_grid(b, h, w).to(feature0.device) # [B, 2, H, W]
- coords = coords_init.view(b, 2, -1).permute(0, 2, 1) # [B, H*W, 2]
-
- local_h = 2 * local_radius + 1
- local_w = 2 * local_radius + 1
-
- window_grid = generate_window_grid(
- -local_radius, local_radius, -local_radius, local_radius, local_h, local_w, device=feature0.device
- ) # [2R+1, 2R+1, 2]
- window_grid = window_grid.reshape(-1, 2).repeat(b, 1, 1, 1) # [B, 1, (2R+1)^2, 2]
- sample_coords = coords.unsqueeze(-2) + window_grid # [B, H*W, (2R+1)^2, 2]
-
- sample_coords_softmax = sample_coords
-
- # exclude coords that are out of image space
- valid_x = (sample_coords[:, :, :, 0] >= 0) & (sample_coords[:, :, :, 0] < w) # [B, H*W, (2R+1)^2]
- valid_y = (sample_coords[:, :, :, 1] >= 0) & (sample_coords[:, :, :, 1] < h) # [B, H*W, (2R+1)^2]
-
- valid = valid_x & valid_y # [B, H*W, (2R+1)^2], used to mask out invalid values when softmax
-
- # normalize coordinates to [-1, 1]
- sample_coords_norm = normalize_coords(sample_coords, h, w) # [-1, 1]
- window_feature = F.grid_sample(feature1, sample_coords_norm, padding_mode=padding_mode, align_corners=True).permute(
- 0, 2, 1, 3
- ) # [B, H*W, C, (2R+1)^2]
- feature0_view = feature0.permute(0, 2, 3, 1).view(b, h * w, 1, c) # [B, H*W, 1, C]
-
- corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (c**0.5) # [B, H*W, (2R+1)^2]
-
- # mask invalid locations
- corr[~valid] = -1e9
-
- prob = F.softmax(corr, -1) # [B, H*W, (2R+1)^2]
-
- correspondence = (
- torch.matmul(prob.unsqueeze(-2), sample_coords_softmax).squeeze(-2).view(b, h, w, 2).permute(0, 3, 1, 2)
- ) # [B, 2, H, W]
-
- flow = correspondence - coords_init
- match_prob = prob
-
- return flow, match_prob
-
-
-def local_correlation_with_flow(
- feature0,
- feature1,
- flow,
- local_radius,
- padding_mode="zeros",
- dilation=1,
-):
- b, c, h, w = feature0.size()
- coords_init = coords_grid(b, h, w).to(feature0.device) # [B, 2, H, W]
- coords = coords_init.view(b, 2, -1).permute(0, 2, 1) # [B, H*W, 2]
-
- local_h = 2 * local_radius + 1
- local_w = 2 * local_radius + 1
-
- window_grid = generate_window_grid(
- -local_radius, local_radius, -local_radius, local_radius, local_h, local_w, device=feature0.device
- ) # [2R+1, 2R+1, 2]
- window_grid = window_grid.reshape(-1, 2).repeat(b, 1, 1, 1) # [B, 1, (2R+1)^2, 2]
- sample_coords = coords.unsqueeze(-2) + window_grid * dilation # [B, H*W, (2R+1)^2, 2]
-
- # flow can be zero when using features after transformer
- if not isinstance(flow, float):
- sample_coords = sample_coords + flow.view(b, 2, -1).permute(0, 2, 1).unsqueeze(-2) # [B, H*W, (2R+1)^2, 2]
- else:
- assert flow == 0.0
-
- # normalize coordinates to [-1, 1]
- sample_coords_norm = normalize_coords(sample_coords, h, w) # [-1, 1]
- window_feature = F.grid_sample(feature1, sample_coords_norm, padding_mode=padding_mode, align_corners=True).permute(
- 0, 2, 1, 3
- ) # [B, H*W, C, (2R+1)^2]
- feature0_view = feature0.permute(0, 2, 3, 1).view(b, h * w, 1, c) # [B, H*W, 1, C]
-
- corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (c**0.5) # [B, H*W, (2R+1)^2]
-
- corr = corr.view(b, h, w, -1).permute(0, 3, 1, 2).contiguous() # [B, (2R+1)^2, H, W]
-
- return corr
-
-
-def global_correlation_softmax_stereo(
- feature0,
- feature1,
-):
- # global correlation on horizontal direction
- b, c, h, w = feature0.shape
-
- x_grid = torch.linspace(0, w - 1, w, device=feature0.device) # [W]
-
- feature0 = feature0.permute(0, 2, 3, 1) # [B, H, W, C]
- feature1 = feature1.permute(0, 2, 1, 3) # [B, H, C, W]
-
- correlation = torch.matmul(feature0, feature1) / (c**0.5) # [B, H, W, W]
-
- # mask subsequent positions to make disparity positive
- mask = torch.triu(torch.ones((w, w)), diagonal=1).type_as(feature0) # [W, W]
- valid_mask = (mask == 0).unsqueeze(0).unsqueeze(0).repeat(b, h, 1, 1) # [B, H, W, W]
-
- correlation[~valid_mask] = -1e9
-
- prob = F.softmax(correlation, dim=-1) # [B, H, W, W]
-
- correspondence = (x_grid.view(1, 1, 1, w) * prob).sum(-1) # [B, H, W]
-
- # NOTE: unlike flow, disparity is typically positive
- disparity = x_grid.view(1, 1, w).repeat(b, h, 1) - correspondence # [B, H, W]
-
- return disparity.unsqueeze(1), prob # feature resolution
-
-
-def local_correlation_softmax_stereo(
- feature0,
- feature1,
- local_radius,
-):
- b, c, h, w = feature0.size()
- coords_init = coords_grid(b, h, w).to(feature0.device) # [B, 2, H, W]
- coords = coords_init.view(b, 2, -1).permute(0, 2, 1).contiguous() # [B, H*W, 2]
-
- local_h = 1
- local_w = 2 * local_radius + 1
-
- window_grid = generate_window_grid(
- 0, 0, -local_radius, local_radius, local_h, local_w, device=feature0.device
- ) # [1, 2R+1, 2]
- window_grid = window_grid.reshape(-1, 2).repeat(b, 1, 1, 1) # [B, 1, (2R+1), 2]
- sample_coords = coords.unsqueeze(-2) + window_grid # [B, H*W, (2R+1), 2]
-
- sample_coords_softmax = sample_coords
-
- # exclude coords that are out of image space
-    valid_x = (sample_coords[:, :, :, 0] >= 0) & (sample_coords[:, :, :, 0] < w) # [B, H*W, (2R+1)]
-    valid_y = (sample_coords[:, :, :, 1] >= 0) & (sample_coords[:, :, :, 1] < h) # [B, H*W, (2R+1)]
-
-    valid = valid_x & valid_y # [B, H*W, (2R+1)], used to mask out invalid locations before the softmax
-
- # normalize coordinates to [-1, 1]
- sample_coords_norm = normalize_coords(sample_coords, h, w) # [-1, 1]
- window_feature = F.grid_sample(feature1, sample_coords_norm, padding_mode="zeros", align_corners=True).permute(
- 0, 2, 1, 3
- ) # [B, H*W, C, (2R+1)]
- feature0_view = feature0.permute(0, 2, 3, 1).contiguous().view(b, h * w, 1, c) # [B, H*W, 1, C]
-
- corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (c**0.5) # [B, H*W, (2R+1)]
-
- # mask invalid locations
- corr[~valid] = -1e9
-
- prob = F.softmax(corr, -1) # [B, H*W, (2R+1)]
-
- correspondence = (
- torch.matmul(prob.unsqueeze(-2), sample_coords_softmax)
- .squeeze(-2)
- .view(b, h, w, 2)
- .permute(0, 3, 1, 2)
- .contiguous()
- ) # [B, 2, H, W]
-
- flow = correspondence - coords_init # flow at feature resolution
- match_prob = prob
-
- flow_x = -flow[:, :1] # [B, 1, H, W]
-
- return flow_x, match_prob
-
-
-def correlation_softmax_depth(
- feature0,
- feature1,
- intrinsics,
- pose,
- depth_candidates,
- depth_from_argmax=False,
- pred_bidir_depth=False,
-):
- b, c, h, w = feature0.size()
- assert depth_candidates.dim() == 4 # [B, D, H, W]
- scale_factor = c**0.5
-
- if pred_bidir_depth:
- feature0, feature1 = torch.cat((feature0, feature1), dim=0), torch.cat((feature1, feature0), dim=0)
- intrinsics = intrinsics.repeat(2, 1, 1)
- pose = torch.cat((pose, torch.inverse(pose)), dim=0)
- depth_candidates = depth_candidates.repeat(2, 1, 1, 1)
-
- # depth candidates are actually inverse depth
- warped_feature1 = warp_with_pose_depth_candidates(
- feature1,
- intrinsics,
- pose,
- 1.0 / depth_candidates,
- ) # [B, C, D, H, W]
-
- correlation = (feature0.unsqueeze(2) * warped_feature1).sum(1) / scale_factor # [B, D, H, W]
-
- match_prob = F.softmax(correlation, dim=1) # [B, D, H, W]
-
- # for cross-task transfer (flow -> depth), extract depth with argmax at test time
- if depth_from_argmax:
- index = torch.argmax(match_prob, dim=1, keepdim=True)
- depth = torch.gather(depth_candidates, dim=1, index=index)
- else:
- depth = (match_prob * depth_candidates).sum(dim=1, keepdim=True) # [B, 1, H, W]
-
- return depth, match_prob
-
-
-def warp_with_pose_depth_candidates(
- feature1,
- intrinsics,
- pose,
- depth,
- clamp_min_depth=1e-3,
-):
- """
- feature1: [B, C, H, W]
- intrinsics: [B, 3, 3]
- pose: [B, 4, 4]
- depth: [B, D, H, W]
- """
-
- assert intrinsics.size(1) == intrinsics.size(2) == 3
- assert pose.size(1) == pose.size(2) == 4
- assert depth.dim() == 4
-
- b, d, h, w = depth.size()
- c = feature1.size(1)
-
- with torch.no_grad():
- # pixel coordinates
- grid = coords_grid(b, h, w, homogeneous=True, device=depth.device) # [B, 3, H, W]
- # back project to 3D and transform viewpoint
- points = torch.inverse(intrinsics).bmm(grid.view(b, 3, -1)) # [B, 3, H*W]
- points = torch.bmm(pose[:, :3, :3], points).unsqueeze(2).repeat(1, 1, d, 1) * depth.view(
- b, 1, d, h * w
- ) # [B, 3, D, H*W]
- points = points + pose[:, :3, -1:].unsqueeze(-1) # [B, 3, D, H*W]
- # reproject to 2D image plane
- points = torch.bmm(intrinsics, points.view(b, 3, -1)).view(b, 3, d, h * w) # [B, 3, D, H*W]
- pixel_coords = points[:, :2] / points[:, -1:].clamp(min=clamp_min_depth) # [B, 2, D, H*W]
-
- # normalize to [-1, 1]
- x_grid = 2 * pixel_coords[:, 0] / (w - 1) - 1
- y_grid = 2 * pixel_coords[:, 1] / (h - 1) - 1
-
- grid = torch.stack([x_grid, y_grid], dim=-1) # [B, D, H*W, 2]
-
- # sample features
- warped_feature = F.grid_sample(
- feature1, grid.view(b, d * h, w, 2), mode="bilinear", padding_mode="zeros", align_corners=True
- ).view(
- b, c, d, h, w
- ) # [B, C, D, H, W]
-
- return warped_feature
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/position.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/position.py
deleted file mode 100644
index 619f3568d4c81f41316010be6a866a0e115cfc80..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/position.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
-# https://github.com/facebookresearch/detr/blob/main/models/position_encoding.py
-
-import math
-
-import torch
-import torch.nn as nn
-
-
-class PositionEmbeddingSine(nn.Module):
- """
- This is a more standard version of the position embedding, very similar to the one
- used by the Attention is all you need paper, generalized to work on images.
- """
-
- def __init__(self, num_pos_feats=64, temperature=10000, normalize=True, scale=None):
- super().__init__()
- self.num_pos_feats = num_pos_feats
- self.temperature = temperature
- self.normalize = normalize
- if scale is not None and normalize is False:
- raise ValueError("normalize should be True if scale is passed")
- if scale is None:
- scale = 2 * math.pi
- self.scale = scale
-
- def forward(self, x):
- # x = tensor_list.tensors # [B, C, H, W]
- # mask = tensor_list.mask # [B, H, W], input with padding, valid as 0
- b, c, h, w = x.size()
- mask = torch.ones((b, h, w), device=x.device) # [B, H, W]
- y_embed = mask.cumsum(1, dtype=torch.float32)
- x_embed = mask.cumsum(2, dtype=torch.float32)
- if self.normalize:
- eps = 1e-6
- y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
- x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
-
- dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
- dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
-
- pos_x = x_embed[:, :, :, None] / dim_t
- pos_y = y_embed[:, :, :, None] / dim_t
- pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
- pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
- pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
- return pos
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/reg_refine.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/reg_refine.py
deleted file mode 100644
index 965f4cac62a8db3b42187b9cdbc2f679a70e6ac3..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/reg_refine.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class FlowHead(nn.Module):
- def __init__(
- self,
- input_dim=128,
- hidden_dim=256,
- out_dim=2,
- ):
- super(FlowHead, self).__init__()
-
- self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1)
- self.conv2 = nn.Conv2d(hidden_dim, out_dim, 3, padding=1)
- self.relu = nn.ReLU(inplace=True)
-
- def forward(self, x):
- out = self.conv2(self.relu(self.conv1(x)))
-
- return out
-
-
-class SepConvGRU(nn.Module):
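-    """Separable ConvGRU (as in RAFT): the hidden state is updated with a 1xK conv pass followed by a Kx1 pass."""
-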
- def __init__(
- self,
- hidden_dim=128,
- input_dim=192 + 128,
- kernel_size=5,
- ):
- padding = (kernel_size - 1) // 2
-
- super(SepConvGRU, self).__init__()
- self.convz1 = nn.Conv2d(hidden_dim + input_dim, hidden_dim, (1, kernel_size), padding=(0, padding))
- self.convr1 = nn.Conv2d(hidden_dim + input_dim, hidden_dim, (1, kernel_size), padding=(0, padding))
- self.convq1 = nn.Conv2d(hidden_dim + input_dim, hidden_dim, (1, kernel_size), padding=(0, padding))
-
- self.convz2 = nn.Conv2d(hidden_dim + input_dim, hidden_dim, (kernel_size, 1), padding=(padding, 0))
- self.convr2 = nn.Conv2d(hidden_dim + input_dim, hidden_dim, (kernel_size, 1), padding=(padding, 0))
- self.convq2 = nn.Conv2d(hidden_dim + input_dim, hidden_dim, (kernel_size, 1), padding=(padding, 0))
-
- def forward(self, h, x):
- # horizontal
- hx = torch.cat([h, x], dim=1)
- z = torch.sigmoid(self.convz1(hx))
- r = torch.sigmoid(self.convr1(hx))
- q = torch.tanh(self.convq1(torch.cat([r * h, x], dim=1)))
- h = (1 - z) * h + z * q
-
- # vertical
- hx = torch.cat([h, x], dim=1)
- z = torch.sigmoid(self.convz2(hx))
- r = torch.sigmoid(self.convr2(hx))
- q = torch.tanh(self.convq2(torch.cat([r * h, x], dim=1)))
- h = (1 - z) * h + z * q
-
- return h
-
-
-class BasicMotionEncoder(nn.Module):
- def __init__(
- self,
- corr_channels=324,
- flow_channels=2,
- ):
- super(BasicMotionEncoder, self).__init__()
-
- self.convc1 = nn.Conv2d(corr_channels, 256, 1, padding=0)
- self.convc2 = nn.Conv2d(256, 192, 3, padding=1)
- self.convf1 = nn.Conv2d(flow_channels, 128, 7, padding=3)
- self.convf2 = nn.Conv2d(128, 64, 3, padding=1)
- self.conv = nn.Conv2d(64 + 192, 128 - flow_channels, 3, padding=1)
-
- def forward(self, flow, corr):
- cor = F.relu(self.convc1(corr))
- cor = F.relu(self.convc2(cor))
- flo = F.relu(self.convf1(flow))
- flo = F.relu(self.convf2(flo))
-
- cor_flo = torch.cat([cor, flo], dim=1)
- out = F.relu(self.conv(cor_flo))
- return torch.cat([out, flow], dim=1)
-
-
-class BasicUpdateBlock(nn.Module):
- def __init__(
- self,
- corr_channels=324,
- hidden_dim=128,
- context_dim=128,
- downsample_factor=8,
- flow_dim=2,
- bilinear_up=False,
- ):
- super(BasicUpdateBlock, self).__init__()
-
- self.encoder = BasicMotionEncoder(
- corr_channels=corr_channels,
- flow_channels=flow_dim,
- )
-
- self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=context_dim + hidden_dim)
-
- self.flow_head = FlowHead(
- hidden_dim,
- hidden_dim=256,
- out_dim=flow_dim,
- )
-
- if bilinear_up:
- self.mask = None
- else:
- self.mask = nn.Sequential(
- nn.Conv2d(hidden_dim, 256, 3, padding=1),
- nn.ReLU(inplace=True),
- nn.Conv2d(256, downsample_factor**2 * 9, 1, padding=0),
- )
-
- def forward(self, net, inp, corr, flow):
- motion_features = self.encoder(flow, corr)
-
- inp = torch.cat([inp, motion_features], dim=1)
-
- net = self.gru(net, inp)
- delta_flow = self.flow_head(net)
-
- if self.mask is not None:
- mask = self.mask(net)
- else:
- mask = None
-
- return net, mask, delta_flow
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/transformer.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/transformer.py
deleted file mode 100644
index 7fdffd17feb0328260f1a93b778801337d14a2c3..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/transformer.py
+++ /dev/null
@@ -1,339 +0,0 @@
-import torch
-import torch.nn as nn
-
-from .attention import (
- single_head_full_attention,
- single_head_full_attention_1d,
- single_head_split_window_attention,
- single_head_split_window_attention_1d,
-)
-from .utils import generate_shift_window_attn_mask, generate_shift_window_attn_mask_1d
-
-
-class TransformerLayer(nn.Module):
- def __init__(
- self,
- d_model=128,
- nhead=1,
- no_ffn=False,
- ffn_dim_expansion=4,
- ):
- super(TransformerLayer, self).__init__()
-
- self.dim = d_model
- self.nhead = nhead
- self.no_ffn = no_ffn
-
- # multi-head attention
- self.q_proj = nn.Linear(d_model, d_model, bias=False)
- self.k_proj = nn.Linear(d_model, d_model, bias=False)
- self.v_proj = nn.Linear(d_model, d_model, bias=False)
-
- self.merge = nn.Linear(d_model, d_model, bias=False)
-
- self.norm1 = nn.LayerNorm(d_model)
-
- # no ffn after self-attn, with ffn after cross-attn
- if not self.no_ffn:
- in_channels = d_model * 2
- self.mlp = nn.Sequential(
- nn.Linear(in_channels, in_channels * ffn_dim_expansion, bias=False),
- nn.GELU(),
- nn.Linear(in_channels * ffn_dim_expansion, d_model, bias=False),
- )
-
- self.norm2 = nn.LayerNorm(d_model)
-
- def forward(
- self,
- source,
- target,
- height=None,
- width=None,
- shifted_window_attn_mask=None,
- shifted_window_attn_mask_1d=None,
- attn_type="swin",
- with_shift=False,
- attn_num_splits=None,
- ):
- # source, target: [B, L, C]
- query, key, value = source, target, target
-
- # for stereo: 2d attn in self-attn, 1d attn in cross-attn
- is_self_attn = (query - key).abs().max() < 1e-6
-
- # single-head attention
- query = self.q_proj(query) # [B, L, C]
- key = self.k_proj(key) # [B, L, C]
- value = self.v_proj(value) # [B, L, C]
-
- if attn_type == "swin" and attn_num_splits > 1: # self, cross-attn: both swin 2d
- if self.nhead > 1:
- # we observe that multihead attention slows down the speed and increases the memory consumption
- # without bringing obvious performance gains and thus the implementation is removed
- raise NotImplementedError
- else:
- message = single_head_split_window_attention(
- query,
- key,
- value,
- num_splits=attn_num_splits,
- with_shift=with_shift,
- h=height,
- w=width,
- attn_mask=shifted_window_attn_mask,
- )
-
- elif attn_type == "self_swin2d_cross_1d": # self-attn: swin 2d, cross-attn: full 1d
- if self.nhead > 1:
- raise NotImplementedError
- else:
- if is_self_attn:
- if attn_num_splits > 1:
- message = single_head_split_window_attention(
- query,
- key,
- value,
- num_splits=attn_num_splits,
- with_shift=with_shift,
- h=height,
- w=width,
- attn_mask=shifted_window_attn_mask,
- )
- else:
- # full 2d attn
- message = single_head_full_attention(query, key, value) # [N, L, C]
-
- else:
- # cross attn 1d
- message = single_head_full_attention_1d(
- query,
- key,
- value,
- h=height,
- w=width,
- )
-
- elif attn_type == "self_swin2d_cross_swin1d": # self-attn: swin 2d, cross-attn: swin 1d
- if self.nhead > 1:
- raise NotImplementedError
- else:
- if is_self_attn:
- if attn_num_splits > 1:
- # self attn shift window
- message = single_head_split_window_attention(
- query,
- key,
- value,
- num_splits=attn_num_splits,
- with_shift=with_shift,
- h=height,
- w=width,
- attn_mask=shifted_window_attn_mask,
- )
- else:
- # full 2d attn
- message = single_head_full_attention(query, key, value) # [N, L, C]
- else:
- if attn_num_splits > 1:
- assert shifted_window_attn_mask_1d is not None
- # cross attn 1d shift
- message = single_head_split_window_attention_1d(
- query,
- key,
- value,
- num_splits=attn_num_splits,
- with_shift=with_shift,
- h=height,
- w=width,
- attn_mask=shifted_window_attn_mask_1d,
- )
- else:
- message = single_head_full_attention_1d(
- query,
- key,
- value,
- h=height,
- w=width,
- )
-
- else:
- message = single_head_full_attention(query, key, value) # [B, L, C]
-
- message = self.merge(message) # [B, L, C]
- message = self.norm1(message)
-
- if not self.no_ffn:
- message = self.mlp(torch.cat([source, message], dim=-1))
- message = self.norm2(message)
-
- return source + message
-
-
-class TransformerBlock(nn.Module):
- """self attention + cross attention + FFN"""
-
- def __init__(
- self,
- d_model=128,
- nhead=1,
- ffn_dim_expansion=4,
- ):
- super(TransformerBlock, self).__init__()
-
- self.self_attn = TransformerLayer(
- d_model=d_model,
- nhead=nhead,
- no_ffn=True,
- ffn_dim_expansion=ffn_dim_expansion,
- )
-
- self.cross_attn_ffn = TransformerLayer(
- d_model=d_model,
- nhead=nhead,
- ffn_dim_expansion=ffn_dim_expansion,
- )
-
- def forward(
- self,
- source,
- target,
- height=None,
- width=None,
- shifted_window_attn_mask=None,
- shifted_window_attn_mask_1d=None,
- attn_type="swin",
- with_shift=False,
- attn_num_splits=None,
- ):
- # source, target: [B, L, C]
-
- # self attention
- source = self.self_attn(
- source,
- source,
- height=height,
- width=width,
- shifted_window_attn_mask=shifted_window_attn_mask,
- attn_type=attn_type,
- with_shift=with_shift,
- attn_num_splits=attn_num_splits,
- )
-
- # cross attention and ffn
- source = self.cross_attn_ffn(
- source,
- target,
- height=height,
- width=width,
- shifted_window_attn_mask=shifted_window_attn_mask,
- shifted_window_attn_mask_1d=shifted_window_attn_mask_1d,
- attn_type=attn_type,
- with_shift=with_shift,
- attn_num_splits=attn_num_splits,
- )
-
- return source
-
-
-class FeatureTransformer(nn.Module):
- def __init__(
- self,
- num_layers=6,
- d_model=128,
- nhead=1,
- ffn_dim_expansion=4,
- ):
- super(FeatureTransformer, self).__init__()
-
- self.d_model = d_model
- self.nhead = nhead
-
- self.layers = nn.ModuleList(
- [
- TransformerBlock(
- d_model=d_model,
- nhead=nhead,
- ffn_dim_expansion=ffn_dim_expansion,
- )
- for i in range(num_layers)
- ]
- )
-
- for p in self.parameters():
- if p.dim() > 1:
- nn.init.xavier_uniform_(p)
-
- def forward(
- self,
- feature0,
- feature1,
- attn_type="swin",
- attn_num_splits=None,
- **kwargs,
- ):
- b, c, h, w = feature0.shape
- assert self.d_model == c
-
- feature0 = feature0.flatten(-2).permute(0, 2, 1) # [B, H*W, C]
- feature1 = feature1.flatten(-2).permute(0, 2, 1) # [B, H*W, C]
-
- # 2d attention
- if "swin" in attn_type and attn_num_splits > 1:
- # global and refine use different number of splits
- window_size_h = h // attn_num_splits
- window_size_w = w // attn_num_splits
-
- # compute attn mask once
- shifted_window_attn_mask = generate_shift_window_attn_mask(
- input_resolution=(h, w),
- window_size_h=window_size_h,
- window_size_w=window_size_w,
- shift_size_h=window_size_h // 2,
- shift_size_w=window_size_w // 2,
- device=feature0.device,
- ) # [K*K, H/K*W/K, H/K*W/K]
- else:
- shifted_window_attn_mask = None
-
- # 1d attention
- if "swin1d" in attn_type and attn_num_splits > 1:
- window_size_w = w // attn_num_splits
-
- # compute attn mask once
- shifted_window_attn_mask_1d = generate_shift_window_attn_mask_1d(
- input_w=w,
- window_size_w=window_size_w,
- shift_size_w=window_size_w // 2,
- device=feature0.device,
- ) # [K, W/K, W/K]
- else:
- shifted_window_attn_mask_1d = None
-
- # concat feature0 and feature1 in batch dimension to compute in parallel
- concat0 = torch.cat((feature0, feature1), dim=0) # [2B, H*W, C]
- concat1 = torch.cat((feature1, feature0), dim=0) # [2B, H*W, C]
-
- for i, layer in enumerate(self.layers):
- concat0 = layer(
- concat0,
- concat1,
- height=h,
- width=w,
- attn_type=attn_type,
- with_shift="swin" in attn_type and attn_num_splits > 1 and i % 2 == 1,
- attn_num_splits=attn_num_splits,
- shifted_window_attn_mask=shifted_window_attn_mask,
- shifted_window_attn_mask_1d=shifted_window_attn_mask_1d,
- )
-
- # update feature1
- concat1 = torch.cat(concat0.chunk(chunks=2, dim=0)[::-1], dim=0)
-
- feature0, feature1 = concat0.chunk(chunks=2, dim=0) # [B, H*W, C]
-
- # reshape back
- feature0 = feature0.view(b, h, w, c).permute(0, 3, 1, 2).contiguous() # [B, C, H, W]
- feature1 = feature1.view(b, h, w, c).permute(0, 3, 1, 2).contiguous() # [B, C, H, W]
-
- return feature0, feature1
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/trident_conv.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/trident_conv.py
deleted file mode 100644
index d87579b95dfb5e40d7933264fcf917dbc508bb98..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/trident_conv.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-# https://github.com/facebookresearch/detectron2/blob/main/projects/TridentNet/tridentnet/trident_conv.py
-
-import torch
-from torch import nn
-from torch.nn import functional as F
-from torch.nn.modules.utils import _pair
-
-
-class MultiScaleTridentConv(nn.Module):
- def __init__(
- self,
- in_channels,
- out_channels,
- kernel_size,
- stride=1,
- strides=1,
- paddings=0,
- dilations=1,
- dilation=1,
- groups=1,
- num_branch=1,
- test_branch_idx=-1,
- bias=False,
- norm=None,
- activation=None,
- ):
- super(MultiScaleTridentConv, self).__init__()
- self.in_channels = in_channels
- self.out_channels = out_channels
- self.kernel_size = _pair(kernel_size)
- self.num_branch = num_branch
- self.stride = _pair(stride)
- self.groups = groups
- self.with_bias = bias
- self.dilation = dilation
- if isinstance(paddings, int):
- paddings = [paddings] * self.num_branch
- if isinstance(dilations, int):
- dilations = [dilations] * self.num_branch
- if isinstance(strides, int):
- strides = [strides] * self.num_branch
- self.paddings = [_pair(padding) for padding in paddings]
- self.dilations = [_pair(dilation) for dilation in dilations]
- self.strides = [_pair(stride) for stride in strides]
- self.test_branch_idx = test_branch_idx
- self.norm = norm
- self.activation = activation
-
- assert len({self.num_branch, len(self.paddings), len(self.strides)}) == 1
-
- self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels // groups, *self.kernel_size))
- if bias:
- self.bias = nn.Parameter(torch.Tensor(out_channels))
- else:
- self.bias = None
-
- nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
- if self.bias is not None:
- nn.init.constant_(self.bias, 0)
-
- def forward(self, inputs):
- num_branch = self.num_branch if self.training or self.test_branch_idx == -1 else 1
- assert len(inputs) == num_branch
-
- if self.training or self.test_branch_idx == -1:
- outputs = [
- F.conv2d(input, self.weight, self.bias, stride, padding, self.dilation, self.groups)
- for input, stride, padding in zip(inputs, self.strides, self.paddings)
- ]
- else:
- outputs = [
- F.conv2d(
- inputs[0],
- self.weight,
- self.bias,
- self.strides[self.test_branch_idx] if self.test_branch_idx == -1 else self.strides[-1],
- self.paddings[self.test_branch_idx] if self.test_branch_idx == -1 else self.paddings[-1],
- self.dilation,
- self.groups,
- )
- ]
-
- if self.norm is not None:
- outputs = [self.norm(x) for x in outputs]
- if self.activation is not None:
- outputs = [self.activation(x) for x in outputs]
- return outputs
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/unimatch.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/unimatch.py
deleted file mode 100644
index c625b991627d7cb378a29ba0b1091e80c32eae65..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/unimatch.py
+++ /dev/null
@@ -1,393 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from .attention import SelfAttnPropagation
-from .backbone import CNNEncoder
-from .geometry import compute_flow_with_depth_pose, flow_warp
-from .matching import (
- correlation_softmax_depth,
- global_correlation_softmax,
- global_correlation_softmax_stereo,
- local_correlation_softmax,
- local_correlation_softmax_stereo,
- local_correlation_with_flow,
-)
-from .reg_refine import BasicUpdateBlock
-from .transformer import FeatureTransformer
-from .utils import feature_add_position, normalize_img, upsample_flow_with_mask
-
-
-class UniMatch(nn.Module):
- def __init__(
- self,
- num_scales=1,
- feature_channels=128,
- upsample_factor=8,
- num_head=1,
- ffn_dim_expansion=4,
- num_transformer_layers=6,
- reg_refine=False, # optional local regression refinement
- task="flow",
- ):
- super(UniMatch, self).__init__()
-
- self.feature_channels = feature_channels
- self.num_scales = num_scales
- self.upsample_factor = upsample_factor
- self.reg_refine = reg_refine
-
- # CNN
- self.backbone = CNNEncoder(output_dim=feature_channels, num_output_scales=num_scales)
-
- # Transformer
- self.transformer = FeatureTransformer(
- num_layers=num_transformer_layers,
- d_model=feature_channels,
- nhead=num_head,
- ffn_dim_expansion=ffn_dim_expansion,
- )
-
- # propagation with self-attn
- self.feature_flow_attn = SelfAttnPropagation(in_channels=feature_channels)
-
- if not self.reg_refine or task == "depth":
-            # convex upsampling similar to RAFT
- # concat feature0 and low res flow as input
- self.upsampler = nn.Sequential(
- nn.Conv2d(2 + feature_channels, 256, 3, 1, 1),
- nn.ReLU(inplace=True),
- nn.Conv2d(256, upsample_factor**2 * 9, 1, 1, 0),
- )
- # thus far, all the learnable parameters are task-agnostic
-
- if reg_refine:
- # optional task-specific local regression refinement
- self.refine_proj = nn.Conv2d(128, 256, 1)
- self.refine = BasicUpdateBlock(
- corr_channels=(2 * 4 + 1) ** 2,
- downsample_factor=upsample_factor,
- flow_dim=2 if task == "flow" else 1,
- bilinear_up=task == "depth",
- )
-
- def extract_feature(self, img0, img1):
- concat = torch.cat((img0, img1), dim=0) # [2B, C, H, W]
- features = self.backbone(concat) # list of [2B, C, H, W], resolution from high to low
-
- # reverse: resolution from low to high
- features = features[::-1]
-
- feature0, feature1 = [], []
-
- for i in range(len(features)):
- feature = features[i]
- chunks = torch.chunk(feature, 2, 0) # tuple
- feature0.append(chunks[0])
- feature1.append(chunks[1])
-
- return feature0, feature1
-
- def upsample_flow(self, flow, feature, bilinear=False, upsample_factor=8, is_depth=False):
- if bilinear:
- multiplier = 1 if is_depth else upsample_factor
- up_flow = (
- F.interpolate(flow, scale_factor=upsample_factor, mode="bilinear", align_corners=True) * multiplier
- )
- else:
- concat = torch.cat((flow, feature), dim=1)
- mask = self.upsampler(concat)
- up_flow = upsample_flow_with_mask(flow, mask, upsample_factor=self.upsample_factor, is_depth=is_depth)
-
- return up_flow
-
- def forward(
- self,
- img0,
- img1,
- attn_type=None,
- attn_splits_list=None,
- corr_radius_list=None,
- prop_radius_list=None,
- num_reg_refine=1,
- pred_bidir_flow=False,
- task="flow",
- intrinsics=None,
- pose=None, # relative pose transform
- min_depth=1.0 / 0.5, # inverse depth range
- max_depth=1.0 / 10,
- num_depth_candidates=64,
- depth_from_argmax=False,
- pred_bidir_depth=False,
- **kwargs,
- ):
- if pred_bidir_flow:
- assert task == "flow"
-
- if task == "depth":
- assert self.num_scales == 1 # multi-scale depth model is not supported yet
-
- results_dict = {}
- flow_preds = []
-
- if task == "flow":
- # stereo and depth tasks already normalize images in the dataloader
- img0, img1 = normalize_img(img0, img1) # [B, 3, H, W]
-
- # list of features, resolution low to high
- feature0_list, feature1_list = self.extract_feature(img0, img1) # list of features
-
- flow = None
-
- if task != "depth":
- assert len(attn_splits_list) == len(corr_radius_list) == len(prop_radius_list) == self.num_scales
- else:
- assert len(attn_splits_list) == len(prop_radius_list) == self.num_scales == 1
-
- for scale_idx in range(self.num_scales):
- feature0, feature1 = feature0_list[scale_idx], feature1_list[scale_idx]
-
- if pred_bidir_flow and scale_idx > 0:
- # predicting bidirectional flow with refinement
- feature0, feature1 = torch.cat((feature0, feature1), dim=0), torch.cat((feature1, feature0), dim=0)
-
- feature0_ori, feature1_ori = feature0, feature1
-
- upsample_factor = self.upsample_factor * (2 ** (self.num_scales - 1 - scale_idx))
-
- if task == "depth":
- # scale intrinsics
- intrinsics_curr = intrinsics.clone()
- intrinsics_curr[:, :2] = intrinsics_curr[:, :2] / upsample_factor
-
- if scale_idx > 0:
- assert task != "depth" # not supported for multi-scale depth model
- flow = F.interpolate(flow, scale_factor=2, mode="bilinear", align_corners=True) * 2
-
- if flow is not None:
- assert task != "depth"
- flow = flow.detach()
-
- if task == "stereo":
- # construct flow vector for disparity
- # flow here is actually disparity
- zeros = torch.zeros_like(flow) # [B, 1, H, W]
- # NOTE: negate the disparity (disparity values are positive)
- displace = torch.cat((-flow, zeros), dim=1) # [B, 2, H, W]
- feature1 = flow_warp(feature1, displace) # [B, C, H, W]
- elif task == "flow":
- feature1 = flow_warp(feature1, flow) # [B, C, H, W]
- else:
- raise NotImplementedError
-
- attn_splits = attn_splits_list[scale_idx]
- if task != "depth":
- corr_radius = corr_radius_list[scale_idx]
- prop_radius = prop_radius_list[scale_idx]
-
- # add position to features
- feature0, feature1 = feature_add_position(feature0, feature1, attn_splits, self.feature_channels)
-
- # Transformer
- feature0, feature1 = self.transformer(
- feature0,
- feature1,
- attn_type=attn_type,
- attn_num_splits=attn_splits,
- )
-
- # correlation and softmax
- if task == "depth":
- # first generate depth candidates
- b, _, h, w = feature0.size()
- depth_candidates = torch.linspace(min_depth, max_depth, num_depth_candidates).type_as(feature0)
- depth_candidates = depth_candidates.view(1, num_depth_candidates, 1, 1).repeat(
- b, 1, h, w
- ) # [B, D, H, W]
-
- flow_pred = correlation_softmax_depth(
- feature0,
- feature1,
- intrinsics_curr,
- pose,
- depth_candidates=depth_candidates,
- depth_from_argmax=depth_from_argmax,
- pred_bidir_depth=pred_bidir_depth,
- )[0]
-
- else:
- if corr_radius == -1: # global matching
- if task == "flow":
- flow_pred = global_correlation_softmax(feature0, feature1, pred_bidir_flow)[0]
- elif task == "stereo":
- flow_pred = global_correlation_softmax_stereo(feature0, feature1)[0]
- else:
- raise NotImplementedError
- else: # local matching
- if task == "flow":
- flow_pred = local_correlation_softmax(feature0, feature1, corr_radius)[0]
- elif task == "stereo":
- flow_pred = local_correlation_softmax_stereo(feature0, feature1, corr_radius)[0]
- else:
- raise NotImplementedError
-
- # flow or residual flow
- flow = flow + flow_pred if flow is not None else flow_pred
-
- if task == "stereo":
- flow = flow.clamp(min=0) # positive disparity
-
- # upsample to the original resolution for supervision at training time only
- if self.training:
- flow_bilinear = self.upsample_flow(
- flow, None, bilinear=True, upsample_factor=upsample_factor, is_depth=task == "depth"
- )
- flow_preds.append(flow_bilinear)
-
- # flow propagation with self-attn
- if (pred_bidir_flow or pred_bidir_depth) and scale_idx == 0:
- feature0 = torch.cat((feature0, feature1), dim=0) # [2*B, C, H, W] for propagation
-
- flow = self.feature_flow_attn(
- feature0,
- flow.detach(),
- local_window_attn=prop_radius > 0,
- local_window_radius=prop_radius,
- )
-
- # bilinear upsampling for supervision, excluding the last scale
- if self.training and scale_idx < self.num_scales - 1:
- flow_up = self.upsample_flow(
- flow, feature0, bilinear=True, upsample_factor=upsample_factor, is_depth=task == "depth"
- )
- flow_preds.append(flow_up)
-
- if scale_idx == self.num_scales - 1:
- if not self.reg_refine:
- # upsample to the original image resolution
-
- if task == "stereo":
- flow_pad = torch.cat((-flow, torch.zeros_like(flow)), dim=1) # [B, 2, H, W]
- flow_up_pad = self.upsample_flow(flow_pad, feature0)
- flow_up = -flow_up_pad[:, :1] # [B, 1, H, W]
- elif task == "depth":
- depth_pad = torch.cat((flow, torch.zeros_like(flow)), dim=1) # [B, 2, H, W]
- depth_up_pad = self.upsample_flow(depth_pad, feature0, is_depth=True).clamp(
- min=min_depth, max=max_depth
- )
- flow_up = depth_up_pad[:, :1] # [B, 1, H, W]
- else:
- flow_up = self.upsample_flow(flow, feature0)
-
- flow_preds.append(flow_up)
- else:
- # task-specific local regression refinement
- # supervise current flow
- if self.training:
- flow_up = self.upsample_flow(
- flow, feature0, bilinear=True, upsample_factor=upsample_factor, is_depth=task == "depth"
- )
- flow_preds.append(flow_up)
-
- assert num_reg_refine > 0
- for refine_iter_idx in range(num_reg_refine):
- flow = flow.detach()
-
- if task == "stereo":
- zeros = torch.zeros_like(flow) # [B, 1, H, W]
- # NOTE: negate the disparity (disparity values are positive)
- displace = torch.cat((-flow, zeros), dim=1) # [B, 2, H, W]
- correlation = local_correlation_with_flow(
- feature0_ori,
- feature1_ori,
- flow=displace,
- local_radius=4,
- ) # [B, (2R+1)^2, H, W]
- elif task == "depth":
- if pred_bidir_depth and refine_iter_idx == 0:
- intrinsics_curr = intrinsics_curr.repeat(2, 1, 1)
- pose = torch.cat((pose, torch.inverse(pose)), dim=0)
-
- feature0_ori, feature1_ori = torch.cat((feature0_ori, feature1_ori), dim=0), torch.cat(
- (feature1_ori, feature0_ori), dim=0
- )
-
- flow_from_depth = compute_flow_with_depth_pose(
- 1.0 / flow.squeeze(1),
- intrinsics_curr,
- extrinsics_rel=pose,
- )
-
- correlation = local_correlation_with_flow(
- feature0_ori,
- feature1_ori,
- flow=flow_from_depth,
- local_radius=4,
- ) # [B, (2R+1)^2, H, W]
-
- else:
- correlation = local_correlation_with_flow(
- feature0_ori,
- feature1_ori,
- flow=flow,
- local_radius=4,
- ) # [B, (2R+1)^2, H, W]
-
- proj = self.refine_proj(feature0)
-
- net, inp = torch.chunk(proj, chunks=2, dim=1)
-
- net = torch.tanh(net)
- inp = torch.relu(inp)
-
- net, up_mask, residual_flow = self.refine(
- net,
- inp,
- correlation,
- flow.clone(),
- )
-
- if task == "depth":
- flow = (flow - residual_flow).clamp(min=min_depth, max=max_depth)
- else:
- flow = flow + residual_flow
-
- if task == "stereo":
- flow = flow.clamp(min=0) # positive
-
- if self.training or refine_iter_idx == num_reg_refine - 1:
- if task == "depth":
- if refine_iter_idx < num_reg_refine - 1:
- # bilinear upsampling
- flow_up = self.upsample_flow(
- flow, feature0, bilinear=True, upsample_factor=upsample_factor, is_depth=True
- )
- else:
- # last one convex upsampling
- # NOTE: clamp depth because of the zero padding used by unfold in the convex upsampling
- # pad depth to 2 channels to match the flow format
- depth_pad = torch.cat((flow, torch.zeros_like(flow)), dim=1) # [B, 2, H, W]
- depth_up_pad = self.upsample_flow(depth_pad, feature0, is_depth=True).clamp(
- min=min_depth, max=max_depth
- )
- flow_up = depth_up_pad[:, :1] # [B, 1, H, W]
-
- else:
- flow_up = upsample_flow_with_mask(
- flow, up_mask, upsample_factor=self.upsample_factor, is_depth=task == "depth"
- )
-
- flow_preds.append(flow_up)
-
- if task == "stereo":
- for i in range(len(flow_preds)):
- flow_preds[i] = flow_preds[i].squeeze(1) # [B, H, W]
-
- # convert inverse depth to depth
- if task == "depth":
- for i in range(len(flow_preds)):
- flow_preds[i] = 1.0 / flow_preds[i].squeeze(1) # [B, H, W]
-
- results_dict.update({"flow_preds": flow_preds})
-
- return results_dict
diff --git a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/utils.py b/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/utils.py
deleted file mode 100644
index 60f40bea290ddd9a3f36adc7b4defb6e26588d1b..0000000000000000000000000000000000000000
--- a/PyTorch/built-in/mm/OpenSora1.1/tools/scoring/optical_flow/unimatch/utils.py
+++ /dev/null
@@ -1,219 +0,0 @@
-import torch
-import torch.nn.functional as F
-
-from .position import PositionEmbeddingSine
-
-
-def generate_window_grid(h_min, h_max, w_min, w_max, len_h, len_w, device=None):
- assert device is not None
-
- x, y = torch.meshgrid(
- [torch.linspace(w_min, w_max, len_w, device=device), torch.linspace(h_min, h_max, len_h, device=device)],
- )
- grid = torch.stack((x, y), -1).transpose(0, 1).float() # [H, W, 2]
-
- return grid
-
-
-def normalize_coords(coords, h, w):
- # coords: [B, H, W, 2]
- c = torch.Tensor([(w - 1) / 2.0, (h - 1) / 2.0]).float().to(coords.device)
- return (coords - c) / c # [-1, 1]
-
-
-def normalize_img(img0, img1):
- # loaded images are in [0, 255]
- # normalize by ImageNet mean and std
- mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(img1.device)
- std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(img1.device)
- img0 = (img0 / 255.0 - mean) / std
- img1 = (img1 / 255.0 - mean) / std
-
- return img0, img1
-
-
-def split_feature(
- feature,
- num_splits=2,
- channel_last=False,
-):
- if channel_last: # [B, H, W, C]
- b, h, w, c = feature.size()
- assert h % num_splits == 0 and w % num_splits == 0
-
- b_new = b * num_splits * num_splits
- h_new = h // num_splits
- w_new = w // num_splits
-
- feature = (
- feature.view(b, num_splits, h // num_splits, num_splits, w // num_splits, c)
- .permute(0, 1, 3, 2, 4, 5)
- .reshape(b_new, h_new, w_new, c)
- ) # [B*K*K, H/K, W/K, C]
- else: # [B, C, H, W]
- b, c, h, w = feature.size()
- assert h % num_splits == 0 and w % num_splits == 0
-
- b_new = b * num_splits * num_splits
- h_new = h // num_splits
- w_new = w // num_splits
-
- feature = (
- feature.view(b, c, num_splits, h // num_splits, num_splits, w // num_splits)
- .permute(0, 2, 4, 1, 3, 5)
- .reshape(b_new, c, h_new, w_new)
- ) # [B*K*K, C, H/K, W/K]
-
- return feature
-
-
-def merge_splits(
- splits,
- num_splits=2,
- channel_last=False,
-):
- if channel_last: # [B*K*K, H/K, W/K, C]
- b, h, w, c = splits.size()
- new_b = b // num_splits // num_splits
-
- splits = splits.view(new_b, num_splits, num_splits, h, w, c)
- merge = (
- splits.permute(0, 1, 3, 2, 4, 5).contiguous().view(new_b, num_splits * h, num_splits * w, c)
- ) # [B, H, W, C]
- else: # [B*K*K, C, H/K, W/K]
- b, c, h, w = splits.size()
- new_b = b // num_splits // num_splits
-
- splits = splits.view(new_b, num_splits, num_splits, c, h, w)
- merge = (
- splits.permute(0, 3, 1, 4, 2, 5).contiguous().view(new_b, c, num_splits * h, num_splits * w)
- ) # [B, C, H, W]
-
- return merge
-
-
-def generate_shift_window_attn_mask(
- input_resolution, window_size_h, window_size_w, shift_size_h, shift_size_w, device=torch.device("cuda")
-):
- # ref: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
- # calculate attention mask for SW-MSA
- h, w = input_resolution
- img_mask = torch.zeros((1, h, w, 1)).to(device) # 1 H W 1
- h_slices = (slice(0, -window_size_h), slice(-window_size_h, -shift_size_h), slice(-shift_size_h, None))
- w_slices = (slice(0, -window_size_w), slice(-window_size_w, -shift_size_w), slice(-shift_size_w, None))
- cnt = 0
- for h in h_slices:
- for w in w_slices:
- img_mask[:, h, w, :] = cnt
- cnt += 1
-
- mask_windows = split_feature(img_mask, num_splits=input_resolution[-1] // window_size_w, channel_last=True)
-
- mask_windows = mask_windows.view(-1, window_size_h * window_size_w)
- attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
- attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
-
- return attn_mask
-
-
-def feature_add_position(feature0, feature1, attn_splits, feature_channels):
- pos_enc = PositionEmbeddingSine(num_pos_feats=feature_channels // 2)
-
- if attn_splits > 1: # add position within each split window
- feature0_splits = split_feature(feature0, num_splits=attn_splits)
- feature1_splits = split_feature(feature1, num_splits=attn_splits)
-
- position = pos_enc(feature0_splits)
-
- feature0_splits = feature0_splits + position
- feature1_splits = feature1_splits + position
-
- feature0 = merge_splits(feature0_splits, num_splits=attn_splits)
- feature1 = merge_splits(feature1_splits, num_splits=attn_splits)
- else:
- position = pos_enc(feature0)
-
- feature0 = feature0 + position
- feature1 = feature1 + position
-
- return feature0, feature1
-
-
-def upsample_flow_with_mask(flow, up_mask, upsample_factor, is_depth=False):
- # convex upsampling following RAFT
-
- mask = up_mask
- b, flow_channel, h, w = flow.shape
- mask = mask.view(b, 1, 9, upsample_factor, upsample_factor, h, w) # [B, 1, 9, K, K, H, W]
- mask = torch.softmax(mask, dim=2)
-
- multiplier = 1 if is_depth else upsample_factor
- up_flow = F.unfold(multiplier * flow, [3, 3], padding=1)
- up_flow = up_flow.view(b, flow_channel, 9, 1, 1, h, w) # [B, 2, 9, 1, 1, H, W]
-
- up_flow = torch.sum(mask * up_flow, dim=2) # [B, 2, K, K, H, W]
- up_flow = up_flow.permute(0, 1, 4, 2, 5, 3) # [B, 2, K, H, K, W]
- up_flow = up_flow.reshape(b, flow_channel, upsample_factor * h, upsample_factor * w) # [B, 2, K*H, K*W]
-
- return up_flow
-
-
-def split_feature_1d(
- feature,
- num_splits=2,
-):
- # feature: [B, W, C]
- b, w, c = feature.size()
- assert w % num_splits == 0
-
- b_new = b * num_splits
- w_new = w // num_splits
-
- feature = feature.view(b, num_splits, w // num_splits, c).view(b_new, w_new, c) # [B*K, W/K, C]
-
- return feature
-
-
-def merge_splits_1d(
- splits,
- h,
- num_splits=2,
-):
- b, w, c = splits.size()
- new_b = b // num_splits // h
-
- splits = splits.view(new_b, h, num_splits, w, c)
- merge = splits.view(new_b, h, num_splits * w, c) # [B, H, W, C]
-
- return merge
-
-
-def window_partition_1d(x, window_size_w):
- """
- Args:
- x: (B, W, C)
- window_size_w (int): window size
-
- Returns:
- windows: (num_windows*B, window_size, C)
- """
- B, W, C = x.shape
- x = x.view(B, W // window_size_w, window_size_w, C).view(-1, window_size_w, C)
- return x
-
-
-def generate_shift_window_attn_mask_1d(input_w, window_size_w, shift_size_w, device=torch.device("cuda")):
- # calculate attention mask for SW-MSA
- img_mask = torch.zeros((1, input_w, 1)).to(device) # 1 W 1
- w_slices = (slice(0, -window_size_w), slice(-window_size_w, -shift_size_w), slice(-shift_size_w, None))
- cnt = 0
- for w in w_slices:
- img_mask[:, w, :] = cnt
- cnt += 1
-
- mask_windows = window_partition_1d(img_mask, window_size_w) # nW, window_size, 1
- mask_windows = mask_windows.view(-1, window_size_w)
- attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) # nW, window_size, window_size
- attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
-
- return attn_mask